From e496640bf40bd68ad9feb991320d84e05c9e677a Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Wed, 30 Sep 2020 14:31:23 +0800 Subject: [PATCH 01/91] fix bmm enforce equal batch (#27694) --- python/paddle/fluid/tests/unittests/test_bmm_op.py | 2 ++ python/paddle/tensor/linalg.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_bmm_op.py b/python/paddle/fluid/tests/unittests/test_bmm_op.py index cb1b3ded53472..a1c8266842087 100644 --- a/python/paddle/fluid/tests/unittests/test_bmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_bmm_op.py @@ -79,8 +79,10 @@ def test_api_error(self): y_data = np.arange(16, dtype='float32').reshape((2, 4, 2)) y_data_wrong1 = np.arange(16, dtype='float32').reshape((2, 2, 4)) y_data_wrong2 = np.arange(16, dtype='float32').reshape((2, 2, 2, 2)) + y_data_wrong3 = np.arange(24, dtype='float32').reshape((3, 4, 2)) self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong1) self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong2) + self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong3) if __name__ == "__main__": diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index c41c9226d16b4..2dcdf1603a737 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -848,6 +848,10 @@ def bmm(x, y, name=None): raise ValueError( "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}". format(x_shape, y_shape)) + if x_shape[0] != y_shape[0]: + raise ValueError( + "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}". + format(x_shape, y_shape)) helper = LayerHelper('bmm', **locals()) if in_dygraph_mode(): return core.ops.bmm(x, y) From a01bc6b31ddd6fd65ffde553bb3f057b09f253f9 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 30 Sep 2020 15:13:04 +0800 Subject: [PATCH 02/91] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91fleet=20sup?= =?UTF-8?q?port=20non=5Fdistributed=20training=20in=20dygraph=20mode=20(#2?= =?UTF-8?q?7714)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fleet support non_distributed training in dygraph mode; test=develop --- .../distributed/fleet/base/fleet_base.py | 2 + .../fluid/tests/unittests/test_fleet_base.py | 39 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 3fdd6e9248303..7eb3a5659654a 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -187,6 +187,8 @@ def init(self, role_maker=None, is_collective=False): self.strategy_compiler = StrategyCompiler() if paddle.fluid.framework.in_dygraph_mode(): + if self.worker_num() == 1: + return if parallel_helper._is_parallel_ctx_initialized(): warnings.warn( "The dygraph parallel environment has been initialized.") diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index 4945c158025b7..3d4b2e218f725 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -18,6 +18,7 @@ import paddle.distributed.fleet.base.role_maker as role_maker import os import paddle.fluid as fluid +import paddle.nn as nn import numpy as np @@ -170,6 +171,44 @@ def test_dygraph_method(self): final_strategy = 
fleet._final_strategy() +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + +class TestFleetDygraphSingle(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def test_dygraph_single(self): + paddle.disable_static() + fleet.init(is_collective=True) + + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + for step in range(2): + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + loss.backward() + adam.step() + adam.clear_grad() + + class TestFleetBaseSingleRunCollective(unittest.TestCase): def setUp(self): os.environ.pop("PADDLE_TRAINER_ENDPOINTS") From 7f9b198d59ceaf2ec8373d41a0b34a364c6a47be Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Wed, 30 Sep 2020 15:18:16 +0800 Subject: [PATCH 03/91] Romove grid_sampler and refine example code (#27649) * refine grid_sample and temporal_shift --- python/paddle/fluid/layers/nn.py | 16 +++-- .../unittests/test_grid_sample_function.py | 10 +++ .../tests/unittests/test_temporal_shift_op.py | 8 +++ python/paddle/nn/functional/vision.py | 66 +++++++++++-------- 4 files changed, 67 insertions(+), 33 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bec47d9227e1a..b3908e3c8ebc1 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13421,7 +13421,7 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): ${comment} Args: - x(Variable): ${x_comment} + x(Tensor): ${x_comment} seg_num(int): ${seg_num_comment} shift_ratio(float): ${shift_ratio_comment} name(str, optional): For detailed information, please refer @@ -13429,7 +13429,7 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): None by default. Returns: - out(Variable): The temporal shifting result is a tensor variable with the + out(Tensor): The temporal shifting result is a tensor with the same shape and same data type as the input. Raises: @@ -13438,9 +13438,11 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): Examples: .. 
code-block:: python - import paddle.fluid as fluid - input = fluid.data(name='input', shape=[None,4,2,2], dtype='float32') - out = fluid.layers.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) + import paddle + import paddle.nn.functional as F + + input = paddle.randn([6, 4, 2, 2]) + out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ helper = LayerHelper("temporal_shift", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') @@ -13452,6 +13454,10 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): if not isinstance(seg_num, int): raise TypeError("seg_num must be int type.") + if in_dygraph_mode(): + return core.ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', + shift_ratio) + helper.append_op( type="temporal_shift", inputs={"X": x}, diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py index ea94a8ba69a78..9ad0309a70e31 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py @@ -127,5 +127,15 @@ def load_tests(loader, standard_tests, pattern): return suite +class TestGridSampleAPI(unittest.TestCase): + def test_errors(self): + with self.assertRaises(ValueError): + x = paddle.randn([1, 1, 3, 3]) + F.grid_sample(x, 1.0) + with self.assertRaises(ValueError): + x = paddle.randn([1, 1, 3, 3]) + F.grid_sample(1.0, x) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py index f800f7b2ca857..1fbc0fc4604c2 100644 --- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py @@ -18,6 +18,7 @@ import numpy as np from op_test import OpTest +import paddle from paddle.fluid import core @@ -77,5 +78,12 @@ def initTestCase(self): self.shift_ratio = 0.3 +class TestTemporalShiftAPI(unittest.TestCase): + def test_api(self): + input = paddle.randn([6, 4, 2, 2]) + out = paddle.nn.functional.temporal_shift( + x=input, seg_num=2, shift_ratio=0.2) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index a74a98d5ed45b..7f86e56df1b54 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -34,7 +34,6 @@ from ...fluid.layers import generate_mask_labels #DEFINE_ALIAS from ...fluid.layers import generate_proposal_labels #DEFINE_ALIAS from ...fluid.layers import generate_proposals #DEFINE_ALIAS -from ...fluid.layers import grid_sampler #DEFINE_ALIAS from ...fluid.layers import image_resize #DEFINE_ALIAS from ...fluid.layers import prior_box #DEFINE_ALIAS from ...fluid.layers import prroi_pool #DEFINE_ALIAS @@ -74,7 +73,7 @@ 'generate_mask_labels', 'generate_proposal_labels', 'generate_proposals', - 'grid_sampler', + 'grid_sample', 'image_resize', 'image_resize_short', # 'multi_box_head', @@ -205,25 +204,35 @@ def grid_sample(x, data x and y is indexing the 3rd dimension (in height dimension), finally results is the bilinear interpolation or nearest value of 4 nearest corner points. The output tensor shape will be [N, C, H, W]. + + + Step 1: + + Get (x, y) grid coordinates and scale to [0, H-1/W-1]. + + .. 
code-block:: text + + grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) + grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) + + Step 2: + + Indices input data X with grid (x, y) in each [H, W] area, and bilinear + interpolate point value by 4 nearest points or nearest interpolate point value + by nearest point. + .. code-block:: text - Step 1: - Get (x, y) grid coordinates and scale to [0, H-1/W-1]. - .. code-block:: text - grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) - grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) - Step 2: - Indices input data X with grid (x, y) in each [H, W] area, and bilinear - interpolate point value by 4 nearest points or nearest interpolate point value - by nearest point. - wn ------- y_n ------- en - | | | - | d_n | - | | | - x_w --d_w-- grid--d_e-- x_e - | | | - | d_s | - | | | - ws ------- y_s ------- wn + + wn ------- y_n ------- en + | | | + | d_n | + | | | + x_w --d_w-- grid--d_e-- x_e + | | | + | d_s | + | | | + ws ------- y_s ------- wn + For bilinear interpolation: x_w = floor(x) // west side x coord x_e = x_w + 1 // east side x coord @@ -237,8 +246,10 @@ def grid_sample(x, en = X[:, :, y_n, x_e] // north-east point value ws = X[:, :, y_s, x_w] // south-east point value es = X[:, :, y_s, x_w] // north-east point value + output = wn * d_e * d_s + en * d_w * d_s - + ws * d_e * d_n + es * d_w * d_n + + ws * d_e * d_n + es * d_w * d_n + Args: x(Tensor): The input tensor, which is a 4-d tensor with shape [N, C, H, W], N is the batch size, C is the channel @@ -262,7 +273,9 @@ def grid_sample(x, Tensor, The shape of output is [N, C, grid_H, grid_W] in which `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor. Examples: + .. code-block:: python + import paddle import paddle.nn.functional as F import numpy as np @@ -287,7 +300,7 @@ def grid_sample(x, [ 0.7, 0.4], [ 0.2, 0.8]]]]).astype("float64") - paddle.disable_static() + x = paddle.to_tensor(x) grid = paddle.to_tensor(grid) y_t = F.grid_sample( @@ -304,13 +317,10 @@ def grid_sample(x, # [ 0.596 0.38 0.52 0.24 ]]]] """ helper = LayerHelper("grid_sample", **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler') + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sample') check_variable_and_dtype(grid, 'grid', ['float32', 'float64'], - 'grid_sampler') - if not isinstance(x, Variable): - raise ValueError("The x should be a Variable") - if not isinstance(grid, Variable): - raise ValueError("The grid should be a Variable") + 'grid_sample') + _modes = ['bilinear', 'nearest'] _padding_modes = ['zeros', 'reflection', 'border'] if mode not in _modes: From 488152a6d076eac91ef0921ff6e16c65777f814d Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 30 Sep 2020 15:22:03 +0800 Subject: [PATCH 04/91] [API 2.0]Update 2.0 api from fluid to paddle. (#27598) --- python/paddle/fluid/executor.py | 9 +++-- python/paddle/fluid/layers/nn.py | 60 +++++++++++++++------------- python/paddle/fluid/layers/tensor.py | 12 +++--- python/paddle/fluid/param_attr.py | 20 +++++----- python/paddle/static/__init__.py | 3 ++ python/paddle/static/nn/__init__.py | 2 + tools/wlist.json | 1 - 7 files changed, 58 insertions(+), 49 deletions(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 3dc30767e5aa4..7d067b6347844 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -94,12 +94,13 @@ def scope_guard(scope): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy + paddle.enable_static() - new_scope = fluid.Scope() - with fluid.scope_guard(new_scope): - fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) + new_scope = paddle.static.Scope() + with paddle.static.scope_guard(new_scope): + paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) numpy.array(new_scope.find_var("data").get_tensor()) """ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b3908e3c8ebc1..1bd279c1e821d 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13546,15 +13546,15 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): """ :api_attr: Static Graph - This OP is used to register customized Python OP to Paddle Fluid. The design - principe of py_func is that LodTensor and numpy array can be converted to each + This OP is used to register customized Python OP to Paddle. The design + principe of py_func is that Tensor and numpy array can be converted to each other easily. So you can use Python and numpy API to register a python OP. The forward function of the registered OP is ``func`` and the backward function of that is ``backward_func``. Paddle will call ``func`` at forward runtime and call ``backward_func`` at backward runtime(if ``backward_func`` is not None). - ``x`` is the input of ``func``, whose type must be LoDTensor; ``out`` is - the output of ``func``, whose type can be either LoDTensor or numpy array. + ``x`` is the input of ``func``, whose type must be Tensor; ``out`` is + the output of ``func``, whose type can be either Tensor or numpy array. The input of the backward function ``backward_func`` is ``x``, ``out`` and the gradient of ``out``. If some variables of ``out`` have no gradient, the @@ -13572,14 +13572,14 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): func (callable): The forward function of the registered OP. When the network is running, the forward output ``out`` will be calculated according to this function and the forward input ``x``. In ``func`` , it's suggested that we - actively convert LoDTensor into a numpy array, so that we can use Python and + actively convert Tensor into a numpy array, so that we can use Python and numpy API arbitrarily. If not, some operations of numpy may not be compatible. x (Variable|tuple(Variale)|list[Variale]): The input of the forward function ``func``. - It can be Variable|tuple(Variale)|list[Variale], where Variable is LoDTensor or + It can be Variable|tuple(Variale)|list[Variale], where Variable is Tensor or Tenosor. In addition, Multiple Variable should be passed in the form of tuple(Variale) or list[Variale]. out (Variable|tuple(Variale)|list[Variale]): The output of the forward function ``func``, - it can be Variable|tuple(Variale)|list[Variale], where Variable can be either LoDTensor + it can be Variable|tuple(Variale)|list[Variale], where Variable can be either Tensor or numpy array. Since Paddle cannot automatically infer the shape and type of ``out``, you must create ``out`` in advance. backward_func (callable, optional): The backward function of the registered OP. @@ -13600,16 +13600,18 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None): .. 
code-block:: python # example 1: - import paddle.fluid as fluid + import paddle import six - # Creates a forward function, LodTensor can be input directly without + paddle.enable_static() + + # Creates a forward function, Tensor can be input directly without # being converted into numpy array. def tanh(x): return np.tanh(x) # Skip x in backward function and return the gradient of x - # LodTensor must be actively converted to numpy array, otherwise, + # Tensor must be actively converted to numpy array, otherwise, # operations such as +/- can't be used. def tanh_grad(y, dy): return np.array(dy) * (1 - np.square(np.array(y))) @@ -13619,36 +13621,38 @@ def debug_func(x): print(x) def create_tmp_var(name, dtype, shape): - return fluid.default_main_program().current_block().create_var( + return paddle.static.default_main_program().current_block().create_var( name=name, dtype=dtype, shape=shape) def simple_net(img, label): hidden = img for idx in six.moves.range(4): - hidden = fluid.layers.fc(hidden, size=200) + hidden = paddle.static.nn.fc(hidden, size=200) new_hidden = create_tmp_var(name='hidden_{}'.format(idx), dtype=hidden.dtype, shape=hidden.shape) # User-defined forward and backward - hidden = fluid.layers.py_func(func=tanh, x=hidden, + hidden = paddle.static.nn.py_func(func=tanh, x=hidden, out=new_hidden, backward_func=tanh_grad, skip_vars_in_backward_input=hidden) - # User-defined debug functions that print out the input LodTensor - fluid.layers.py_func(func=debug_func, x=hidden, out=None) + # User-defined debug functions that print out the input Tensor + paddle.static.nn.py_func(func=debug_func, x=hidden, out=None) - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - return fluid.layers.mean(loss) + prediction = paddle.static.nn.fc(hidden, size=10, act='softmax') + loss = paddle.static.nn.cross_entropy(input=prediction, label=label) + return paddle.mean(loss) # example 2: - # This example shows how to turn LoDTensor into numpy array and + # This example shows how to turn Tensor into numpy array and # use numpy API to register an Python OP - import paddle.fluid as fluid + import paddle import numpy as np + paddle.enable_static() + def element_wise_add(x, y): - # LodTensor must be actively converted to numpy array, otherwise, + # Tensor must be actively converted to numpy array, otherwise, # numpy.shape can't be used. 
x = np.array(x) y = np.array(y) @@ -13664,24 +13668,24 @@ def element_wise_add(x, y): return result def create_tmp_var(name, dtype, shape): - return fluid.default_main_program().current_block().create_var( + return paddle.static.default_main_program().current_block().create_var( name=name, dtype=dtype, shape=shape) def py_func_demo(): - start_program = fluid.default_startup_program() - main_program = fluid.default_main_program() + start_program = paddle.static.default_startup_program() + main_program = paddle.static.default_main_program() # Input of the forward function - x = fluid.data(name='x', shape=[2,3], dtype='int32') - y = fluid.data(name='y', shape=[2,3], dtype='int32') + x = paddle.static.data(name='x', shape=[2,3], dtype='int32') + y = paddle.static.data(name='y', shape=[2,3], dtype='int32') # Output of the forward function, name/dtype/shape must be specified output = create_tmp_var('output','int32', [3,1]) # Multiple Variable should be passed in the form of tuple(Variale) or list[Variale] - fluid.layers.py_func(func=element_wise_add, x=[x,y], out=output) + paddle.static.nn.py_func(func=element_wise_add, x=[x,y], out=output) - exe=fluid.Executor(fluid.CPUPlace()) + exe=paddle.static.Executor(paddle.CPUPlace()) exe.run(start_program) # Feed numpy array to main_program diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 2fba578ec077f..c633f7022d75e 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -103,9 +103,9 @@ def create_parameter(shape, Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - W = layers.create_parameter(shape=[784, 200], dtype='float32') + import paddle + paddle.enable_static() + W = paddle.static.create_parameter(shape=[784, 200], dtype='float32') """ check_type(shape, 'shape', (list, tuple, numpy.ndarray), 'create_parameter') for item in shape: @@ -161,9 +161,9 @@ def create_global_var(shape, Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - var = layers.create_global_var(shape=[2,3], value=1.0, dtype='float32', + import paddle + paddle.enable_static() + var = paddle.static.create_global_var(shape=[2,3], value=1.0, dtype='float32', persistable=True, force_cpu=True, name='new_var') """ check_type(shape, 'shape', (list, tuple, numpy.ndarray), diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index a76faf1059068..83f54fc8208db 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -61,15 +61,15 @@ class ParamAttr(object): Examples: .. code-block:: python - import paddle.fluid as fluid - - w_param_attrs = fluid.ParamAttr(name="fc_weight", - learning_rate=0.5, - regularizer=fluid.regularizer.L2Decay(1.0), - trainable=True) - print(w_param_attrs.name) # "fc_weight" - x = fluid.data(name='X', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs) + import paddle + paddle.enable_static() + + weight_attr = paddle.ParamAttr(name="weight", + learning_rate=0.5, + regularizer=paddle.regularizer.L2Decay(1.0), + trainable=True) + print(weight_attr.name) # "weight" + paddle.nn.Linear(3, 4, weight_attr=weight_attr) """ def __init__(self, @@ -206,7 +206,7 @@ def _to_kwargs(self, with_initializer=False): class WeightNormParamAttr(ParamAttr): """ - :api_attr: Static Graph + :api_attr: Static Graph Note: Please use 'paddle.nn.utils.weight_norm' in dygraph mode. 
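The docstring diffs above (executor.py, nn.py, tensor.py, param_attr.py) migrate example code from the legacy fluid entry points to the paddle 2.0 static API. A minimal combined sketch of that 2.0-style usage, assuming only the names already shown in the updated docstrings (paddle.enable_static, paddle.static.create_parameter, paddle.static.create_global_var, paddle.ParamAttr) and not intended as a definitive reference:

    import paddle

    paddle.enable_static()

    # Trainable parameter registered directly in the static default program.
    W = paddle.static.create_parameter(shape=[784, 200], dtype='float32')

    # Persistable global variable, as in the updated create_global_var example.
    var = paddle.static.create_global_var(
        shape=[2, 3], value=1.0, dtype='float32',
        persistable=True, force_cpu=True, name='new_var')

    # ParamAttr configures how a layer creates and regularizes its weight.
    weight_attr = paddle.ParamAttr(
        name="weight",
        learning_rate=0.5,
        regularizer=paddle.regularizer.L2Decay(1.0),
        trainable=True)
    linear = paddle.nn.Linear(3, 4, weight_attr=weight_attr)
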
diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 0f65083dc52e7..909a1b6f39503 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -23,6 +23,7 @@ ] from . import nn +from ..fluid import Scope #DEFINE_ALIAS from .input import data #DEFINE_ALIAS from .input import InputSpec #DEFINE_ALIAS from ..fluid.executor import Executor #DEFINE_ALIAS @@ -50,3 +51,5 @@ from ..fluid.io import load_inference_model #DEFINE_ALIAS from ..fluid.io import load_program_state #DEFINE_ALIAS from ..fluid.io import set_program_state #DEFINE_ALIAS +from ..fluid.layers import create_parameter #DEFINE_ALIAS +from ..fluid.layers import create_global_var #DEFINE_ALIAS diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 510e11312f4ce..31a99f6282718 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -33,6 +33,7 @@ 'multi_box_head', 'nce', 'prelu', + 'py_func', 'row_conv', 'spectral_norm', 'switch_case', @@ -57,6 +58,7 @@ from ...fluid.layers import multi_box_head #DEFINE_ALIAS from ...fluid.layers import nce #DEFINE_ALIAS from ...fluid.layers import prelu #DEFINE_ALIAS +from ...fluid.layers import py_func #DEFINE_ALIAS from ...fluid.layers import row_conv #DEFINE_ALIAS from ...fluid.layers import spectral_norm #DEFINE_ALIAS from ...fluid.layers import switch_case #DEFINE_ALIAS diff --git a/tools/wlist.json b/tools/wlist.json index 3ca14cd1dd6f9..22bab658464cb 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -279,7 +279,6 @@ "thresholded_relu", "group_norm", "random_crop", - "py_func", "row_conv", "hard_shrink", "ssd_loss", From ab85a8910dd949439a4a955ac87ed9b1aff8a8e5 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Wed, 30 Sep 2020 15:27:42 +0800 Subject: [PATCH 05/91] [Dy2stat] Add Resnet Test for V2 APIs (#27459) * Add test_resnet_v2.py test=develop --- .../dygraph_to_static/test_resnet_v2.py | 362 ++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py new file mode 100644 index 0000000000000..75c251253c05a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -0,0 +1,362 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import math +import time +import unittest + +import numpy as np + +import paddle + +from predictor_utils import PredictorTools + +SEED = 2020 +IMAGENET1000 = 1281167 +base_lr = 0.001 +momentum_rate = 0.9 +l2_decay = 1e-4 +# NOTE: Reduce batch_size from 8 to 2 to avoid unittest timeout. 
+batch_size = 2 +epoch_num = 1 +place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ + else paddle.CPUPlace() +MODEL_SAVE_PATH = "./resnet_v2.inference.model" +DY_STATE_DICT_SAVE_PATH = "./resnet_v2.dygraph" +program_translator = paddle.jit.ProgramTranslator() + +if paddle.is_compiled_with_cuda(): + paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True}) + + +def optimizer_setting(parameter_list=None): + optimizer = paddle.optimizer.Momentum( + learning_rate=base_lr, + momentum=momentum_rate, + weight_decay=paddle.regularizer.L2Decay(l2_decay), + parameters=parameter_list) + + return optimizer + + +class ConvBNLayer(paddle.nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = paddle.nn.Conv2d( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias_attr=False) + + self._batch_norm = paddle.nn.BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class BottleneckBlock(paddle.nn.Layer): + def __init__(self, num_channels, num_filters, stride, shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + + layer_helper = paddle.fluid.layer_helper.LayerHelper( + self.full_name(), act='relu') + return layer_helper.append_activation(y) + + +class ResNet(paddle.nn.Layer): + def __init__(self, layers=50, class_dim=102): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') + self.pool2d_max = paddle.nn.Pool2D( + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + + self.bottleneck_block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut)) + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = paddle.nn.Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + + self.pool2d_avg_output = 
num_filters[len(num_filters) - 1] * 4 * 1 * 1 + + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = paddle.nn.Linear( + in_features=self.pool2d_avg_output, + out_features=class_dim, + weight_attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Uniform(-stdv, stdv))) + + @paddle.jit.to_static + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output]) + pred = self.out(y) + pred = paddle.nn.functional.softmax(pred) + + return pred + + +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + + return __reader__ + + +def train(to_static): + """ + Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. + """ + paddle.disable_static(place) + np.random.seed(SEED) + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + train_reader = paddle.batch( + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + data_loader = paddle.io.DataLoader.from_generator(capacity=5, iterable=True) + data_loader.set_sample_list_generator(train_reader) + + resnet = ResNet() + optimizer = optimizer_setting(parameter_list=resnet.parameters()) + + for epoch in range(epoch_num): + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + + for batch_id, data in enumerate(data_loader()): + start_time = time.time() + img, label = data + + pred = resnet(img) + loss = paddle.nn.functional.cross_entropy(input=pred, label=label) + avg_loss = paddle.mean(x=loss) + acc_top1 = paddle.metric.accuracy(input=pred, label=label, k=1) + acc_top5 = paddle.metric.accuracy(input=pred, label=label, k=5) + + avg_loss.backward() + optimizer.minimize(avg_loss) + resnet.clear_gradients() + + total_loss += avg_loss + total_acc1 += acc_top1 + total_acc5 += acc_top5 + total_sample += 1 + + end_time = time.time() + if batch_id % 2 == 0: + print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \ + ( epoch, batch_id, total_loss.numpy() / total_sample, \ + total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) + if batch_id == 10: + if to_static: + paddle.jit.save(resnet, MODEL_SAVE_PATH) + else: + paddle.fluid.dygraph.save_dygraph(resnet.state_dict(), + DY_STATE_DICT_SAVE_PATH) + # avoid dataloader throw abort signaal + data_loader._reset() + break + paddle.enable_static() + + return total_loss.numpy() + + +def predict_dygraph(data): + program_translator.enable(False) + paddle.disable_static(place) + resnet = ResNet() + + model_dict, _ = paddle.fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) + resnet.set_dict(model_dict) + resnet.eval() + + pred_res = resnet( + paddle.to_tensor( + data=data, dtype=None, place=None, stop_gradient=True)) + + ret = pred_res.numpy() + paddle.enable_static() + return ret + + +def predict_static(data): + exe = paddle.static.Executor(place) + [inference_program, feed_target_names, + fetch_targets] = paddle.static.load_inference_model( + MODEL_SAVE_PATH, + executor=exe, + params_filename=paddle.fluid.dygraph.io.VARIABLE_FILENAME) + + pred_res = exe.run(inference_program, + feed={feed_target_names[0]: data}, + 
fetch_list=fetch_targets) + + return pred_res[0] + + +def predict_dygraph_jit(data): + paddle.disable_static(place) + resnet = paddle.jit.load(MODEL_SAVE_PATH) + resnet.eval() + + pred_res = resnet(data) + + ret = pred_res.numpy() + paddle.enable_static() + return ret + + +def predict_analysis_inference(data): + output = PredictorTools(MODEL_SAVE_PATH, + paddle.fluid.dygraph.io.VARIABLE_FILENAME, [data]) + out = output() + return out + + +class TestResnet(unittest.TestCase): + def train(self, to_static): + program_translator.enable(to_static) + return train(to_static) + + def verify_predict(self): + image = np.random.random([1, 3, 224, 224]).astype('float32') + dy_pre = predict_dygraph(image) + st_pre = predict_static(image) + dy_jit_pre = predict_dygraph_jit(image) + predictor_pre = predict_analysis_inference(image) + self.assertTrue( + np.allclose(dy_pre, st_pre), + msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) + self.assertTrue( + np.allclose(dy_jit_pre, st_pre), + msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre)) + self.assertTrue( + np.allclose(predictor_pre, st_pre), + msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre, + st_pre)) + + def test_resnet(self): + static_loss = self.train(to_static=True) + dygraph_loss = self.train(to_static=False) + self.assertTrue( + np.allclose(static_loss, dygraph_loss), + msg="static_loss: {} \n dygraph_loss: {}".format(static_loss, + dygraph_loss)) + self.verify_predict() + + def test_in_static_mode_mkldnn(self): + paddle.fluid.set_flags({'FLAGS_use_mkldnn': True}) + try: + train(to_static=True) + finally: + paddle.fluid.set_flags({'FLAGS_use_mkldnn': False}) + + +if __name__ == '__main__': + unittest.main() From 4d3eefbb95589c4d620d95d306a72e96861ea154 Mon Sep 17 00:00:00 2001 From: xiemoyuan <71377852+xiemoyuan@users.noreply.github.com> Date: Wed, 30 Sep 2020 15:51:49 +0800 Subject: [PATCH 06/91] Modify the docs for Transformer's APIs. test=document_fix (#27729) --- python/paddle/nn/layer/transformer.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index e6df5366d216c..ea4f6970bc686 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -644,7 +644,7 @@ class TransformerDecoderLayer(Layer): `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . - bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property. If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` would be used as `bias_attr` for cross attention, and `bias_attr[2]` would be used as `bias_attr` @@ -982,12 +982,12 @@ class Transformer(Layer): applies another layer normalization on the output of last encoder/decoder layer. Parameters: - d_model (int): The expected feature size in the encoder/decoder input - and output. - nhead (int): The number of heads in multi-head attention(MHA). - num_encoder_layers (int): The number of layers in encoder. - num_encoder_layers (int): The number of layers in decoder. - dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + d_model (int, optional): The expected feature size in the encoder/decoder input + and output. 
Default 512 + nhead (int, optional): The number of heads in multi-head attention(MHA). Default 8 + num_encoder_layers (int, optional): The number of layers in encoder. Default 6 + num_decoder_layers (int, optional): The number of layers in decoder. Default 6 + dim_feedforward (int, optional): The hidden layer size in the feedforward network(FFN). Default 2048 dropout (float, optional): The dropout probability used in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 activation (str, optional): The activation function in the feedforward @@ -1015,7 +1015,7 @@ class Transformer(Layer): Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . - bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property. If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` would be used as `bias_attr` for cross attention of `TransformerDecoder`, @@ -1028,9 +1028,9 @@ class Transformer(Layer): The `False` value means the corresponding layer would not have trainable bias parameter. See usage for details in :code:`ParamAttr` . Default: None,which means the default bias parameter property is used. - custom_encoder (Layer): If custom encoder is provided, use it as the encoder. + custom_encoder (Layer, optional): If custom encoder is provided, use it as the encoder. Default None - custom_decoder (Layer): If custom decoder is provided, use it as the decoder. + custom_decoder (Layer, optional): If custom decoder is provided, use it as the decoder. Default None Examples: From 9b3ef5979d43ef13696a8af5c389e2b6c92fa79a Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 30 Sep 2020 17:12:26 +0800 Subject: [PATCH 07/91] add categorical class (#27695) * add multinomial cpu kernel * fix C++ notype error * fix windows ci array len error * let array len be const * change array to vector * add cuda kernrl with num_distribution is 1, and not support replacement=False * add multinomial python api * support num_distribution different multinomial distributions * add categorical class * fix test_distribution enable_static error * add unittest for different setting of Categorical * optimize format * little change * little change * add raise error if shape not match, optimize format * fix windows CI dtype error in concat * little changes * little changes2 * change values type to int64 * change values type to int64 * change values type to int64 --- python/paddle/distribution.py | 318 ++++++++++++- .../tests/unittests/test_distribution.py | 442 +++++++++++++++--- 2 files changed, 704 insertions(+), 56 deletions(-) diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index 35204affb3fd1..ff3e882229ae8 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -28,13 +28,14 @@ from .fluid import core from .fluid.framework import in_dygraph_mode from .tensor.math import elementwise_mul, elementwise_div, elementwise_add, elementwise_sub +from .tensor import arange, gather_nd, concat, multinomial import math import numpy as np import warnings from .fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype -__all__ = ['Distribution', 'Uniform', 'Normal'] +__all__ = ['Distribution', 'Uniform', 'Normal', 'Categorical'] class Distribution(object): @@ -640,3 +641,318 @@ def 
kl_divergence(self, other): t1 = (t1 * t1) return elementwise_add( 0.5 * var_ratio, 0.5 * (t1 - 1. - nn.log(var_ratio)), name=name) + + +class Categorical(Distribution): + """ + Categorical distribution is a discrete probability distribution that + describes the possible results of a random variable that can take on + one of K possible categories, with the probability of each category + separately specified. + + The probability mass function (pmf) is: + + .. math:: + + pmf(k; p_i) = \prod_{i=1}^{k} p_i^{[x=i]} + + In the above equation: + + * :math:`[x=i]` : it evaluates to 1 if :math:`x==i` , 0 otherwise. + + Args: + logits(list|numpy.ndarray|Tensor): The logits input of categorical distribution. The data type is float32 or float64. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + y = paddle.rand([6]) + print(y.numpy()) + # [0.6365463 , 0.7278677 , 0.90260243, + # 0.5226815 , 0.35837543, 0.13981032] + + cat = Categorical(x) + cat2 = Categorical(y) + + cat.sample([2,3]) + # [[5, 1, 1], + # [0, 1, 2]] + + cat.entropy() + # [1.71887] + + cat.kl_divergence(cat2) + # [0.0278455] + + value = paddle.to_tensor([2,1,3]) + cat.probs(value) + # [0.341613 0.342648 0.03123] + + cat.log_prob(value) + # [-1.07408 -1.07105 -3.46638] + + """ + + def __init__(self, logits, name=None): + """ + Args: + logits(list|numpy.ndarray|Variable): The logits input of categorical distribution. The data type is float32 or float64. + """ + if not in_dygraph_mode(): + check_type(logits, 'logits', (np.ndarray, tensor.Variable, list), + 'Categorical') + + self.name = name if name is not None else 'Categorical' + self.dtype = 'float32' + + if self._validate_args(logits): + self.logits = logits + self.dtype = convert_dtype(logits.dtype) + else: + if isinstance(logits, np.ndarray) and str( + logits.dtype) in ['float32', 'float64']: + self.dtype = logits.dtype + self.logits = self._to_tensor(logits)[0] + if self.dtype != convert_dtype(self.logits.dtype): + self.logits = tensor.cast(self.logits, dtype=self.dtype) + + def sample(self, shape): + """Generate samples of the specified shape. + + Args: + shape (list): Shape of the generated samples. + + Returns: + Tensor: A tensor with prepended dimensions shape. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + + cat = Categorical(x) + + cat.sample([2,3]) + # [[5, 1, 1], + # [0, 1, 2]] + + """ + name = self.name + '_sample' + if not in_dygraph_mode(): + check_type(shape, 'shape', (list), 'sample') + + num_samples = np.prod(np.array(shape)) + + logits_shape = list(self.logits.shape) + if len(logits_shape) > 1: + sample_shape = shape + logits_shape[:-1] + logits = nn.reshape(self.logits, + [np.prod(logits_shape[:-1]), logits_shape[-1]]) + else: + sample_shape = shape + logits = self.logits + + sample_index = multinomial(logits, num_samples, True) + return nn.reshape(sample_index, sample_shape, name=name) + + def kl_divergence(self, other): + """The KL-divergence between two Categorical distributions. + + Args: + other (Categorical): instance of Categorical. The data type is float32. + + Returns: + Variable: kl-divergence between two Categorical distributions. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + y = paddle.rand([6]) + print(y.numpy()) + # [0.6365463 , 0.7278677 , 0.90260243, + # 0.5226815 , 0.35837543, 0.13981032] + + cat = Categorical(x) + cat2 = Categorical(y) + + cat.kl_divergence(cat2) + # [0.0278455] + + """ + name = self.name + '_kl_divergence' + if not in_dygraph_mode(): + check_type(other, 'other', Categorical, 'kl_divergence') + + logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) + other_logits = other.logits - nn.reduce_max( + other.logits, dim=-1, keep_dim=True) + e_logits = ops.exp(logits) + other_e_logits = ops.exp(other_logits) + z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) + other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True) + prob = e_logits / z + kl = nn.reduce_sum( + prob * (logits - nn.log(z) - other_logits + nn.log(other_z)), + dim=-1, + keep_dim=True, + name=name) + + return kl + + def entropy(self): + """Shannon entropy in nats. + + Returns: + Variable: Shannon entropy of Categorical distribution. The data type is float32. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + + cat = Categorical(x) + + cat.entropy() + # [1.71887] + + """ + name = self.name + '_entropy' + logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True) + e_logits = ops.exp(logits) + z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True) + prob = e_logits / z + + neg_entropy = nn.reduce_sum( + prob * (logits - nn.log(z)), dim=-1, keep_dim=True) + entropy = nn.scale(neg_entropy, scale=-1.0, name=name) + return entropy + + def probs(self, value): + """Probabilities of the given category (``value``). + + If ``logits`` is 2-D or higher dimension, the last dimension will be regarded as + category, and the others represents the different distributions. + At the same time, if ``vlaue`` is 1-D Tensor, ``value`` will be broadcast to the + same number of distributions as ``logits``. + If ``value`` is not 1-D Tensor, ``value`` should have the same number distributions + with ``logits. That is, ``value[:-1] = logits[:-1]``. + + Args: + value (Tensor): The input tensor represents the selected category index. + + Returns: + Tensor: probability according to the category index. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + + cat = Categorical(x) + + value = paddle.to_tensor([2,1,3]) + cat.probs(value) + # [0.341613 0.342648 0.03123] + + """ + name = self.name + '_probs' + + dist_sum = nn.reduce_sum(self.logits, dim=-1, keep_dim=True) + prob = self.logits / dist_sum + + shape = list(prob.shape) + value_shape = list(value.shape) + if len(shape) == 1: + num_value_in_one_dist = np.prod(value_shape) + index_value = nn.reshape(value, [num_value_in_one_dist, 1]) + index = index_value + else: + num_dist = np.prod(shape[:-1]) + num_value_in_one_dist = value_shape[-1] + prob = nn.reshape(prob, [num_dist, shape[-1]]) + if len(value_shape) == 1: + value = nn.expand(value, [num_dist]) + value_shape = shape[:-1] + value_shape + index_value = nn.reshape(value, [num_dist, -1, 1]) + if shape[:-1] != value_shape[:-1]: + raise ValueError( + "shape of value {} must match shape of logits {}".format( + str(value_shape[:-1]), str(shape[:-1]))) + + index_prefix = nn.unsqueeze( + arange( + num_dist, dtype=index_value.dtype), axes=-1) + index_prefix = nn.expand(index_prefix, [1, num_value_in_one_dist]) + index_prefix = nn.unsqueeze(index_prefix, axes=-1) + + if index_value.dtype != index_prefix.dtype: + tensor.cast(index_prefix, dtype=index_value.dtype) + index = concat([index_prefix, index_value], axis=-1) + + # value is the category index to search for the corresponding probability. + select_prob = gather_nd(prob, index) + return nn.reshape(select_prob, value_shape, name=name) + + def log_prob(self, value): + """Log probabilities of the given category. Refer to ``probs`` method. + + Args: + value (Tensor): The input tensor represents the selected category index. + + Returns: + Tensor: Log probability. + + Examples: + .. code-block:: python + + import paddle + from paddle.distribution import Categorical + + x = paddle.rand([6]) + print(x.numpy()) + # [0.32564053, 0.99334985, 0.99034804, + # 0.09053693, 0.30820143, 0.19095989] + + cat = Categorical(x) + + value = paddle.to_tensor([2,1,3]) + + cat.log_prob(value) + # [-1.07408 -1.07105 -3.46638] + + """ + name = self.name + '_log_prob' + + return nn.log(self.probs(value), name=name) diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py index 40611fed65260..d5790811df94f 100644 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -65,41 +65,6 @@ def entropy(self): return np.log(self.high - self.low) -class NormalNumpy(DistributionNumpy): - def __init__(self, loc, scale): - self.loc = np.array(loc) - self.scale = np.array(scale) - if str(self.loc.dtype) not in ['float32', 'float64']: - self.loc = self.loc.astype('float32') - self.scale = self.scale.astype('float32') - - def sample(self, shape): - shape = tuple(shape) + (self.loc + self.scale).shape - return self.loc + (np.random.randn(*shape) * self.scale) - - def log_prob(self, value): - var = self.scale * self.scale - log_scale = np.log(self.scale) - return -((value - self.loc) * (value - self.loc)) / ( - 2. * var) - log_scale - math.log(math.sqrt(2. * math.pi)) - - def probs(self, value): - var = self.scale * self.scale - return np.exp(-1. * ((value - self.loc) * (value - self.loc)) / - (2. 
* var)) / (math.sqrt(2 * math.pi) * self.scale) - - def entropy(self): - return 0.5 + 0.5 * np.log( - np.array(2. * math.pi).astype(self.loc.dtype)) + np.log(self.scale) - - def kl_divergence(self, other): - var_ratio = (self.scale / other.scale) - var_ratio = var_ratio * var_ratio - t1 = ((self.loc - other.loc) / other.scale) - t1 = (t1 * t1) - return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) - - class UniformTest(unittest.TestCase): def setUp(self, use_gpu=False, batch_size=5, dims=6): self.use_gpu = use_gpu @@ -336,6 +301,41 @@ def init_static_data(self, batch_size, dims): name='values', shape=[dims], dtype='float32') +class NormalNumpy(DistributionNumpy): + def __init__(self, loc, scale): + self.loc = np.array(loc) + self.scale = np.array(scale) + if str(self.loc.dtype) not in ['float32', 'float64']: + self.loc = self.loc.astype('float32') + self.scale = self.scale.astype('float32') + + def sample(self, shape): + shape = tuple(shape) + (self.loc + self.scale).shape + return self.loc + (np.random.randn(*shape) * self.scale) + + def log_prob(self, value): + var = self.scale * self.scale + log_scale = np.log(self.scale) + return -((value - self.loc) * (value - self.loc)) / ( + 2. * var) - log_scale - math.log(math.sqrt(2. * math.pi)) + + def probs(self, value): + var = self.scale * self.scale + return np.exp(-1. * ((value - self.loc) * (value - self.loc)) / + (2. * var)) / (math.sqrt(2 * math.pi) * self.scale) + + def entropy(self): + return 0.5 + 0.5 * np.log( + np.array(2. * math.pi).astype(self.loc.dtype)) + np.log(self.scale) + + def kl_divergence(self, other): + var_ratio = (self.scale / other.scale) + var_ratio = var_ratio * var_ratio + t1 = ((self.loc - other.loc) / other.scale) + t1 = (t1 * t1) + return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) + + class NormalTest(unittest.TestCase): def setUp(self, use_gpu=False, batch_size=2, dims=3): self.use_gpu = use_gpu @@ -559,26 +559,6 @@ def init_static_data(self, batch_size, dims): class NormalTest6(NormalTest): - def init_data(self, batch_size=2, dims=3): - # loc and scale are Tensor with dtype 'VarType.FP32'. - self.loc_np = np.random.randn(batch_size, dims).astype('float32') - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(self.scale_np > 0): - self.scale_np = np.random.randn(batch_size, dims).astype('float32') - self.values_np = np.random.randn(batch_size, dims).astype('float32') - self.loc = paddle.to_tensor(self.loc_np) - self.scale = paddle.to_tensor(self.scale_np) - self.values = paddle.to_tensor(self.values_np) - # used to construct another Normal object to calculate kl_divergence - self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - while not np.all(self.scale_np > 0): - self.other_scale_np = np.random.randn(batch_size, - dims).astype('float32') - self.other_loc = paddle.to_tensor(self.other_loc_np) - self.other_scale = paddle.to_tensor(self.other_scale_np) - def init_numpy_data(self, batch_size, dims): # loc and scale are Tensor with dtype 'VarType.FP32'. 
self.loc_np = np.random.randn(batch_size, dims).astype('float32') @@ -693,6 +673,294 @@ def init_static_data(self, batch_size, dims): name='other_scale', shape=[dims], dtype='float64') +class CategoricalNumpy(DistributionNumpy): + def __init__(self, logits): + self.logits = np.array(logits).astype('float32') + + def entropy(self): + logits = self.logits - np.max(self.logits, axis=-1, keepdims=True) + e_logits = np.exp(logits) + z = np.sum(e_logits, axis=-1, keepdims=True) + prob = e_logits / z + return -1. * np.sum(prob * (logits - np.log(z)), axis=-1, keepdims=True) + + def kl_divergence(self, other): + logits = self.logits - np.max(self.logits, axis=-1, keepdims=True) + other_logits = other.logits - np.max( + other.logits, axis=-1, keepdims=True) + e_logits = np.exp(logits) + other_e_logits = np.exp(other_logits) + z = np.sum(e_logits, axis=-1, keepdims=True) + other_z = np.sum(other_e_logits, axis=-1, keepdims=True) + prob = e_logits / z + return np.sum(prob * (logits - np.log(z) - other_logits \ + + np.log(other_z)), axis=-1, keepdims=True) + + +class CategoricalTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=3, dims=5): + self.use_gpu = use_gpu + if not use_gpu: + self.place = fluid.CPUPlace() + self.gpu_id = -1 + else: + self.place = fluid.CUDAPlace(0) + self.gpu_id = 0 + + self.batch_size = batch_size + self.dims = dims + self.init_numpy_data(batch_size, dims) + + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) + + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # input logtis is 2-D Tensor + # value used in probs and log_prob method is 1-D Tensor + self.logits_np = np.random.rand(batch_size, dims).astype('float32') + self.other_logits_np = np.random.rand(batch_size, + dims).astype('float32') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [batch_size, dims] + # dist_shape = logits_shape[:-1], it represents the number of + # different distributions. + self.dist_shape = [batch_size] + # sample shape represents the number of samples + self.sample_shape = [2, 4] + # value used in probs and log_prob method + # If value is 1-D and logits is 2-D or higher dimension, value will be + # broadcasted to have the same number of distributions with logits. + # If value is 2-D or higher dimentsion, it should have the same number + # of distributions with logtis. 
``value[:-1] = logits[:-1] + self.value_shape = [3] + + def init_dynamic_data(self, batch_size, dims): + self.logits = paddle.to_tensor(self.logits_np) + self.other_logits = paddle.to_tensor(self.other_logits_np) + self.value = paddle.to_tensor(self.value_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = fluid.data( + name='logits', shape=self.logits_shape, dtype='float32') + self.other_logits_static = fluid.data( + name='other_logits', shape=self.logits_shape, dtype='float32') + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + def get_numpy_selected_probs(self, probability): + np_probs = np.zeros(self.dist_shape + self.value_shape) + for i in range(self.batch_size): + for j in range(3): + np_probs[i][j] = probability[i][self.value_np[j]] + return np_probs + + def compare_with_numpy(self, fetch_list, tolerance=1e-6): + sample, entropy, kl, probs, log_prob = fetch_list + log_tolerance = 1e-4 + + np.testing.assert_equal(sample.shape, + self.sample_shape + self.dist_shape) + + np_categorical = CategoricalNumpy(self.logits_np) + np_other_categorical = CategoricalNumpy(self.other_logits_np) + np_entropy = np_categorical.entropy() + np_kl = np_categorical.kl_divergence(np_other_categorical) + + np.testing.assert_allclose( + entropy, np_entropy, rtol=log_tolerance, atol=log_tolerance) + np.testing.assert_allclose( + kl, np_kl, rtol=log_tolerance, atol=log_tolerance) + + sum_dist = np.sum(self.logits_np, axis=-1, keepdims=True) + probability = self.logits_np / sum_dist + np_probs = self.get_numpy_selected_probs(probability) + np_log_prob = np.log(np_probs) + + np.testing.assert_allclose( + probs, np_probs, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose( + log_prob, np_log_prob, rtol=tolerance, atol=tolerance) + + def test_categorical_distribution_dygraph(self, tolerance=1e-6): + paddle.disable_static(self.place) + categorical = Categorical(self.logits) + other_categorical = Categorical(self.other_logits) + + sample = categorical.sample(self.sample_shape).numpy() + entropy = categorical.entropy().numpy() + kl = categorical.kl_divergence(other_categorical).numpy() + probs = categorical.probs(self.value).numpy() + log_prob = categorical.log_prob(self.value).numpy() + + fetch_list = [sample, entropy, kl, probs, log_prob] + self.compare_with_numpy(fetch_list) + + def test_categorical_distribution_static(self, tolerance=1e-6): + paddle.enable_static() + with fluid.program_guard(self.test_program): + categorical = Categorical(self.logits_static) + other_categorical = Categorical(self.other_logits_static) + + sample = categorical.sample(self.sample_shape) + entropy = categorical.entropy() + kl = categorical.kl_divergence(other_categorical) + probs = categorical.probs(self.value_static) + log_prob = categorical.log_prob(self.value_static) + + fetch_list = [sample, entropy, kl, probs, log_prob] + + feed_vars = { + 'logits': self.logits_np, + 'other_logits': self.other_logits_np, + 'value': self.value_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class CategoricalTest2(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 2-D Tensor with dtype Float64 + # value used in probs and log_prob method is 1-D Tensor + self.logits_np = np.random.rand(batch_size, dims).astype('float64') + 
self.other_logits_np = np.random.rand(batch_size, + dims).astype('float64') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [batch_size, dims] + self.dist_shape = [batch_size] + self.sample_shape = [2, 4] + self.value_shape = [3] + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = fluid.data( + name='logits', shape=self.logits_shape, dtype='float64') + self.other_logits_static = fluid.data( + name='other_logits', shape=self.logits_shape, dtype='float64') + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + +class CategoricalTest3(CategoricalTest): + def init_dynamic_data(self, batch_size, dims): + # input logtis is 2-D numpy.ndarray with dtype Float32 + # value used in probs and log_prob method is 1-D Tensor + self.logits = self.logits_np + self.other_logits = self.other_logits_np + self.value = paddle.to_tensor(self.value_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = self.logits_np + self.other_logits_static = self.other_logits_np + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + +class CategoricalTest4(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 2-D numpy.ndarray with dtype Float64 + # value used in probs and log_prob method is 1-D Tensor + self.logits_np = np.random.rand(batch_size, dims).astype('float64') + self.other_logits_np = np.random.rand(batch_size, + dims).astype('float64') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [batch_size, dims] + self.dist_shape = [batch_size] + self.sample_shape = [2, 4] + self.value_shape = [3] + + def init_dynamic_data(self, batch_size, dims): + self.logits = self.logits_np + self.other_logits = self.other_logits_np + self.value = paddle.to_tensor(self.value_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.logits_static = self.logits_np + self.other_logits_static = self.other_logits_np + self.value_static = fluid.data( + name='value', shape=self.value_shape, dtype='int64') + + +# test shape of logits and value used in probs and log_prob method +class CategoricalTest5(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 1-D Tensor + # value used in probs and log_prob method is 1-D Tensor + self.logits_np = np.random.rand(dims).astype('float32') + self.other_logits_np = np.random.rand(dims).astype('float32') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [dims] + self.dist_shape = [] + self.sample_shape = [2, 4] + self.value_shape = [3] + + def get_numpy_selected_probs(self, probability): + np_probs = np.zeros(self.value_shape) + for i in range(3): + np_probs[i] = probability[self.value_np[i]] + return np_probs + + +class CategoricalTest6(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 2-D Tensor + # value used in probs and log_prob method has the same number of batches with input + self.logits_np = np.random.rand(3, 5).astype('float32') + self.other_logits_np = np.random.rand(3, 5).astype('float32') + self.value_np = np.array([[2, 1], [0, 3], [2, 3]]).astype('int64') + + self.logits_shape = [3, 5] + self.dist_shape = [3] + self.sample_shape = [2, 4] + self.value_shape = [3, 2] + + def get_numpy_selected_probs(self, probability): + np_probs = np.zeros(self.value_shape) 
+ for i in range(3): + for j in range(2): + np_probs[i][j] = probability[i][self.value_np[i][j]] + return np_probs + + +class CategoricalTest7(CategoricalTest): + def init_numpy_data(self, batch_size, dims): + # input logtis is 3-D Tensor + # value used in probs and log_prob method has the same number of distribuions with input + self.logits_np = np.random.rand(3, 2, 5).astype('float32') + self.other_logits_np = np.random.rand(3, 2, 5).astype('float32') + self.value_np = np.array([2, 1, 3]).astype('int64') + + self.logits_shape = [3, 2, 5] + self.dist_shape = [3, 2] + self.sample_shape = [2, 4] + self.value_shape = [3] + + def get_numpy_selected_probs(self, probability): + np_probs = np.zeros(self.dist_shape + self.value_shape) + for i in range(3): + for j in range(2): + for k in range(3): + np_probs[i][j][k] = probability[i][j][self.value_np[k]] + return np_probs + + class DistributionTestError(unittest.TestCase): def test_distribution_error(self): distribution = Distribution() @@ -711,6 +979,7 @@ def test_distribution_error(self): self.assertRaises(NotImplementedError, distribution.probs, value_tensor) def test_normal_error(self): + paddle.enable_static() normal = Normal(0.0, 1.0) value = [1.0, 2.0] @@ -734,6 +1003,7 @@ def test_normal_error(self): self.assertRaises(TypeError, normal.kl_divergence, normal_other) def test_uniform_error(self): + paddle.enable_static() uniform = Uniform(0.0, 1.0) value = [1.0, 2.0] @@ -752,6 +1022,39 @@ def test_uniform_error(self): # type of seed must be int self.assertRaises(TypeError, uniform.sample, [2, 3], seed) + def test_categorical_error(self): + paddle.enable_static() + + categorical = Categorical([0.4, 0.6]) + + value = [1, 0] + # type of value must be variable + self.assertRaises(AttributeError, categorical.log_prob, value) + + value = [1, 0] + # type of value must be variable + self.assertRaises(AttributeError, categorical.probs, value) + + shape = 1.0 + # type of shape must be list + self.assertRaises(TypeError, categorical.sample, shape) + + categorical_other = Uniform(1.0, 2.0) + # type of other must be an instance of Categorical + self.assertRaises(TypeError, categorical.kl_divergence, + categorical_other) + + def test_shape_not_match_error(): + # shape of value must match shape of logits + # value_shape[:-1] == logits_shape[:-1] + paddle.disable_static() + logits = paddle.rand([3, 5]) + cat = Categorical(logits) + value = paddle.to_tensor([[2, 1, 3], [3, 2, 1]], dtype='int64') + cat.log_prob(value) + + self.assertRaises(ValueError, test_shape_not_match_error) + class DistributionTestName(unittest.TestCase): def get_prefix(self, string): @@ -812,6 +1115,35 @@ def test_uniform_name(self): p = uniform1.probs(value_tensor) self.assertEqual(self.get_prefix(p.name), name + '_probs') + def test_categorical_name(self): + name = 'test_categorical' + categorical1 = Categorical([0.4, 0.6], name=name) + self.assertEqual(categorical1.name, name) + + categorical2 = Categorical([0.5, 0.5]) + self.assertEqual(categorical2.name, 'Categorical') + + paddle.enable_static() + + sample = categorical1.sample([2]) + self.assertEqual(self.get_prefix(sample.name), name + '_sample') + + entropy = categorical1.entropy() + self.assertEqual(self.get_prefix(entropy.name), name + '_entropy') + + kl = categorical1.kl_divergence(categorical2) + self.assertEqual(self.get_prefix(kl.name), name + '_kl_divergence') + + value_npdata = np.array([0], dtype="int64") + value_tensor = layers.create_tensor(dtype="int64") + layers.assign(value_npdata, value_tensor) + + p = 
categorical1.probs(value_tensor) + self.assertEqual(self.get_prefix(p.name), name + '_probs') + + lp = categorical1.log_prob(value_tensor) + self.assertEqual(self.get_prefix(lp.name), name + '_log_prob') + if __name__ == '__main__': unittest.main() From a0f1dba37fc93283c9af893d1045968bec782e90 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Wed, 30 Sep 2020 17:14:28 +0800 Subject: [PATCH 08/91] Add visualdl callback function (#27565) * add visualdl callback --- python/paddle/hapi/callbacks.py | 112 +++++++++++++++++++++++++- python/paddle/tests/test_callbacks.py | 28 +++++++ python/unittest_py/requirements.txt | 1 + 3 files changed, 140 insertions(+), 1 deletion(-) diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 69b7fedd72eed..4a1751b331d21 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -13,12 +13,14 @@ # limitations under the License. import os +import numbers from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.utils import try_import from .progressbar import ProgressBar -__all__ = ['Callback', 'ProgBarLogger', 'ModelCheckpoint'] +__all__ = ['Callback', 'ProgBarLogger', 'ModelCheckpoint', 'VisualDL'] def config_callbacks(callbacks=None, @@ -471,3 +473,111 @@ def on_train_end(self, logs=None): path = '{}/final'.format(self.save_dir) print('save checkpoint at {}'.format(os.path.abspath(path))) self.model.save(path) + + +class VisualDL(Callback): + """VisualDL callback function + Args: + log_dir (str): The directory to save visualdl log file. + + Examples: + .. code-block:: python + + import paddle + from paddle.static import InputSpec + + inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] + labels = [InputSpec([None, 1], 'int64', 'label')] + + train_dataset = paddle.vision.datasets.MNIST(mode='train') + eval_dataset = paddle.vision.datasets.MNIST(mode='test') + + net = paddle.vision.LeNet() + model = paddle.Model(net, inputs, labels) + + optim = paddle.optimizer.Adam(0.001, parameters=net.parameters()) + model.prepare(optimizer=optim, + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy()) + + ## uncomment following lines to fit model with visualdl callback function + # callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir') + # model.fit(train_dataset, eval_dataset, batch_size=64, callbacks=callback) + + """ + + def __init__(self, log_dir): + self.log_dir = log_dir + self.epochs = None + self.steps = None + self.epoch = 0 + + def _is_write(self): + return ParallelEnv().local_rank == 0 + + def on_train_begin(self, logs=None): + self.epochs = self.params['epochs'] + assert self.epochs + self.train_metrics = self.params['metrics'] + assert self.train_metrics + self._is_fit = True + self.train_step = 0 + + def on_epoch_begin(self, epoch=None, logs=None): + self.steps = self.params['steps'] + self.epoch = epoch + + def _updates(self, logs, mode): + if not self._is_write(): + return + if not hasattr(self, 'writer'): + visualdl = try_import('visualdl') + self.writer = visualdl.LogWriter(self.log_dir) + + metrics = getattr(self, '%s_metrics' % (mode)) + current_step = getattr(self, '%s_step' % (mode)) + + if mode == 'train': + total_step = current_step + else: + total_step = self.epoch + + for k in metrics: + if k in logs: + temp_tag = mode + '/' + k + + if isinstance(logs[k], (list, tuple)): + temp_value = logs[k][0] + elif isinstance(logs[k], numbers.Number): + temp_value = logs[k] + else: + continue + + 
self.writer.add_scalar( + tag=temp_tag, step=total_step, value=temp_value) + + def on_train_batch_end(self, step, logs=None): + logs = logs or {} + self.train_step += 1 + + if self._is_write(): + self._updates(logs, 'train') + + def on_eval_begin(self, logs=None): + self.eval_steps = logs.get('steps', None) + self.eval_metrics = logs.get('metrics', []) + self.eval_step = 0 + self.evaled_samples = 0 + + def on_train_end(self, logs=None): + if hasattr(self, 'writer'): + self.writer.close() + delattr(self, 'writer') + + def on_eval_end(self, logs=None): + if self._is_write(): + self._updates(logs, 'eval') + + if (not hasattr(self, '_is_fit')) and hasattr(self, 'writer'): + self.writer.close() + delattr(self, 'writer') diff --git a/python/paddle/tests/test_callbacks.py b/python/paddle/tests/test_callbacks.py index f0d9a132b90eb..b9442c46b8fd4 100644 --- a/python/paddle/tests/test_callbacks.py +++ b/python/paddle/tests/test_callbacks.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import unittest import time import random import tempfile import shutil +import paddle from paddle import Model from paddle.static import InputSpec @@ -102,6 +104,32 @@ def test_callback_verbose_2(self): self.verbose = 2 self.run_callback() + def test_visualdl_callback(self): + # visualdl not support python3 + if sys.version_info < (3, ): + return + + inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] + labels = [InputSpec([None, 1], 'int64', 'label')] + + train_dataset = paddle.vision.datasets.MNIST(mode='train') + eval_dataset = paddle.vision.datasets.MNIST(mode='test') + + net = paddle.vision.LeNet() + model = paddle.Model(net, inputs, labels) + + optim = paddle.optimizer.Adam(0.001, parameters=net.parameters()) + model.prepare( + optimizer=optim, + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy()) + + callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir') + model.fit(train_dataset, + eval_dataset, + batch_size=64, + callbacks=callback) + if __name__ == '__main__': unittest.main() diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 56c8be862f887..389d45fc6b95e 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -2,3 +2,4 @@ PyGithub coverage pycrypto ; platform_system != "Windows" mock +visualdl ; python_version>="3.5" From 69a3339aaa7429e72b9dc512143c101aad2ceeed Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 30 Sep 2020 18:04:11 +0800 Subject: [PATCH 09/91] Move dygraph amp api to paddle-2.0 (#27681) * move dygraph amp api to paddle * refine code and add unit test --- python/paddle/__init__.py | 1 + python/paddle/amp/__init__.py | 18 +++ python/paddle/amp/auto_cast.py | 52 +++++++ python/paddle/amp/grad_scaler.py | 136 ++++++++++++++++++ .../test_imperative_auto_mixed_precision.py | 78 ++++++++++ python/setup.py.in | 1 + 6 files changed, 286 insertions(+) create mode 100644 python/paddle/amp/__init__.py create mode 100644 python/paddle/amp/auto_cast.py create mode 100644 python/paddle/amp/grad_scaler.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 84713d513fb68..3c52bbdcccaf8 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -272,6 +272,7 @@ from . import jit from . import static +from . 
import amp # high-level api from .hapi import Model diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py new file mode 100644 index 0000000000000..32587938512c4 --- /dev/null +++ b/python/paddle/amp/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .auto_cast import auto_cast +from .grad_scaler import GradScaler + +__all__ = ['auto_cast', 'GradScaler'] diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py new file mode 100644 index 0000000000000..e33f6e2afc846 --- /dev/null +++ b/python/paddle/amp/auto_cast.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.dygraph.amp import amp_guard + +__all__ = ['auto_cast'] + + +def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): + """ + Create a context which enables auto-mixed-precision(AMP) of operators executed in dynamic graph mode. + If enabled, the input data type (float32 or float16) of each operator is decided + by autocast algorithm for better performance. + + Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in + imperative mode. + + Args: + enable(bool, optional): Enable auto-mixed-precision or not. Default is True. + custom_white_list(set|list, optional): The custom white_list. + custom_black_list(set|list, optional): The custom black_list. + + Examples: + + .. code-block:: python + + import paddle + + conv2d = paddle.nn.Conv2d(3, 2, 3, bias_attr=False) + data = paddle.rand([10, 3, 32, 32]) + + with paddle.amp.auto_cast(): + conv = conv2d(data) + print(conv.dtype) # FP16 + + with paddle.amp.auto_cast(enable=False): + conv = conv2d(data) + print(conv.dtype) # FP32 + + """ + return amp_guard(enable, custom_white_list, custom_black_list) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py new file mode 100644 index 0000000000000..9476f3765b3bc --- /dev/null +++ b/python/paddle/amp/grad_scaler.py @@ -0,0 +1,136 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.dygraph.amp import AmpScaler + +__all__ = ['GradScaler'] + + +class GradScaler(AmpScaler): + """ + GradScaler is used for Auto-Mixed-Precision training/inferring in dynamic graph + mode. It controls the scaling of loss, helps avoiding numerical overflow. + The object of this class has two methods `scale()`, `minimize()`. + + `scale()` is used to multiply the loss by a scale ratio. + `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating. + + Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in + dynamic graph mode. + + Args: + enable(bool, optional): Enable loss scaling or not. Default is True. + init_loss_scaling (float, optional): The initial loss scaling factor. Default is 2**15. + incr_ratio(float, optional): The multiplier to use when increasing the loss + scaling. Default is 2.0. + decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing + the loss scaling. Default is 0.5. + incr_every_n_steps(int, optional): Increases loss scaling every n consecutive + steps with finite gradients. Default is 1000. + decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n + accumulated steps with nan or inf gradients. Default is 2. + use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. + Returns: + An AmpScaler object. + + Examples: + + .. code-block:: python + + import paddle + + model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.reduce_mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters + """ + + def __init__(self, + enable=True, + init_loss_scaling=2.**15, + incr_ratio=2.0, + decr_ratio=0.5, + incr_every_n_steps=1000, + decr_every_n_nan_or_inf=1, + use_dynamic_loss_scaling=True): + super(GradScaler, self).__init__(enable, init_loss_scaling, incr_ratio, + decr_ratio, incr_every_n_steps, + decr_every_n_nan_or_inf, + use_dynamic_loss_scaling) + + def scale(self, var): + """ + Multiplies a Tensor by the scale factor and returns scaled outputs. + If this instance of :class:`GradScaler` is not enabled, output are returned unmodified. + + Args: + var (Tensor): The tensor to scale. + Returns: + The scaled tensor or original tensor. + + Examples: + .. 
code-block:: python + + import paddle + + model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.reduce_mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters + """ + return super(GradScaler, self).scale(var) + + def minimize(self, optimizer, *args, **kwargs): + """ + This function is similar as `Optimizer.minimize()`, which performs parameters updating. + + If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. + Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. + + Finally, the loss scaling ratio is updated. + + Args: + optimizer(Optimizer): The optimizer used to update parameters. + args: Arguments, which will be forward to `optimizer.minimize()`. + kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`. + + Examples: + .. code-block:: python + + import paddle + + model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.reduce_mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters + """ + return super(GradScaler, self).minimize(optimizer, *args, **kwargs) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index fdf7adbfb45f0..71381ecfde738 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -196,6 +196,84 @@ def test_nan_inf(self): np.array_equal(param.numpy(), params_init[param.name])) +class TestResnet2(unittest.TestCase): + def train_resnet(self, enable_amp=True): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 1 + + paddle.disable_static() + + paddle.manual_seed(seed) + paddle.framework.random._manual_program_seed(seed) + + resnet = ResNet(use_cudnn=True) + optimizer = optimizer_setting( + train_parameters, parameter_list=resnet.parameters()) + np.random.seed(seed) + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) + + dy_param_init_value = {} + for param in resnet.parameters(): + dy_param_init_value[param.name] = param.numpy() + + program = None + scaler = paddle.amp.GradScaler( + enable=enable_amp, init_loss_scaling=2.**10) + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array([x[1] for x in data]).astype('int64').reshape(-1, + 1) + img = paddle.to_tensor(dy_x_data) + label = paddle.to_tensor(y_data) + label.stop_gradient = True + + with paddle.amp.auto_cast(enable=enable_amp): + out = resnet(img) + + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + avg_loss = 
paddle.mean(x=loss) + + dy_out = avg_loss.numpy() + + scaled_loss = scaler.scale(avg_loss) + scaled_loss.backward() + + scaler.minimize(optimizer, scaled_loss) + + dy_grad_value = {} + for param in resnet.parameters(): + if param.trainable: + np_array = np.array(param._grad_ivar().value().get_tensor()) + dy_grad_value[param.name + fluid.core.grad_var_suffix( + )] = np_array + + resnet.clear_gradients() + + dy_param_value = {} + for param in resnet.parameters(): + dy_param_value[param.name] = param.numpy() + + paddle.enable_static() + + return dy_out, dy_param_value, dy_grad_value + + def test_resnet(self): + out_fp32 = self.train_resnet(enable_amp=False) + out_amp = self.train_resnet(enable_amp=True) + print(out_fp32[0], out_amp[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) + + class TestResnet(unittest.TestCase): def train_resnet(self, enable_amp=True): seed = 90 diff --git a/python/setup.py.in b/python/setup.py.in index 414258a3b3756..f09c189a68e1c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -192,6 +192,7 @@ packages=['paddle', 'paddle.fluid.incubate.fleet.parameter_server.ir', 'paddle.fluid.incubate.fleet.collective', 'paddle.fluid.incubate.fleet.utils', + 'paddle.amp', 'paddle.hapi', 'paddle.vision', 'paddle.vision.models', From 54c368db1e3b1b34fa6a65f5249a84d2811dd62a Mon Sep 17 00:00:00 2001 From: 123malin Date: Wed, 30 Sep 2020 18:37:29 +0800 Subject: [PATCH 10/91] [API 2.0: doc] fix doc of nonzero (#27685) * test=develop, update example --- python/paddle/fluid/optimizer.py | 48 +++++++++++-------- .../fluid/tests/unittests/test_nonzero_api.py | 8 ++++ python/paddle/optimizer/__init__.py | 14 +++--- python/paddle/tensor/search.py | 19 ++++---- 4 files changed, 49 insertions(+), 40 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 761f6409fed76..4a9ce4454af0b 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4884,29 +4884,35 @@ class LookaheadOptimizer(object): import paddle import paddle.fluid as fluid import numpy as np + import numpy.random as random - x = fluid.layers.data(name='x', shape=[2], dtype='float32') - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - y = fluid.layers.fc(input=[x], size=2, act="softmax") - loss = fluid.layers.cross_entropy(input=y, label=label) - loss = fluid.layers.mean(x=loss) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fluid.optimizer.LookaheadOptimizer(sgd, - alpha=0.5, - k=5) - optimizer.minimize(loss) - main_program = fluid.default_main_program() - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - feeder = fluid.DataFeeder(feed_list=[x, label], place=place) + paddle.enable_static() + + x = fluid.layers.data(name='x', shape=[2], dtype='float32') + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + y = fluid.layers.fc(input=[x], size=2, act="softmax") + loss = fluid.layers.cross_entropy(input=y, label=label) + loss = fluid.layers.mean(x=loss) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fluid.optimizer.LookaheadOptimizer(sgd, + alpha=0.5, + k=5) + optimizer.minimize(loss) + main_program = fluid.default_main_program() + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - step = 0 - while(step < 10): - step += 1 - exe.run(fluid.default_main_program(), - feed=feeder.feed(batch_data)) + def train_reader(limit=5): + for i in range(limit): + yield 
random.random([2]).astype('float32'), random.random([1]).astype('int64') + + feeder = fluid.DataFeeder(feed_list=[x, label], place=place) + reader = paddle.batch(paddle.reader.shuffle(train_reader, buf_size=50000),batch_size=1) + + for batch_data in reader(): + exe.run(fluid.default_main_program(), + feed=feeder.feed(batch_data)) """ diff --git a/python/paddle/fluid/tests/unittests/test_nonzero_api.py b/python/paddle/fluid/tests/unittests/test_nonzero_api.py index 0e68f9d5be761..8569be82db09e 100644 --- a/python/paddle/fluid/tests/unittests/test_nonzero_api.py +++ b/python/paddle/fluid/tests/unittests/test_nonzero_api.py @@ -76,6 +76,14 @@ def test_nonzero_api(self): expect_out = np.array([[0], [1]]) self.assertTrue(np.allclose(expect_out, np.array(res))) + def test_dygraph_api(self): + data_x = np.array([[True, False], [False, True]]) + with fluid.dygraph.guard(): + x = fluid.dygraph.to_variable(data_x) + z = paddle.nonzero(x) + np_z = z.numpy() + expect_out = np.array([[0, 0], [1, 1]]) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 30de88cc29e76..6f485e2e9d62f 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -15,19 +15,17 @@ __all__ = [ 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd', - 'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'LookaheadOptimizer', - 'ModelAverage', 'Momentum', 'MomentumOptimizer', 'RMSProp', 'SGD', - 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR', 'PiecewiseLR', - 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', 'LinearLrWarmup', - 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', 'ReduceLROnPlateau', - 'CosineAnnealingLR' + 'DpsgdOptimizer', 'Ftrl', 'FtrlOptimizer', 'Momentum', 'MomentumOptimizer', + 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR', + 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', + 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', + 'ReduceLROnPlateau', 'CosineAnnealingLR' ] from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\ AdagradOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, \ - FtrlOptimizer, AdadeltaOptimizer, ModelAverage, \ - LookaheadOptimizer + FtrlOptimizer, AdadeltaOptimizer from .optimizer import Optimizer from .adam import Adam diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index f55d285586f0e..19d8fc58b0e7e 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -339,11 +339,8 @@ def index_select(x, index, axis=0, name=None): return out -def nonzero(input, as_tuple=False): +def nonzero(x, as_tuple=False): """ - :alias_main: paddle.nonzero - :alias: paddle.nonzero,paddle.tensor.nonzero,paddle.tensor.search.nonzero - Return a tensor containing the indices of all non-zero elements of the `input` tensor. If as_tuple is True, return a tuple of 1-D tensors, one for each dimension in `input`, each containing the indices (in that dimension) of all non-zero elements @@ -353,17 +350,17 @@ def nonzero(input, as_tuple=False): a 1-D tensor tuple of length `n`, and the shape of each 1-D tensor is [z, 1]. Args: - inputs (Variable): The input tensor variable. + x (Tensor): The input tensor variable. as_tuple (bool): Return type, Tensor or tuple of Tensor. Returns: - Variable. The data type is int64. + Tensor. The data type is int64. Examples: + .. 
code-block:: python - import paddle - paddle.disable_static() + import paddle x1 = paddle.to_tensor([[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], @@ -402,13 +399,13 @@ def nonzero(input, as_tuple=False): #[] """ list_out = [] - shape = input.shape + shape = x.shape rank = len(shape) if in_dygraph_mode(): - outs = core.ops.where_index(input) + outs = core.ops.where_index(x) else: - outs = layers.where(input) + outs = layers.where(x) if not as_tuple: return outs From 9cd86487cc46ff933eda0b12cd4bfe5d303f4221 Mon Sep 17 00:00:00 2001 From: ysh329 Date: Wed, 30 Sep 2020 07:19:22 -0500 Subject: [PATCH 11/91] Fix api for ErrorClipByValue, code demo of clip_by_norm. test=develop (#27654) * Fix ErrorClipByValue api and demo code of clip_by_value. test=develop Co-authored-by: tianshuo78520a <707759223@qq.com> --- python/paddle/fluid/layers/nn.py | 13 +++++++++---- python/paddle/nn/__init__.py | 1 - 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1bd279c1e821d..733d8b5d29f1a 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -12415,12 +12415,17 @@ def clip_by_norm(x, max_norm, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - input = fluid.data( - name='data', shape=[None, 1], dtype='float32') - reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0) + import paddle + import numpy as np + + paddle.disable_static() + input = paddle.to_tensor(data=np.array([[0.1, 0.2], [0.3, 0.4]]), dtype="float32") + reward = paddle.nn.clip_by_norm(x=input, max_norm=1.0) """ + if in_dygraph_mode(): + return core.ops.clip_by_norm(x, 'max_norm', max_norm) + helper = LayerHelper("clip_by_norm", **locals()) check_variable_and_dtype(x, 'X', ['float32'], 'clip_by_norm') check_type(max_norm, 'max_norm', (float), 'clip_by_norm') diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index b79b965f5b902..2452f196987b8 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -31,7 +31,6 @@ __all__ += weight_norm_hook.__all__ # TODO: define alias in nn directory -# from .clip import ErrorClipByValue #DEFINE_ALIAS from .clip import GradientClipByGlobalNorm #DEFINE_ALIAS from .clip import GradientClipByNorm #DEFINE_ALIAS from .clip import GradientClipByValue #DEFINE_ALIAS From 7a96d5788d9b15f61b60a33338dcf31ce28d5ac1 Mon Sep 17 00:00:00 2001 From: hong19860320 <9973393+hong19860320@users.noreply.github.com> Date: Wed, 30 Sep 2020 22:28:57 +0800 Subject: [PATCH 12/91] Optimize the error messages of the CUDA implementation of activation ops (#27741) test=develop --- paddle/fluid/operators/activation_cudnn_op.cu.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 1903b9e30d800..26ad09cc265f1 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -41,7 +41,7 @@ struct CudnnActivationFunctor { TensorDescriptor x_desc, out_desc; x_desc.set(x); out_desc.set(GET_DATA_SAFELY(out, "Output", "Out", "CudnnActivation")); - PADDLE_ENFORCE(platform::dynload::cudnnActivationForward( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationForward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), platform::CudnnDataType::kZero(), out_desc.desc(), @@ -67,7 +67,7 @@ struct CudnnActivationGradFunctor { out_desc.set(out); 
dout_desc.set(dout); dx_desc.set(GET_DATA_SAFELY(dx, "Output", "X@GRAD", "CudnnActivationGrad")); - PADDLE_ENFORCE(platform::dynload::cudnnActivationBackward( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnActivationBackward( ctx_.cudnn_handle(), act_desc.desc(), platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), From 0cd4907ebae1f3b096b0a004785d0f6966a26e62 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Wed, 30 Sep 2020 17:15:39 +0200 Subject: [PATCH 13/91] Add avx512 core instructions check (#27732) * Add avx instructions check * Small fix * Change function name * Change uint to unsigned int --- paddle/fluid/platform/cpu_info.cc | 7 +++++++ paddle/fluid/pybind/pybind.cc | 12 ++++++++++++ .../unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py | 5 +++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index e379832593c78..2df1f291f9f8c 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -164,6 +164,13 @@ bool MayIUse(const cpu_isa_t cpu_isa) { // AVX512F: EBX Bit 16 int avx512f_mask = (1 << 16); return (reg[1] & avx512f_mask) != 0; + } else if (cpu_isa == avx512_core) { + unsigned int avx512f_mask = (1 << 16); + unsigned int avx512dq_mask = (1 << 17); + unsigned int avx512bw_mask = (1 << 30); + unsigned int avx512vl_mask = (1 << 31); + return ((reg[1] & avx512f_mask) && (reg[1] & avx512dq_mask) && + (reg[1] & avx512bw_mask) && (reg[1] & avx512vl_mask)); } } #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0929febc4d46f..b303ddde1366e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -142,6 +142,17 @@ bool IsCompiledWithMKLDNN() { #endif } +bool SupportsBfloat16() { +#ifndef PADDLE_WITH_MKLDNN + return false; +#else + if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) + return true; + else + return false; +#endif +} + bool IsCompiledWithBrpc() { #ifndef PADDLE_WITH_DISTRIBUTE return false; @@ -1661,6 +1672,7 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); + m.def("supports_bfloat16", SupportsBfloat16); m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) { diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py index 0ac33383fb26b..4b7b4b5811a67 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -19,7 +19,7 @@ import struct import paddle.fluid.core as core -from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp @@ -205,4 +205,5 @@ def init_group(self): if __name__ == '__main__': - unittest.main() + if core.supports_bfloat16(): + unittest.main() From 966447e33863f37bcbed0f8655adfca3239cf6a3 Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Thu, 1 Oct 2020 10:14:37 +0200 Subject: [PATCH 14/91] Added support for quantization of fusion_gru (#27518) --- cmake/external/mkldnn.cmake | 2 +- .../framework/ir/graph_pattern_detector.cc | 23 +++++- .../framework/ir/graph_pattern_detector.h | 15 ++++ .../framework/ir/mkldnn/cpu_quantize_pass.cc | 79 ++++++++++++++++-- .../framework/ir/mkldnn/cpu_quantize_pass.h | 19 ++--- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 81 +++++++++++++++++++ .../ir/mkldnn/cpu_quantize_squash_pass.cc | 10 ++- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 4 +- .../quantization/quant2_int8_mkldnn_pass.py | 36 +++++++++ .../fluid/contrib/slim/tests/CMakeLists.txt | 26 +++--- .../mkldnn/test_fusion_gru_int8_mkldnn_op.py | 20 +++-- 11 files changed, 268 insertions(+), 47 deletions(-) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c0adda0da31ae..e3ac8624a809a 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939) +SET(MKLDNN_TAG 361725600224f41b7347a1c6bee9b04d1e6c14d7) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 96952e20c2158..449881a9f8feb 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1882,9 +1882,9 @@ PDNode *patterns::MultipleQuantize::operator()() { PDNode *patterns::QuantizePlacement::operator()( const std::unordered_set &quantize_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set({"concat", "conv2d", "elementwise_add", - "fc", "matmul", "pool2d", "prior_box", - "relu", "reshape2", "transpose2"}); + std::unordered_set( + {"concat", "conv2d", "elementwise_add", "fc", "matmul", "pool2d", + "prior_box", "relu", "reshape2", "transpose2", "fusion_gru"}); if (!quantize_enabled_op_types.empty()) { supported_op_types = quantize_enabled_op_types; } @@ -2280,6 +2280,23 @@ PDNode *patterns::MatmulTransposeReshapePattern::operator()() { return reshape_out; } +PDNode *patterns::FusionGru::operator()() { + auto op = pattern->NewNode(op_repr())->assert_is_op("fusion_gru"); + auto x = pattern->NewNode(x_repr())->AsInput()->assert_is_op_input( + "fusion_gru", "X"); + auto weight_h = pattern->NewNode(weight_h_repr()) + ->AsInput() + ->assert_is_op_input("fusion_gru", "WeightH"); + auto weight_x = pattern->NewNode(weight_x_repr()) + ->AsInput() + ->assert_is_op_input("fusion_gru", "WeightX"); + auto out = pattern->NewNode(out_repr()) + ->AsOutput() + ->assert_is_op_output("fusion_gru", "Hidden"); + op->LinksFrom({x, weight_h, weight_x}).LinksTo({out}); + return out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 7116b8a2a6f35..15f6ea1541d58 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1312,6 +1312,21 @@ struct MatmulTransposeReshapePattern : public PatternBase { PATTERN_DECL_NODE(reshape_out_xshape); }; +// fusion_gru op +// Forward pass for fusion_gru. +// fusion_gru out is a result of the operator. +struct FusionGru : public PatternBase { + FusionGru(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "fusion_gru") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(op); + PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(weight_h); + PATTERN_DECL_NODE(weight_x); + PATTERN_DECL_NODE(out); +}; + } // namespace patterns // Link two ir::Nodes from each other. 
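The cpu_quantize_pass changes that follow attach Scale_data and Shift_data attributes to fusion_gru: a signed fp32 input is converted to unsigned int8 roughly as round(x * Scale_data + Shift_data), where Shift_data is 128 and Scale_data is scale_to_one * S8_MAX. A minimal numpy sketch of that scale/shift scheme, with illustrative tensor values and tolerance that are not part of the patch:

    import numpy as np

    # Illustrative data; the pass takes scale_to_one from the calibration scale
    # collected for the fusion_gru input tensor.
    x_f32 = np.random.rand(6, 4).astype('float32') * 2 - 1   # signed activations
    scale_to_one = 1.0 / np.abs(x_f32).max()

    S8_MAX, shift = 127, 128.0            # signed input -> shifted uint8
    scale_data = scale_to_one * S8_MAX    # value stored in the Scale_data attribute
    x_u8 = np.rint(x_f32 * scale_data + shift).astype(np.uint8)

    # The int8 kernel can recover the fp32 values up to rounding error:
    x_back = (x_u8.astype('float32') - shift) / scale_data
    assert np.allclose(x_back, x_f32, atol=0.5 / scale_data)

The shift of 128 is what lets the same unsigned int8 path carry signed activations; when the input is already unsigned, the pass keeps the shift at 0.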
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 0254b5e757351..58931f3ed3872 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -63,8 +63,9 @@ enum { U8_MAX = 255, S8_MAX = 127 }; void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, double scale_to_one, - bool is_unsigned, - std::string scale_attr_name) const { + bool is_input_unsigned, + std::string scale_attr_name, float shift, + std::string shift_attr_name) const { auto inputs = op->Op()->InputNames(); bool name_found = std::find(inputs.begin(), inputs.end(), input_name) != inputs.end(); @@ -72,7 +73,7 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, platform::errors::InvalidArgument( "Var(%s) isn't the input of the %s operator.", input_name, op->Op()->Type())); - unsigned max = is_unsigned ? U8_MAX : S8_MAX; + unsigned max = is_input_unsigned ? U8_MAX : S8_MAX; float scale = scale_to_one * max; // Create quantize output variable @@ -86,7 +87,8 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, q_desc.SetOutput("Output", std::vector({quantize_out_node->Name()})); q_desc.SetAttr("Scale", scale); - q_desc.SetAttr("is_negative_input", !is_unsigned); + q_desc.SetAttr("Shift", shift); + q_desc.SetAttr("is_negative_input", !is_input_unsigned); q_desc.SetAttr("output_format", Has("data_layout") ? Get("data_layout") : "NHWC"); @@ -103,11 +105,13 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, IR_NODE_LINK_TO(quantize_out_node, op); if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); + if (!shift_attr_name.empty()) op->Op()->SetAttr(shift_attr_name, shift); } void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, - bool are_unsigned, - std::string scale_attr_name) const { + bool are_inputs_unsigned, + std::string scale_attr_name, float shift, + std::string shift_attr_name) const { auto inputs = op->inputs; auto output = op->outputs[0]; PADDLE_ENFORCE_GE(inputs.size(), 1, @@ -127,7 +131,7 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, std::vector quantize_out_node_names(inputs.size()); double scale_out = GetScaleValueForNode(output); - unsigned max = are_unsigned ? U8_MAX : S8_MAX; + unsigned max = are_inputs_unsigned ? U8_MAX : S8_MAX; float scale = scale_out * max; for (size_t i = 0; i < inputs.size(); i++) { @@ -137,10 +141,11 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, quantize_out_node_names[i] = quantize_out_nodes[i]->Name(); q_desc.SetAttr("Scale", scale); + q_desc.SetAttr("Shift", shift); q_desc.SetInput("Input", std::vector({inputs[i]->Name()})); q_desc.SetOutput("Output", std::vector({quantize_out_node_names[i]})); - q_desc.SetAttr("is_negative_input", !are_unsigned); + q_desc.SetAttr("is_negative_input", !are_inputs_unsigned); auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
// link quantize op @@ -154,6 +159,7 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, op->Op()->SetInput(input_name, quantize_out_node_names); if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); + if (!shift_attr_name.empty()) op->Op()->SetAttr(shift_attr_name, shift); } void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, @@ -782,6 +788,62 @@ void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { quantize_elementwise_add_count); } +void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { + GraphPatternDetector gpd; + patterns::FusionGru pattern{gpd.mutable_pattern(), name_scope_}; + pattern(); + + int quantize_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize fusion_gru op"; + GET_IR_NODE_FROM_SUBGRAPH(op, op, pattern); + + // skip if should not be quantized + if (!platform::HasOpINT8DataType(op->Op())) { + LogQuantizationDisabled(op); + return; + } + + GET_IR_NODE_FROM_SUBGRAPH(x, x, pattern); + GET_IR_NODE_FROM_SUBGRAPH(weight_h, weight_h, pattern); + GET_IR_NODE_FROM_SUBGRAPH(weight_x, weight_x, pattern); + GET_IR_NODE_FROM_SUBGRAPH(out, out, pattern); + + if (!AreScalesPresentForNodes(op, {x, weight_h, weight_x})) { + LogCannotQuantizeOp(op); + return; + } + + bool is_x_unsigned{false}; + auto input_x_scale = GetScaleValueForNode(x, &is_x_unsigned); + + double input_x_shift{128.}; + if (is_x_unsigned) input_x_shift = 0.; + + QuantizeInput(g, op, x, "X", input_x_scale, is_x_unsigned, "Scale_data", + input_x_shift, "Shift_data"); + + auto weight_scale_tensor = GetScaleTensorForNode(weight_x); + EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data(), + weight_scale_tensor.numel(), 1}; + eigen_tensor *= static_cast(S8_MAX); + std::vector scale_weights{ + weight_scale_tensor.data(), + weight_scale_tensor.data() + weight_scale_tensor.numel()}; + + op->Op()->SetAttr("Scale_weights", scale_weights); + // return fp32 data + op->Op()->SetAttr("force_fp32_output", true); + + ++quantize_count; + }; + gpd(graph, handler); + AddStatis(quantize_count); + + PrettyLogDetail("--- quantized %d fusion_gru ops", quantize_count); +} + void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; PADDLE_ENFORCE_NOT_NULL( @@ -801,6 +863,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeReshape(graph); QuantizeMatmul(graph); QuantizeElementwiseAdd(graph); + QuantizeFusionGru(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index bd87b31b781ec..0d4c424901081 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -49,31 +49,26 @@ class CPUQuantizePass : public FusePassBase { void ApplyImpl(ir::Graph* graph) const override; void QuantizeConv(Graph* graph, bool with_residual_data = false) const; - void QuantizeFc(Graph* graph) const; - void QuantizePool(Graph* graph) const; - void QuantizeConcat(Graph* graph) const; - void QuantizePriorBox(Graph* graph) const; - void QuantizeTranspose(Graph* graph) const; - void QuantizeReshape(Graph* graph) const; - void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeFusionGru(Graph* graph) const; void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, - double scale_to_one, bool is_unsigned, - std::string 
scale_attr_name = "") const; + double scale_to_one, bool is_input_unsigned, + std::string scale_attr_name = "", float shift = 0.0, + std::string shift_attr_name = "") const; // quantize all inputs of given name with the same (minimum) scale void QuantizeInputs(Graph* g, Node* op, std::string input_name, - bool are_unsigned, - std::string scale_attr_name = "") const; + bool are_inputs_unsigned, + std::string scale_attr_name = "", float shift = 0.0, + std::string shift_attr_name = "") const; void DequantizeOutput(Graph* g, Node* op, Node* output, std::string output_name, double scale_to_one, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index a66e9f0e93898..65be404dfef2f 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -91,6 +91,16 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); + } else if (type == "fusion_gru") { + op->SetInput("X", {inputs[0]}); + op->SetInput("Bias", {inputs[1]}); + op->SetInput("WeightX", {inputs[2]}); + op->SetInput("WeightH", {inputs[3]}); + op->SetOutput("Hidden", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + op->SetAttr("Scale_data", 1.0f); + op->SetAttr("Shift_data", 0.0f); + op->SetAttr("Weight_scale", std::vector{1.0f}); } } @@ -389,6 +399,77 @@ TEST(CpuQuantizePass, transpose) { quant_count, dequant_count, added_nodes_count, 2.0f * 127); } +static const std::initializer_list variable_names_fusion_gru = { + "x", "wx", "wh", "b", "h"}; + +// x->Fusion_gru->h +ProgramDesc BuildProgramDescFusionGru() { + ProgramDesc prog; + for (auto& v : variable_names_transpose) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("wx") == 0 || v.find("wh") || v.find("b")) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "fusion_gru", "Fusion_gru", {"x", "wx", "wh", "b"}, {"h"}, true, + "int8"); + + return prog; +} + +void MainTestFusionGru(const ProgramDesc& prog, int gru_count, int quant_count, + int dequant_count, int added_nodes_count, float scale, + float shift) { + std::unique_ptr graph(new ir::Graph(prog)); + int original_nodes_num, current_nodes_num; + PreparePass(&graph, prog, variable_names_fusion_gru, &original_nodes_num, + ¤t_nodes_num); + + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int gru_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "fusion_gru") { + gru_nodes_count++; + + auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name")); + EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_data")), scale) + << "Scale_data for node '" + op_name + "'."; + EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Shift_data")), shift) + << "Shift_data for node '" + op_name + "'."; + EXPECT_EQ(BOOST_GET_CONST(std::vector, + op->GetAttr("Scale_weights"))[0], + scale) + << "Scale_weights for node '" + op_name + "'."; + EXPECT_EQ(BOOST_GET_CONST(bool, op->GetAttr("force_fp32_output")), true) + << "force_fp32_output for node '" + op_name + "'."; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + EXPECT_EQ(gru_nodes_count, gru_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + 
EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, fusion_gru) { + // x->Fusion_gru->h + int gru_count = 1; + int quant_count = 1; + int dequant_count = 0; + // 1 Quant + 1 IN + 0 DeQuant + 0 OUT + int added_nodes_count = 1 + 1 + 0 + 0; + MainTestFusionGru(BuildProgramDescFusionGru(), gru_count, quant_count, + dequant_count, added_nodes_count, 2. * 127, 128.); +} + static const std::initializer_list variable_names_reshape = { "a", "w1", "b", "c", "d", "e", "f"}; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 54ab244a99bd4..d6146f264ab8d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -76,6 +76,8 @@ void CPUQuantizeSquashPass::DequantQuantSquash( BOOST_GET_CONST(float, dequant_op->Op()->GetAttr("Scale")); float quant_scale = BOOST_GET_CONST(float, quant_op->Op()->GetAttr("Scale")); + float dequant_shift = dequant_op->Op()->GetAttrIfExists("Shift"); + float quant_shift = quant_op->Op()->GetAttrIfExists("Shift"); PADDLE_ENFORCE_NE( nodes_keep_counter->find(dequant_out), nodes_keep_counter->end(), platform::errors::NotFound("The dequant output node is not found.")); @@ -83,7 +85,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash( // check if dequantize op should be kept or removed, decrease the counter bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; - if (dequant_scale == quant_scale) { + if (dequant_scale == quant_scale && dequant_shift == quant_shift) { // squash dequantize-quantize to nothing auto quant_out_var_name = quant_out->Name(); auto next_op_inputs = next_op_desc->InputNames(); @@ -110,7 +112,9 @@ void CPUQuantizeSquashPass::DequantQuantSquash( desc.SetInput("Input", std::vector({dequant_in->Name()})); desc.SetOutput("Output", std::vector({quant_out->Name()})); desc.SetAttr("Scale_in", dequant_scale); + desc.SetAttr("Shift_in", dequant_shift); desc.SetAttr("Scale_out", quant_scale); + desc.SetAttr("Shift_out", quant_shift); auto requant_op = g->CreateOpNode(&desc); @@ -293,6 +297,7 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { })); auto* first_quant_out = first_quant_op->outputs[0]; float scale = first_quant_op->Op()->GetAttrIfExists("Scale"); + float shift = first_quant_op->Op()->GetAttrIfExists("Shift"); PADDLE_ENFORCE_NE(scale, 0, platform::errors::InvalidArgument( @@ -302,7 +307,8 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { auto quant_op = prev_out->outputs[iter]; if (quant_op->IsOp() && quant_op->Op()->Type() == "quantize" && quant_op->id() != first_quant_op->id() && - quant_op->Op()->GetAttrIfExists("Scale") == scale) { + quant_op->Op()->GetAttrIfExists("Scale") == scale && + quant_op->Op()->GetAttrIfExists("Shift") == shift) { auto quant_out = quant_op->outputs[0]; auto last_op = quant_out->outputs[0]; diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 5fad1b116de64..58ecc6731f00b 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -95,7 +95,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Create memory descriptors auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + MKLDNNMemoryFormat::ntc); auto weight_x_md = MKLDNNMemDesc({L, D, 
IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); auto weight_h_md = @@ -103,7 +103,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldgo); auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + MKLDNNMemoryFormat::ntc); auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldnc); diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index dadc756c43ecc..45df381b63183 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -66,6 +66,7 @@ def __init__(self, self._fc_ops = ['fc'] self._relu_ops = ['relu', 'relu6'] self._matmul_ops = ['matmul'] + self._gru_ops = ['fusion_gru'] self._weight_scales = {} # Collect the Input and Output sclaes from Fake quant models self._var_quant_scales = {} @@ -449,8 +450,43 @@ def _compute_var_scales(ops, w_name, axis): self._var_quant_scales[weight_var_name] = (use_unsigned_int, lod_tensor) + def _compute_gru_weight_scales(wx_name, wh_name): + for op in graph.all_op_nodes(): + if op.op().type() in self._gru_ops: + wx_var_name = op.input(wx_name)[0] + wh_var_name = op.input(wh_name)[0] + wx = np.array(self._load_param(self._scope, wx_var_name)) + wh = np.array(self._load_param(self._scope, wh_var_name)) + OC = wh.shape[0] + scale_ur = 1.0 / np.max(np.abs( + np.concatenate( + [ + wx[:, :2 * OC], wh.flatten()[:2 * OC * OC] + .reshape(OC, 2 * OC) + ], + axis=0)), + axis=0) + scale_o = 1.0 / np.max(np.abs( + np.concatenate( + [ + wx[:, 2 * OC:], wh.flatten()[2 * OC * OC:] + .reshape(OC, OC) + ], + axis=0)), + axis=0) + + gru_weights_scale = np.concatenate( + [scale_ur, scale_o]).astype('float') + + lod_tensor = self._convert_scale2tensor(gru_weights_scale) + use_unsigned_int = False + self._var_quant_scales[wx_var_name] = (use_unsigned_int, + lod_tensor) + _compute_var_scales(self._conv_ops, "Filter", axis=1) _compute_var_scales(self._fc_ops, "W", axis=0) + _compute_var_scales(self._gru_ops, "WeightH", axis=0) + _compute_gru_weight_scales("WeightX", "WeightH") return graph def _find_avg_pooling_ids(self, graph): diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index dd4bea06572fb..6c02076eae0de 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -98,18 +98,16 @@ function(download_quant_model install_dir data_file) endif() endfunction() -function(save_quant_ic_model_test target quant_model_dir fp32_model_save_path int8_model_save_path) +function(save_quant_ic_model_test target quant_model_dir int8_model_save_path) py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py ARGS --quant_model_path ${quant_model_dir} - --fp32_model_save_path ${fp32_model_save_path} --int8_model_save_path ${int8_model_save_path} --debug) endfunction() -function(save_quant_nlp_model_test target quant_model_dir fp32_model_save_path int8_model_save_path ops_to_quantize) +function(save_quant_nlp_model_test target quant_model_dir int8_model_save_path ops_to_quantize) py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py ARGS --quant_model_path ${quant_model_dir} - --fp32_model_save_path ${fp32_model_save_path} --int8_model_save_path 
${int8_model_save_path} --ops_to_quantize ${ops_to_quantize}) endfunction() @@ -227,8 +225,6 @@ if(LINUX AND WITH_MKLDNN) set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev") download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE}) - set(QUANT2_NLP_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add") - # Quant2 Ernie set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz") set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2") @@ -236,17 +232,25 @@ if(LINUX AND WITH_MKLDNN) set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz") set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float") download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE}) - inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE}) + set(QUANT2_ERNIE_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add") + inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE}) + + # Quant2 GRU + set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz") + set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2") + download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE}) + set(QUANT2_GRU_OPS_TO_QUANTIZE "fusion_gru") ### Save FP32 model or INT8 model from Quant model set(QUANT2_INT8_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_int8") - set(QUANT2_FP32_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_fp32") - save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_FP32_RESNET50_SAVE_PATH} ${QUANT2_INT8_RESNET50_SAVE_PATH}) + save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_INT8_RESNET50_SAVE_PATH}) set(QUANT2_INT8_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8") - set(QUANT2_FP32_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_fp32") - save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_FP32_ERNIE_SAVE_PATH} ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE}) + save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE}) + + set(QUANT2_INT8_GRU_SAVE_PATH "${QUANT_INSTALL_DIR}/GRU_quant2_int8") + save_quant_nlp_model_test(save_quant2_model_gru ${QUANT2_GRU_MODEL_DIR}/GRU_quant_acc ${QUANT2_INT8_GRU_SAVE_PATH} ${QUANT2_GRU_OPS_TO_QUANTIZE}) # Convert Quant2 model to dot and pdf files set(QUANT2_INT8_ERNIE_DOT_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8_dot_file") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py index ff4531f0e250e..89343c9fae459 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py @@ -45,9 +45,10 @@ def setUp(self): # Input data x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1 - scale_data = 63 - shift_data = 64 - x_u8 = (x_f32 * scale_data + shift_data).astype(np.uint8) + scale_data = 63.0 + shift_data = 64.0 + x_u8 = np.rint(x_f32 * scale_data + shift_data).astype(np.uint8) + # x_u8 = 
(x_f32 * scale_data + shift_data).astype(np.uint8) # WeightX/WeightH data wx = np.random.rand(self.IC, 3 * self.OC).astype('float32') * 2 - 1 @@ -58,22 +59,23 @@ def setUp(self): # WeightX data shape in PP: [IC, 3 * OC] # WeightH data shape in PP: [OC, 2 * OC] + [OC, OC] # Scales shape in oneDNN: [3, OC] - scale_ur = 63 / np.max(np.abs( + s8_max = 127.0 + scale_ur = s8_max / np.max(np.abs( np.concatenate( [ wx[:, :2 * self.OC], wh.flatten()[:2 * self.OC * self.OC] .reshape(self.OC, 2 * self.OC) ], axis=0)), - axis=0) - scale_o = 63 / np.max(np.abs( + axis=0) + scale_o = s8_max / np.max(np.abs( np.concatenate( [ wx[:, 2 * self.OC:], wh.flatten()[2 * self.OC * self.OC:] .reshape(self.OC, self.OC) ], axis=0)), - axis=0) + axis=0) scale_weights = np.concatenate([scale_ur, scale_o]).astype('float') @@ -102,7 +104,9 @@ def setUp(self): self.outputs = {'Hidden': (hidden_f32, self.lod)} else: self.error_margin = 1 - hidden_u8 = (hidden_f32 * scale_data + shift_data).astype(np.uint8) + hidden_u8 = np.rint(hidden_f32 * scale_data + shift_data).astype( + np.uint8) + # hidden_u8 = (hidden_f32 * scale_data + shift_data).astype(np.uint8) self.outputs = {'Hidden': (hidden_u8, self.lod)} self.attrs = { From b9fda2ff096a907e67e77833fe52b72fd7cc4db3 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Thu, 1 Oct 2020 10:23:16 +0200 Subject: [PATCH 15/91] Fix to issue #25537 (#27546) * - condidate fix to issue #25537 test=develop * - UT for transpose NHWC test=develop --- .../fluid/framework/data_layout_transform.cc | 2 +- paddle/fluid/operators/CMakeLists.txt | 1 + .../operators/mkldnn/nhwc_op_tests.cmake | 2 + .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 94 +++++++++++++++++++ paddle/fluid/operators/transpose_op.cc | 13 +++ paddle/fluid/platform/mkldnn_helper.h | 22 +++++ 6 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake create mode 100644 paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 108cd9ac6d1c0..8563b5b6d3695 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -203,7 +203,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, // As MKL-DNN description was in NCHW and paddle is expecting NHWC platform::MatchShapeToLayout(out, in_layout, out_layout); - out->set_layout(out_layout); + out->set_layout(DataLayout::kNCHW); // reset format since the out tensor will be feed to non-MKLDNN OPkernel out->set_format(MKLDNNMemoryFormat::undef); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 53e6f4aa6e41b..5fa8f6bab8cca 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -144,4 +144,5 @@ cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_o if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) +include(mkldnn/nhwc_op_tests.cmake) endif() diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake new file mode 100644 index 0000000000000..232626df02e50 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -0,0 +1,2 @@ +cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op pooling transpose_op scope device_context enforce executor) + diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc 
b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc new file mode 100644 index 0000000000000..e7caeef85f5f9 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +USE_OP(pool2d); +USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); +USE_OP(transpose); +USE_OP_DEVICE_KERNEL(transpose, MKLDNN); + +namespace paddle { +namespace operators { + +struct InputVars { + std::string name; + framework::LoDTensor *tensor; +}; + +TEST(test_pool2d_transpose_nhwc, cpu_place) { + framework::DDim dims({1, 4, 8, 512}); // NHWC shape + framework::DDim expected_dims({1, 7, 512, 3}); // NHWC expected shape + platform::CPUPlace p; + framework::Scope scope; + + InputVars input_name = {"x", + scope.Var("x")->GetMutable()}; + // Initialize input data + std::uniform_real_distribution dist(static_cast(10.0), + static_cast(20.0)); + std::mt19937 engine; + size_t numel = static_cast(framework::product(dims)); + input_name.tensor->Resize(dims); + auto data_ptr = input_name.tensor->mutable_data(p); + for (size_t i = 0; i < numel; ++i) { + data_ptr[i] = dist(engine); + } + + scope.Var("y")->GetMutable(); + auto *z = scope.Var("z")->GetMutable(); + + auto &pool = platform::DeviceContextPool::Instance(); + + // Make pool2d followed by transpose + + auto ksize = std::vector(2, 2); + auto op_pool = framework::OpRegistry::CreateOp( + "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}}, + {{"pooling_type", {std::string("max")}}, + {"ksize", {ksize}}, + {"data_format", {std::string("NHWC")}}, + {"use_mkldnn", {true}}}); + + auto axis = std::vector(4, 0); + axis[1] = 2; + axis[2] = 3; + axis[3] = 1; + auto op_transpose = framework::OpRegistry::CreateOp( + "transpose", {{"X", {"y"}}}, {{"Out", {"z"}}}, + {{"axis", {axis}}, {"use_mkldnn", {true}}}); + + op_pool->Run(scope, p); + op_transpose->Run(scope, p); + pool.Get(p)->Wait(); + + // Verify shape of output + PADDLE_ENFORCE_EQ(z->dims(), expected_dims, + platform::errors::InvalidArgument( + "Computed shape does not match expected shape")); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 946fa6305d737..0e870937ec1a5 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -61,6 +61,19 @@ class TransposeOp : public framework::OperatorWithKernel { } framework::DDim out_dims(x_dims); +#ifdef PADDLE_WITH_MKLDNN + // Here we need to match dims to paddle layout + // as we are producing non-oneDNN result + if 
((x_dims.size() >= 3) && + (paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC)) { + auto dims = framework::vectorize(x_dims); + std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); + x_dims = x_dims.reshape(dims); + VLOG(3) + << "Rotating Shape in Transpose from: kMKLDNN to: kNHWC output_shape"; + } +#endif for (size_t i = 0; i < axis_size; i++) { out_dims[i] = x_dims[axis[i]]; } diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index b012a103ea303..d8dd166f325c8 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include #include @@ -81,12 +83,30 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in, return; } + auto print_dims = [](const std::vector& dims) { + std::ostringstream oss; + + if (!dims.empty()) { + oss << "["; + // Convert all but the last element to avoid a trailing "," + std::copy(dims.begin(), dims.end() - 1, + std::ostream_iterator(oss, ",")); + + // Now add the last element with no delimiter + oss << dims.back() << "]"; + } + + return oss.str(); + }; + switch (from) { case framework::DataLayout::kMKLDNN: if (to == framework::DataLayout::kNHWC) { auto dims = framework::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end()); tensor_in->Resize(framework::make_ddim(dims)); + VLOG(3) << "Rotating Shape from: kMKLDNN to: kNHWC output_shape" + << print_dims(dims); } break; case framework::DataLayout::kNHWC: @@ -94,6 +114,8 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in, auto dims = framework::vectorize(tensor_in->dims()); std::rotate(dims.begin() + 1, dims.end() - 1, dims.end()); tensor_in->Resize(framework::make_ddim(dims)); + VLOG(3) << "Rotating Shape from: kNHWC to: kMKLDNN output_shape" + << print_dims(dims); } break; default: From f399bed8d9bb78b2bf0e82bea47ae5241058b67a Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Thu, 1 Oct 2020 15:04:11 +0200 Subject: [PATCH 16/91] Add an option to set number of warmup iterations (#27739) --- .../fluid/inference/tests/api/tester_helper.h | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 723e989be8de8..252bca2d5522e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -65,6 +65,7 @@ DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); DEFINE_bool(warmup, false, "Use warmup to calculate elapsed_time more accurately. 
" "To reduce CI time, it sets false in default."); +DEFINE_int32(warmup_iters, 1, "Number of batches to process during warmup."); DEFINE_bool(enable_profile, false, "Turn on profiler for fluid"); DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance."); @@ -364,15 +365,28 @@ void PredictionWarmUp(PaddlePredictor *predictor, if (FLAGS_zero_copy) { ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); } - outputs->resize(1); + int iterations = 1; + if (FLAGS_warmup_iters > 1) + iterations = std::min(FLAGS_warmup_iters, static_cast(inputs.size())); + outputs->resize(iterations); Timer warmup_timer; - warmup_timer.tic(); + double elapsed_time = 0; if (!FLAGS_zero_copy) { - predictor->Run(inputs[0], &(*outputs)[0], batch_size); + for (int i = 0; i < iterations; ++i) { + warmup_timer.tic(); + predictor->Run(inputs[i], &(*outputs)[i], batch_size); + elapsed_time += warmup_timer.toc(); + } } else { - predictor->ZeroCopyRun(); + for (int i = 0; i < iterations; ++i) { + warmup_timer.tic(); + predictor->ZeroCopyRun(); + elapsed_time += warmup_timer.toc(); + } } - PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1, data_type); + auto batch_latency = elapsed_time / iterations; + PrintTime(batch_size, 1, num_threads, tid, batch_latency, iterations, + data_type); if (FLAGS_enable_profile) { paddle::platform::ResetProfiler(); } From a90711c7a23a3ce456f6e9f7057e7e3a735d6127 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Thu, 1 Oct 2020 15:35:14 +0200 Subject: [PATCH 17/91] Add avx512 core instructions check 2 (#27750) * Add test skip from cmake * Remove print --- .../tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py index 4b7b4b5811a67..6f0b4f9076ec4 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -35,6 +35,8 @@ def conv2d_residual_naive(out, residual): return out +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") class TestConv2dBf16Op(TestConv2dOp): def setUp(self): self.op_type = "conv2d" @@ -42,9 +44,9 @@ def setUp(self): self.exhaustive_search = False self.use_cuda = False self.use_mkldnn = True + self._cpu_only = True self.weight_type = np.float32 self.input_type = np.float32 - self.use_mkldnn = True self.mkldnn_data_type = "bfloat16" self.force_fp32_output = False self.init_group() @@ -205,5 +207,4 @@ def init_group(self): if __name__ == '__main__': - if core.supports_bfloat16(): - unittest.main() + unittest.main() From 9f3fb95b34ce80b4363713cd674a09d13de8f865 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Fri, 2 Oct 2020 13:31:46 +0800 Subject: [PATCH 18/91] [Dy2stat] Add Simnet Test for V2 APIs (#27460) Add Simnet Test for V2 APIs. We used tool from https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0-beta/guides/migration_cn.html#paddle1-xpaddle2-0beta to do v1.x to v2 transformation. 
This PR pulled changes from #27430, please DO NOT merge before #27430 is merged --- .../simnet_dygraph_model_v2.py | 493 ++++++++++++++++++ .../dygraph_to_static/test_simnet_v2.py | 168 ++++++ 2 files changed, 661 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py new file mode 100644 index 0000000000000..6612450b7cff8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -0,0 +1,493 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import reduce +import paddle + + +class EmbeddingLayer(object): + """ + Embedding Layer class + """ + + def __init__(self, dict_size, emb_dim, name="emb", padding_idx=None): + """ + initialize + """ + self.dict_size = dict_size + self.emb_dim = emb_dim + self.name = name + self.padding_idx = padding_idx + + def ops(self): + """ + operation + """ + # TODO(huihuangzheng): The original code set the is_sparse=True, but it + # causes crush in dy2stat. Set it to True after fixing it. 
+ emb = paddle.fluid.dygraph.Embedding( + size=[self.dict_size, self.emb_dim], + is_sparse=True, + padding_idx=self.padding_idx, + param_attr=paddle.ParamAttr( + name=self.name, initializer=paddle.nn.initializer.Xavier())) + + return emb + + +class FCLayer(object): + """ + Fully Connect Layer class + """ + + def __init__(self, fc_dim, act, name="fc"): + """ + initialize + """ + self.fc_dim = fc_dim + self.act = act + self.name = name + + def ops(self): + """ + operation + """ + fc = FC(size=self.fc_dim, + param_attr=paddle.ParamAttr(name="%s.w" % self.name), + bias_attr=paddle.ParamAttr(name="%s.b" % self.name), + act=self.act) + return fc + + +class ConcatLayer(object): + """ + Connection Layer class + """ + + def __init__(self, axis): + """ + initialize + """ + self.axis = axis + + def ops(self, inputs): + """ + operation + """ + concat = paddle.concat(x=inputs, axis=self.axis) + return concat + + +class ReduceMeanLayer(object): + """ + Reduce Mean Layer class + """ + + def __init__(self): + """ + initialize + """ + pass + + def ops(self, input): + """ + operation + """ + mean = paddle.reduce_mean(input) + return mean + + +class CosSimLayer(object): + """ + Cos Similarly Calculate Layer + """ + + def __init__(self): + """ + initialize + """ + pass + + def ops(self, x, y): + """ + operation + """ + sim = paddle.nn.functional.cosine_similarity(x, y) + return sim + + +class ElementwiseMaxLayer(object): + """ + Elementwise Max Layer class + """ + + def __init__(self): + """ + initialize + """ + pass + + def ops(self, x, y): + """ + operation + """ + max = paddle.maximum(x=x, y=y) + return max + + +class ElementwiseAddLayer(object): + """ + Elementwise Add Layer class + """ + + def __init__(self): + """ + initialize + """ + pass + + def ops(self, x, y): + """ + operation + """ + add = paddle.add(x=x, y=y) + return add + + +class ElementwiseSubLayer(object): + """ + Elementwise Add Layer class + """ + + def __init__(self): + """ + initialize + """ + pass + + def ops(self, x, y): + """ + operation + """ + sub = paddle.elementwise_sub(x, y) + return sub + + +class ConstantLayer(object): + """ + Generate A Constant Layer class + """ + + def __init__(self): + """ + initialize + """ + pass + + def ops(self, input, shape, dtype, value): + """ + operation + """ + shape = list(shape) + input_shape = paddle.shape(input) + shape[0] = input_shape[0] + constant = paddle.fill_constant(shape, dtype, value) + return constant + + +class SoftsignLayer(object): + """ + Softsign Layer class + """ + + def __init__(self): + """ + initialize + """ + pass + + def ops(self, input): + """ + operation + """ + softsign = paddle.nn.functional.softsign(input) + return softsign + + +class FC(paddle.nn.Layer): + """ + This interface is used to construct a callable object of the ``FC`` class. + For more details, refer to code examples. + It creates a fully connected layer in the network. It can take + one or multiple ``Tensor`` as its inputs. It creates a Variable called weights for each input tensor, + which represents a fully connected weight matrix from each input unit to + each output unit. The fully connected layer multiplies each input tensor + with its corresponding weight to produce an output Tensor with shape [N, `size`], + where N is batch size. If multiple input tensors are given, the results of + multiple output tensors with shape [N, `size`] will be summed up. If ``bias_attr`` + is not None, a bias variable will be created and added to the output. 
+ Finally, if ``act`` is not None, it will be applied to the output as well. + When the input is single ``Tensor`` : + .. math:: + Out = Act({XW + b}) + When the input are multiple ``Tensor`` : + .. math:: + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + In the above equation: + * :math:`N`: Number of the input. N equals to len(input) if input is list of ``Tensor`` . + * :math:`X_i`: The i-th input ``Tensor`` . + * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output ``Tensor`` . + See below for an example. + .. code-block:: text + Given: + data_1.data = [[[0.1, 0.2]]] + data_1.shape = (1, 1, 2) # 1 is batch_size + data_2.data = [[[0.1, 0.2, 0.3]]] + data_2.shape = (1, 1, 3) # 1 is batch_size + fc = FC("fc", 2, num_flatten_dims=2) + out = fc(input=[data_1, data_2]) + Then: + out.data = [[[0.182996 -0.474117]]] + out.shape = (1, 1, 2) + Parameters: + + size(int): The number of output units in this layer. + num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than + two dimensions. If this happens, the multi-dimension tensor will first be flattened + into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input + tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1) + dimensions will be flatten to form the first dimension of the final matrix (height of + the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to + form the second dimension of the final matrix (width of the matrix). For example, suppose + `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1 + param_attr (ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable + weights(Parameter) of this layer. Default: None. + bias_attr (ParamAttr or list of ParamAttr, optional): The attribute for the bias + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. + act (str, optional): Activation to be applied to the output of this layer. Default: None. + is_test(bool, optional): A flag indicating whether execution is in test phase. Default: False. + dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32". + Attribute: + **weight** (list of Parameter): the learnable weights of this layer. + **bias** (Parameter or None): the learnable bias of this layer. 
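        A minimal usage sketch (illustrative only, not added by this patch; it assumes
        dygraph mode, a 2-D float32 input, and the ``fc``/``data`` names are examples):
        .. code-block:: python
            import numpy as np
            import paddle
            fc = FC(size=4)
            data = paddle.to_tensor(np.random.rand(2, 3).astype('float32'))
            out = fc(data)  # out.shape is [2, 4]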
+ Returns: + None + + """ + + def __init__(self, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + is_test=False, + dtype="float32"): + super(FC, self).__init__(dtype) + + self._size = size + self._num_flatten_dims = num_flatten_dims + self._dtype = dtype + self._param_attr = param_attr + self._bias_attr = bias_attr + self._act = act + self.__w = list() + + def _build_once(self, input): + i = 0 + for inp, param in self._helper.iter_inputs_and_params(input, + self._param_attr): + input_shape = inp.shape + + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], + 1) + ] + [self._size] + self.__w.append( + self.add_parameter( + '_w%d' % i, + self.create_parameter( + attr=param, + shape=param_shape, + dtype=self._dtype, + is_bias=False))) + i += 1 + + size = list([self._size]) + self._b = self.create_parameter( + attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True) + + # TODO(songyouwei): We should remove _w property + @property + def _w(self, i=0): + return self.__w[i] + + @_w.setter + def _w(self, value, i=0): + assert isinstance(self.__w[i], Variable) + self.__w[i].set_value(value) + + @property + def weight(self): + if len(self.__w) > 1: + return self.__w + else: + return self.__w[0] + + @weight.setter + def weight(self, value): + if len(self.__w) == 1: + self.__w[0] = value + + @property + def bias(self): + return self._b + + @bias.setter + def bias(self, value): + self._b = value + + def forward(self, input): + mul_results = list() + i = 0 + for inp, param in self._helper.iter_inputs_and_params(input, + self._param_attr): + tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": inp, + "Y": self.__w[i]}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + i += 1 + mul_results.append(tmp) + + if len(mul_results) == 1: + pre_bias = mul_results[0] + else: + pre_bias = self._helper.create_variable_for_type_inference( + self._dtype) + self._helper.append_op( + type="sum", + inputs={"X": mul_results}, + outputs={"Out": pre_bias}, + attrs={"use_mkldnn": False}) + + if self._b is not None: + pre_activation = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._b]}, + outputs={'Out': [pre_activation]}, + attrs={'axis': self._num_flatten_dims}) + else: + pre_activation = pre_bias + # Currently, we don't support inplace in dygraph mode + return self._helper.append_activation(pre_activation, act=self._act) + + +class HingeLoss(object): + """ + Hing Loss Calculate class + """ + + def __init__(self, conf_dict): + """ + initialize + """ + self.margin = conf_dict["loss"]["margin"] + + def compute(self, pos, neg): + """ + compute loss + """ + elementwise_max = ElementwiseMaxLayer() + elementwise_add = ElementwiseAddLayer() + elementwise_sub = ElementwiseSubLayer() + constant = ConstantLayer() + reduce_mean = ReduceMeanLayer() + loss = reduce_mean.ops( + elementwise_max.ops( + constant.ops(neg, neg.shape, "float32", 0.0), + elementwise_add.ops( + elementwise_sub.ops(neg, pos), + constant.ops(neg, neg.shape, "float32", self.margin)))) + return loss + + +class BOW(paddle.nn.Layer): + """ + BOW + """ + + def __init__(self, conf_dict): + """ + initialize + """ + super(BOW, self).__init__() + self.dict_size = conf_dict["dict_size"] + self.task_mode = conf_dict["task_mode"] + self.emb_dim = 
conf_dict["net"]["emb_dim"] + self.bow_dim = conf_dict["net"]["bow_dim"] + self.seq_len = conf_dict["seq_len"] + self.emb_layer = EmbeddingLayer(self.dict_size, self.emb_dim, + "emb").ops() + self.bow_layer = paddle.nn.Linear( + in_features=self.bow_dim, out_features=self.bow_dim) + self.bow_layer_po = FCLayer(self.bow_dim, None, "fc").ops() + self.softmax_layer = FCLayer(2, "softmax", "cos_sim").ops() + + @paddle.jit.to_static + def forward(self, left, right): + """ + Forward network + """ + + # embedding layer + left_emb = self.emb_layer(left) + right_emb = self.emb_layer(right) + left_emb = paddle.reshape( + left_emb, shape=[-1, self.seq_len, self.bow_dim]) + right_emb = paddle.reshape( + right_emb, shape=[-1, self.seq_len, self.bow_dim]) + + bow_left = paddle.reduce_sum(left_emb, dim=1) + bow_right = paddle.reduce_sum(right_emb, dim=1) + softsign_layer = SoftsignLayer() + left_soft = softsign_layer.ops(bow_left) + right_soft = softsign_layer.ops(bow_right) + + # matching layer + if self.task_mode == "pairwise": + left_bow = self.bow_layer(left_soft) + right_bow = self.bow_layer(right_soft) + cos_sim_layer = CosSimLayer() + pred = cos_sim_layer.ops(left_bow, right_bow) + return left_bow, pred + else: + concat_layer = ConcatLayer(1) + concat = concat_layer.ops([left_soft, right_soft]) + concat_fc = self.bow_layer_po(concat) + pred = self.softmax_layer(concat_fc) + return left_soft, pred diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py new file mode 100644 index 0000000000000..284087e61ec64 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py @@ -0,0 +1,168 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import numpy as np +import paddle +import random +import unittest + +from simnet_dygraph_model_v2 import BOW, HingeLoss + +SEED = 102 +random.seed(SEED) + + +def create_conf_dict(): + conf_dict = {} + conf_dict["task_mode"] = "pairwise" + conf_dict["net"] = {"emb_dim": 128, "bow_dim": 128, "hidden_dim": 128} + conf_dict["loss"] = {"margin": 0.1} + return conf_dict + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="Total examples' number in batch for training.") + parser.add_argument( + "--seq_len", type=int, default=32, help="The length of each sentence.") + parser.add_argument( + "--epoch", type=int, default=1, help="The number of training epoch.") + parser.add_argument( + "--fake_sample_size", + type=int, + default=128, + help="The number of samples of fake data.") + args = parser.parse_args([]) + return args + + +args = parse_args() + + +def fake_vocabulary(): + vocab = {} + vocab[""] = 0 + for i in range(26): + c = chr(ord('a') + i) + vocab[c] = i + 1 + return vocab + + +vocab = fake_vocabulary() + + +class FakeReaderProcessor(object): + def __init__(self, args, vocab): + self.vocab = vocab + self.seq_len = args.seq_len + self.sample_size = args.fake_sample_size + self.data_samples = [] + for i in range(self.sample_size): + query = [random.randint(0, 26) for i in range(self.seq_len)] + pos_title = query[:] + neg_title = [26 - q for q in query] + self.data_samples.append( + np.array([query, pos_title, neg_title]).astype(np.int64)) + + def get_reader(self, mode, epoch=0): + def reader_with_pairwise(): + if mode == "train": + for i in range(self.sample_size): + yield self.data_samples[i] + + return reader_with_pairwise + + +simnet_process = FakeReaderProcessor(args, vocab) + + +def train(conf_dict, to_static): + """ + train process + """ + program_translator = paddle.jit.ProgramTranslator() + program_translator.enable(to_static) + + # Get device + if paddle.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + else: + place = paddle.CPUPlace() + + paddle.disable_static(place) + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + conf_dict['dict_size'] = len(vocab) + conf_dict['seq_len'] = args.seq_len + + net = BOW(conf_dict) + loss = HingeLoss(conf_dict) + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + parameters=net.parameters()) + + metric = paddle.metric.Auc(name="auc") + + global_step = 0 + losses = [] + + train_loader = paddle.io.DataLoader.from_generator( + capacity=16, return_list=True, iterable=True, use_double_buffer=True) + get_train_examples = simnet_process.get_reader("train", epoch=args.epoch) + train_loader.set_sample_list_generator( + paddle.batch( + get_train_examples, batch_size=args.batch_size), place) + + for left, pos_right, neg_right in train_loader(): + left = paddle.reshape(left, shape=[-1, 1]) + pos_right = paddle.reshape(pos_right, shape=[-1, 1]) + neg_right = paddle.reshape(neg_right, shape=[-1, 1]) + net.train() + global_step += 1 + left_feat, pos_score = net(left, pos_right) + pred = pos_score + _, neg_score = net(left, neg_right) + avg_cost = loss.compute(pos_score, neg_score) + losses.append(np.mean(avg_cost.numpy())) + avg_cost.backward() + optimizer.minimize(avg_cost) + net.clear_gradients() + paddle.enable_static() + return losses + + +class TestSimnet(unittest.TestCase): + def test_dygraph_static_same_loss(self): + if paddle.is_compiled_with_cuda(): + 
paddle.fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + conf_dict = create_conf_dict() + dygraph_loss = train(conf_dict, to_static=False) + static_loss = train(conf_dict, to_static=True) + + self.assertEqual(len(dygraph_loss), len(static_loss)) + for i in range(len(dygraph_loss)): + self.assertAlmostEqual(dygraph_loss[i], static_loss[i]) + + +if __name__ == '__main__': + unittest.main() From 65207b4560f3f7a79ca2d742d8c362823cbac69b Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Sun, 4 Oct 2020 16:49:55 +0800 Subject: [PATCH 19/91] Polish the error message of fc, fused_fc_elementwise_layernorm and fused_embedding_seq_pool. (#27692) * Polish the error message of fc_op. * Polish the error message of fused_fc_elementwise_layer_norm op. * Polish an error message in fused_embedding_seq_pool_op. --- paddle/fluid/operators/fc_op.cc | 88 +++++++----- paddle/fluid/operators/fc_op.h | 14 +- .../fused/fused_embedding_seq_pool_op.h | 4 +- .../fused_fc_elementwise_layernorm_op.cc | 132 +++++++++++++----- 4 files changed, 161 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 847b24f4f0b0b..d791b2bcfd09f 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -23,64 +23,80 @@ class FCOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - "X(Input) of Fully Connected should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - "Out(Output) of Fully Connected should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, - "W(Input) of Fully Connected should not be null."); + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "FC"); + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "FC"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FC"); - auto in_dims = ctx->GetInputDim("Input"); auto w_dims = ctx->GetInputDim("W"); bool padding_weights = ctx->Attrs().Get("padding_weights"); + PADDLE_ENFORCE_EQ( + w_dims.size(), 2, + platform::errors::InvalidArgument( + "The input Weight of fc is expected to be a 2-D tensor. " + "But received the number of Weight's dimensions is %d, " + "Weight's shape is %s.", + w_dims.size(), w_dims)); if (ctx->HasInput("Bias")) { auto bias_dims = ctx->GetInputDim("Bias"); auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; + + PADDLE_ENFORCE_LE( + bias_dims.size(), 2, + platform::errors::InvalidArgument( + "The input Bias of fc is expected to be a 1-D or 2-D tensor. But " + "received the number of Bias's dimensions is %d, " + "Bias's shape is %s.", + bias_dims.size(), bias_dims)); + + PADDLE_ENFORCE_EQ( + bias_dims[bias_dims.size() - 1], w_dims1, + platform::errors::InvalidArgument( + "The last dimension of input Bias is expected be equal " + "to the actual width of input Weight. But received the last " + "dimension of Bias is %d, Bias's shape is %s; " + "the actual width of Weight is %d, Weight's shape is %s.", + bias_dims[bias_dims.size() - 1], bias_dims, w_dims1, w_dims)); + if (bias_dims.size() == 2) { - PADDLE_ENFORCE_EQ(bias_dims[0], 1, - platform::errors::InvalidArgument( - "The shape of Bias is invalid." - "The height of Bias should be 1." - "But received height of Bias is %d.", - bias_dims[0])); - PADDLE_ENFORCE_EQ( - bias_dims[1], w_dims1, - platform::errors::InvalidArgument( - "The shape of Bias is invalid." - "The width of Bias should be equal to width of Weight." 
- "But received width of Bias is %d and width of Weight is %d.", - bias_dims[1], w_dims1)); - } else if (bias_dims.size() == 1) { PADDLE_ENFORCE_EQ( - bias_dims[0], w_dims1, + bias_dims[0], 1, platform::errors::InvalidArgument( - "The shape of Bias is invalid." - "The height of Bias should be equal to the width of weight." - "But received height of Bias is %d and width of Weight is %d.", - bias_dims[0], w_dims1)); + "The first dimension of input Bias is expected to be 1, " + "but received %d, Bias's shape is %s.", + bias_dims[0], bias_dims)); } } + auto in_dims = ctx->GetInputDim("Input"); + int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); + PADDLE_ENFORCE_LT( + in_num_col_dims, in_dims.size(), + platform::errors::InvalidArgument( + "The attribute in_num_col_dims used to flatten Input to " + "a 2-D tensor, is expected to be less than the number of " + "Input's dimensions. But recieved in_num_col_dims is %d, " + "the number of Input's dimensions is %d, Input's shape is %s.", + in_num_col_dims, in_dims.size(), in_dims)); + auto& activation_type = ctx->Attrs().Get("activation_type"); if (!activation_type.empty()) { PADDLE_ENFORCE_EQ(activation_type, "relu", - "Activation %s is not supportetd in fc now.", - activation_type.c_str()); + platform::errors::InvalidArgument( + "The attribute activation_type of fc is expected " + "to be \"relu\", but received %s.", + activation_type.c_str())); } + if (ctx->Attrs().Get("use_mkldnn")) { PADDLE_ENFORCE_EQ( in_dims.size() >= 2 && in_dims.size() <= 4, true, platform::errors::Unimplemented( - "Fully Connected input should be 2D, 3D or 4D tensor.")); + "The Input of fc is expected to be a 2-D, 3-D or 4-D tensor when " + "use_mkldnn is set. But recieved the number of Input's " + "dimensions is %d, Input's shape is %s.", + in_dims.size(), in_dims)); } - PADDLE_ENFORCE_EQ(w_dims.size(), 2, - "Fully Connected weights should be 2-D tensor."); - int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); - PADDLE_ENFORCE_GT( - in_dims.size(), in_num_col_dims, - "The input tensor Input's rank of FCOp should be larger than " - "in_num_col_dims."); std::vector output_dims; FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims, diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 907f61196d61b..6258dd0a3868f 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -32,11 +32,15 @@ inline void FCOutputSize(const framework::DDim& in_dims, auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0]; auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1]; - PADDLE_ENFORCE_EQ(in_mat_dims[1], w_dims0, - platform::errors::InvalidArgument( - "Fully Connected input and weigth size do not match. " - "input width: %d,weight height: %d", - in_mat_dims[1], w_dims0)); + PADDLE_ENFORCE_EQ( + in_mat_dims[1], w_dims0, + platform::errors::InvalidArgument( + "The input's second dimension and weight's first dimension is " + "expected to be the same. 
But recieved input's second dimension is " + "%d, input's shape is %s; weight's first dimension is %d, weight's " + "shape is %s.", + in_mat_dims[1], in_mat_dims, w_dims0, + framework::make_ddim({w_dims0, w_dims1}))); out_dims.reserve(static_cast(in_num_col_dims + 1)); for (int i = 0; i < in_num_col_dims; ++i) { diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index aeaec84ba5c94..8713d58034241 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -204,9 +204,9 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto *table_t = context.Input("W"); table_dim = table_t->value().dims(); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PermissionDenied( "The parameter W of a LookupTable " - "must be either LoDTensor or SelectedRows"); + "must be either LoDTensor or SelectedRows.")); } bool is_sparse = context.Attr("is_sparse"); diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc index ea7d6a93d1b28..08909bcb6fcb9 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc @@ -22,47 +22,73 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - "Input(X) of fused_fc_elementwise_layernorm should not be null."); - PADDLE_ENFORCE_EQ( - ctx->HasInput("W"), true, - "Input(W) of fused_fc_elementwise_layernorm should not be null."); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - "Input(Y) of fused_fc_elementwise_layernorm should not be null."); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - "Output(Out) of fused_fc_elementwise_layernorm should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FusedFcElementwiseLayernorm"); + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", + "FusedFcElementwiseLayernorm"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", + "FusedFcElementwiseLayernorm"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FusedFcElementwiseLayernorm"); auto w_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ(w_dims.size(), 2, - "Fully Connected input should be 2-D tensor."); + PADDLE_ENFORCE_EQ( + w_dims.size(), 2, + platform::errors::InvalidArgument( + "The input Weight of fc is expected to be a 2-D tensor. " + "But received the number of Weight's dimensions is %d, ", + "Weight's shape is %s.", w_dims.size(), w_dims)); if (ctx->HasInput("Bias0")) { auto bias0_dims = ctx->GetInputDim("Bias0"); + + PADDLE_ENFORCE_LE(bias0_dims.size(), 2, + platform::errors::InvalidArgument( + "The input Bias of fc is expected to be an 1-D or " + "2-D tensor. But received the number of Bias's " + "dimensions is %d, Bias's shape is %s.", + bias0_dims.size(), bias0_dims)); + + PADDLE_ENFORCE_EQ( + bias0_dims[bias0_dims.size() - 1], w_dims[1], + platform::errors::InvalidArgument( + "The last dimension of input Bias is expected be equal " + "to the actual width of input Weight. 
But received the last " + "dimension of Bias is %d, Bias's shape is %s; " + "the actual width of Weight is %d, Weight's shape is %s.", + bias0_dims[bias0_dims.size() - 1], bias0_dims, w_dims[1], + w_dims)); + if (bias0_dims.size() == 2) { - PADDLE_ENFORCE_EQ(bias0_dims[0], 1, - "The shape of Bias must be [1, dim]."); - PADDLE_ENFORCE_EQ(bias0_dims[1], w_dims[1], - "The shape of Bias must be [1, dim]."); - } else if (bias0_dims.size() == 1) { - PADDLE_ENFORCE_EQ(bias0_dims[0], w_dims[1], - "The shape of Bias must be [1, dim]."); + PADDLE_ENFORCE_EQ( + bias0_dims[0], 1, + platform::errors::InvalidArgument( + "The first dimension of input Bias is expected to be 1, " + "but received %d, Bias's shape is %s.", + bias0_dims[0], bias0_dims)); } } auto x_dims = ctx->GetInputDim("X"); int x_num_col_dims = ctx->Attrs().Get("x_num_col_dims"); - PADDLE_ENFORCE_GT( - x_dims.size(), x_num_col_dims, - "The input tensor Input's rank of FCOp should be larger than " - "in_num_col_dims."); + PADDLE_ENFORCE_LT( + x_num_col_dims, x_dims.size(), + platform::errors::InvalidArgument( + "The attribute x_num_col_dims used to flatten input X to " + "a 2-D tensor, is expected to be less than the number of " + "input X's dimensions. But recieved x_num_col_dims is %d, " + "the number of input X's dimensions is %d, input X's shape is %s.", + x_num_col_dims, x_dims.size(), x_dims)); auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims); PADDLE_ENFORCE_EQ( x_mat_dims[1], w_dims[0], - "Fully Connected input and weigth size do not match. %s, %s"); + platform::errors::InvalidArgument( + "The input's second dimension and weight's first dimension is " + "expected to be the same. But recieved input's second dimension is " + "%d, input's shape is %s; weight's first dimension is %d, weight's " + "shape is %s.", + x_mat_dims[1], x_mat_dims, w_dims[0], w_dims)); std::vector fc_out_dims; for (int i = 0; i < x_num_col_dims; ++i) { @@ -71,29 +97,67 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { fc_out_dims.push_back(w_dims[1]); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(framework::make_ddim(fc_out_dims), y_dims); + PADDLE_ENFORCE_EQ(framework::make_ddim(fc_out_dims), y_dims, + platform::errors::InvalidArgument( + "The output's shape of fc is expected to be equal to " + "that of input Y. But recieved output's shape of fc " + "is %s, input Y's shape is %s.", + framework::make_ddim(fc_out_dims), y_dims)); auto begin_norm_axis = ctx->Attrs().Get("begin_norm_axis"); PADDLE_ENFORCE_LT( begin_norm_axis, y_dims.size(), - "'begin_norm_axis' must be less than the rank of Input(Y)."); + platform::errors::InvalidArgument( + "The attribute begin_norm_axis used to flatten input Y to a 2-D " + "tensor, is expected to be less than the number of input Y's " + "dimensions. But recieved begin_norm_axis is %d, the number of " + "input Y's dimensions is %d, input Y's shape is %s.", + begin_norm_axis, y_dims.size(), y_dims)); auto y_mat_dim = framework::flatten_to_2d(y_dims, begin_norm_axis); int64_t dim_0 = y_mat_dim[0]; int64_t dim_1 = y_mat_dim[1]; if (ctx->HasInput("Scale")) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1); + auto scale_dims = ctx->GetInputDim("Scale"); + PADDLE_ENFORCE_EQ(scale_dims.size(), 1, + platform::errors::InvalidArgument( + "The input Scale is expected to be an 1-D tensor. 
" + "But recieved the number of input Scale's " + "dimensions is %d, input Scale's shape is %s.", + scale_dims.size(), scale_dims)); if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], dim_1, - "scale should with right"); + PADDLE_ENFORCE_EQ( + scale_dims[0], dim_1, + platform::errors::InvalidArgument( + "The first dimension of input Scale is expected to be equal to " + "the second dimension of input Y after flattened. " + "But recieved the first dimension of input Scale is %d, input " + "Scale's shape is %s; the second dimension of flattened input " + "Y is %d, input Y's shape is %s, flattened axis is %d.", + scale_dims[0], scale_dims, dim_1, y_dims, begin_norm_axis)); } } if (ctx->HasInput("Bias1")) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1").size(), 1); + auto bias1_dims = ctx->GetInputDim("Bias1"); + PADDLE_ENFORCE_EQ( + bias1_dims.size(), 1, + platform::errors::InvalidArgument( + "The input Bias1 is expected to be an 1-D tensor. " + "But recieved the number of input Bias1's dimension is %d, " + "input Bias1's shape is %s.", + bias1_dims.size(), bias1_dims)); + if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1")[0], dim_1, - "bias should with right"); + PADDLE_ENFORCE_EQ( + bias1_dims[0], dim_1, + platform::errors::InvalidArgument( + "The first dimension of input Bias1 is expected to be equal to " + "the second dimension of input Y after flattened. " + "But recieved the first dimension of input Bias1 is %d, input " + "Bias1's shape is %s; the second dimension of flatten input " + "Y is %d, input Y's shape is %s, flattened axis is %d.", + bias1_dims[0], bias1_dims, dim_1, y_dims, begin_norm_axis)); } } From de60c54938d9b946fa7a9bad835011624603643b Mon Sep 17 00:00:00 2001 From: huangjun12 <2399845970@qq.com> Date: Thu, 8 Oct 2020 11:03:51 +0800 Subject: [PATCH 20/91] fix doc and example code of huber_loss and npair_loss (#27678) * fix huber_loss ans npair_loss doc and example code, test=document_fix * remove disable_static in example code, test=document_fix * remove huber_loss and refine npair_loss example code, test=document_fix * remove huber_loss in functional/__init__.py, test=document_fix --- python/paddle/fluid/layers/loss.py | 31 +++++++++++-------------- python/paddle/nn/functional/__init__.py | 1 - python/paddle/nn/functional/loss.py | 2 -- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 3610efdd505bd..2b1449a94e6e5 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -1681,11 +1681,6 @@ def kldiv_loss(x, target, reduction='mean', name=None): def npair_loss(anchor, positive, labels, l2_reg=0.002): ''' - :alias_main: paddle.nn.functional.npair_loss - :alias: paddle.nn.functional.npair_loss,paddle.nn.functional.loss.npair_loss - :old_api: paddle.fluid.layers.npair_loss - - **Npair Loss Layer** Read `Improved Deep Metric Learning with Multi class N pair Loss Objective\ Date: Thu, 8 Oct 2020 16:02:20 +0800 Subject: [PATCH 21/91] add huber_loss to fix ci, test=develop (#27766) --- python/paddle/nn/functional/loss.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 4299b17ebd234..76722f26007c4 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -41,6 +41,7 @@ from ...fluid.layers import edit_distance #DEFINE_ALIAS from ...fluid.layers import sampled_softmax_with_cross_entropy #DEFINE_ALIAS 
+from ...fluid.layers import huber_loss from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode from ...fluid.framework import _varbase_creator From 7698e199286a112bcbc66c57a2b457a088d1140e Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 8 Oct 2020 16:20:28 +0800 Subject: [PATCH 22/91] modify doc for paddle.nn.Layer (#27624) * modify doc for Layer, test=develop * modify doc for Layer2, test=develop * set dtype default value to float32, test=develop * add example code for paddle.nn.Layer, test=develop * set create_parameter and create_variable dtype default value to None, test=develop * modify some example code, tet=develop * refine, test=develop * del no ues code, test=develop * modify doc, example code, args, test=develop * modify doc, test=develop --- python/paddle/fluid/dygraph/layers.py | 513 +++++++++++++++++--------- 1 file changed, 343 insertions(+), 170 deletions(-) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 88e24e7e1ea99..3ae6d384be7e3 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -62,10 +62,6 @@ def remove(self): class Layer(core.Layer): """ - :alias_main: paddle.nn.Layer - :alias: paddle.nn.Layer - :old_api: paddle.fluid.dygraph.layers.Layer - Dynamic graph Layer based on OOD, includes the parameters of the layer, the structure of the forward graph and so on. Parameters: @@ -74,16 +70,16 @@ class Layer(core.Layer): can be "my_layer_0.w_n", where "w" is the parameter base name and "n" is an unique suffix auto-generated. If None, prefix name will be snake cased class name. Default: None. - dtype(str or core.VarDesc.VarType, optional): data type of this parameter. + dtype(str, optional): data type of this parameter. If set str, it can be "bool", "float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8" or "uint16". - Default: ``core.VarDesc.VarType.FP32`` + Default: "float32" Returns: None """ - def __init__(self, name_scope=None, dtype=core.VarDesc.VarType.FP32): + def __init__(self, name_scope=None, dtype="float32"): self.training = True if name_scope is None: name_scope = _convert_camel_to_snake(self.__class__.__name__) @@ -110,6 +106,30 @@ def train(self): Returns: None + + Example:: + .. code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + self._dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + temp = self._linear(input) + temp = self._dropout(temp) + return temp + + x = paddle.randn([10, 1], 'float32') + mylayer = MyLayer() + mylayer.eval() # set mylayer._dropout to eval mode + out = mylayer(x) + mylayer.train() # set mylayer._dropout to train mode + out = mylayer(x) + """ # global setting framework._dygraph_tracer().train_mode() @@ -125,6 +145,29 @@ def eval(self): Returns: None + + Example:: + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + self._dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + temp = self._linear(input) + temp = self._dropout(temp) + return temp + + x = paddle.randn([10, 1], 'float32') + mylayer = MyLayer() + mylayer.eval() # set mylayer._dropout to eval mode + out = mylayer(x) + print(out) + """ # global setting framework._dygraph_tracer().eval_mode() @@ -149,15 +192,13 @@ def apply(self, fn): import paddle import paddle.nn as nn - - paddle.disable_static() - + net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) def init_weights(layer): if type(layer) == nn.Linear: print('before init weight:', layer.weight.numpy()) - new_weight = paddle.fill_constant(layer.weight.shape, layer.weight.dtype, value=0.9) + new_weight = paddle.full(shape=layer.weight.shape, dtype=layer.weight.dtype, fill_value=0.9) layer.weight.set_value(new_weight) print('after init weight:', layer.weight.numpy()) @@ -177,6 +218,23 @@ def full_name(self): Returns: str: full name of this layer. + + Example:: + .. code-block:: python + + import paddle + + class LinearNet(paddle.nn.Layer): + def __init__(self): + super(LinearNet, self).__init__(name_scope = "demo_linear_net") + self._linear = paddle.nn.Linear(1, 1) + + def forward(self, x): + return self._linear(x) + + linear_net = LinearNet() + print(linear_net.full_name()) # demo_linear_net_0 + """ return self._full_name @@ -197,34 +255,33 @@ def register_forward_post_hook(self, hook): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle + import numpy as np + + # the forward_post_hook change the output of the layer: output = output * 2 + def forward_post_hook(layer, input, output): + # user can use layer, input and output for information statistis tasks - # the forward_post_hook change the output of the layer: output = output * 2 - def forward_post_hook(layer, input, output): - # user can use layer, input and output for information statistis tasks + # change the output + return output * 2 - # change the output - return output * 2 + linear = paddle.nn.Linear(13, 5) - with fluid.dygraph.guard(): - linear = fluid.Linear(13, 5, dtype="float32") + # register the hook + forward_post_hook_handle = linear.register_forward_post_hook(forward_post_hook) - # register the hook - forward_post_hook_handle = linear.register_forward_post_hook(forward_post_hook) - - value1 = np.arange(26).reshape(2, 13).astype("float32") - in1 = fluid.dygraph.to_variable(value1) - - out0 = linear(in1) - - # remove the hook - forward_post_hook_handle.remove() + value1 = np.arange(26).reshape(2, 13).astype("float32") + in1 = paddle.to_tensor(value1) - out1 = linear(in1) + out0 = linear(in1) - # hook change the linear's output to output * 2, so out0 is equal to out1 * 2. - assert (out0.numpy() == (out1.numpy()) * 2).any() + # remove the hook + forward_post_hook_handle.remove() + + out1 = linear(in1) + + # hook change the linear's output to output * 2, so out0 is equal to out1 * 2. + assert (out0.numpy() == (out1.numpy()) * 2).any() """ hook_remove_helper = HookRemoveHelper(self._forward_post_hooks) self._forward_post_hooks[hook_remove_helper._hook_id] = hook @@ -249,36 +306,35 @@ def register_forward_pre_hook(self, hook): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np + import paddle + import numpy as np - # the forward_post_hook change the input of the layer: input = input * 2 - def forward_pre_hook(layer, input): - # user can use layer and input for information statistis tasks + # the forward_post_hook change the input of the layer: input = input * 2 + def forward_pre_hook(layer, input): + # user can use layer and input for information statistis tasks - # change the input - input_return = (input[0] * 2) - return input_return + # change the input + input_return = (input[0] * 2) + return input_return - with fluid.dygraph.guard(): - linear = fluid.Linear(13, 5, dtype="float32") + linear = paddle.nn.Linear(13, 5) - # register the hook - forward_pre_hook_handle = linear.register_forward_pre_hook(forward_pre_hook) + # register the hook + forward_pre_hook_handle = linear.register_forward_pre_hook(forward_pre_hook) - value0 = np.arange(26).reshape(2, 13).astype("float32") - in0 = fluid.dygraph.to_variable(value0) - out0 = linear(in0) + value0 = np.arange(26).reshape(2, 13).astype("float32") + in0 = paddle.to_tensor(value0) + out0 = linear(in0) - # remove the hook - forward_pre_hook_handle.remove() + # remove the hook + forward_pre_hook_handle.remove() - value1 = value0 * 2 - in1 = fluid.dygraph.to_variable(value1) - out1 = linear(in1) + value1 = value0 * 2 + in1 = paddle.to_tensor(value1) + out1 = linear(in1) - # hook change the linear's input to input * 2, so out0 is equal to out1. - assert (out0.numpy() == out1.numpy()).any() + # hook change the linear's input to input * 2, so out0 is equal to out1. + assert (out0.numpy() == out1.numpy()).any() """ hook_remove_helper = HookRemoveHelper(self._forward_pre_hooks) self._forward_pre_hooks[hook_remove_helper._hook_id] = hook @@ -294,17 +350,37 @@ def create_parameter(self, Parameters: shape(list): Shape of the parameter. - attr(ParamAttr, optional): Parameter attribute of weight. Please refer to :ref:`api_fluid_ParamAttr`. Default: None. - dtype(str or core.VarDesc.VarType or str, optional): Data type of this parameter. + attr(ParamAttr, optional): Parameter attribute of weight. Please refer to :ref:`api_paddle_ParamAttr`. Default: None. + dtype(str, optional): Data type of this parameter. If set str, it can be "bool", "float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8" or "uint16". Default: "float32". is_bias(bool, optional): if this is a bias parameter. Default: False. default_initializer(Initializer, optional): the default initializer for this parameter. - If set None, default initializer will be set to :ref:`api_fluid_initializer_XavierInitializer` and :ref:`api_fluid_initializer_ConstantInitializer` + If set None, default initializer will be set to paddle.nn.initializer.Xavier and paddle.nn.initializer.Constant for non-bias and bias parameter, respectively. Default: None. Returns: - :ref:`api_guide_Variable_en` : created parameter. + :Tensor, created parameter. + + Examples: + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + w_tmp = self.create_parameter([1,1]) + self.add_parameter("w_tmp", w_tmp) + + def forward(self, input): + return self._linear(input) + + mylayer = MyLayer() + for name, param in mylayer.named_parameters(): + print(name, param) # will print w_tmp,_linear.weight,_linear.bias + """ temp_attr = copy.deepcopy(attr) if isinstance(temp_attr, six.string_types) and temp_attr == "": @@ -313,24 +389,40 @@ def create_parameter(self, default_initializer) # TODO: Add more parameter list when we need them - def create_variable(self, - name=None, - persistable=None, - dtype=None, - type=core.VarDesc.VarType.LOD_TENSOR): + def create_variable(self, name=None, persistable=None, dtype=None): """Create Variable for this layer. Parameters: name(str, optional): name of the variable. Please refer to :ref:`api_guide_Name` . Default: None persistable(bool, optional): if set this variable persistable. Default: False - dtype(str or core.VarDesc.VarType, optional): data type of this parameter. + dtype(str, optional): data type of this parameter. If set str, it can be "bool", "float16", "float32", "float64", "int8", "int16", "int32", "int64", "uint8" or "uint16". - If set None, it will be ``core.VarDesc.VarType.FP32``. Default: None - type(core.VarDesc.VarType, optional): type of the variable. No need to set this parameter. Default: ``core.VarDesc.VarType.LOD_TENSOR`` + If set None, it will be "float32". Default: None Returns: - :ref:`api_guide_Variable_en` : created Variable. + Tensor, created Variable. + + Examples: + .. code-block:: python + + import paddle + + class MyLinear(paddle.nn.Layer): + def __init__(self, + in_features, + out_features): + super(MyLinear, self).__init__() + self.linear = paddle.nn.Linear( 10, 10) + + self.back_var = self.create_variable(name = "linear_tmp_0", dtype=self._dtype) + + def forward(self, input): + out = self.linear(input) + paddle.assign( out, self.back_var) + + return out + """ if name is not None: var_name = ".".join([self._full_name, name]) @@ -339,7 +431,10 @@ def create_variable(self, [self._full_name, "_generated_var"])) return self._helper.main_program.current_block().create_var( - name=var_name, persistable=persistable, dtype=dtype, type=type) + name=var_name, + persistable=persistable, + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR) def parameters(self, include_sublayers=True): """Returns a list of all Parameters from current layer and its sub-layers. @@ -348,7 +443,16 @@ def parameters(self, include_sublayers=True): include_sublayers(bool, optional): Whether include the parameters of sublayers. If True, also include the parameters from sublayers. Default: True Returns: - list of :ref:`api_guide_Variable_en` : a list of Parameters. + list of Tensor : a list of Parameters. + + Examples: + .. code-block:: python + + import paddle + + linear = paddle.nn.Linear(1,1) + print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 + """ ret = [ param @@ -366,16 +470,15 @@ def children(self): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - fc2 = fluid.Linear(3, 10, bias_attr=False) - model = fluid.dygraph.Sequential(fc1, fc2) - - layer_list = list(model.children()) + linear1 = paddle.nn.Linear(10, 3) + linear2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = paddle.nn.Sequential(linear1, linear2) + + layer_list = list(model.children()) - print(layer_list) + print(layer_list) # [, ] """ for _, layer in self.named_children(): @@ -391,14 +494,15 @@ def named_children(self): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - fc2 = fluid.Linear(3, 10, bias_attr=False) - model = fluid.dygraph.Sequential(fc1, fc2) - for prefix, layer in model.named_children(): - print(prefix, layer) + linear1 = paddle.nn.Linear(10, 3) + linear2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = paddle.nn.Sequential(linear1, linear2) + for prefix, layer in model.named_children(): + print(prefix, layer) + # ('0', ) + # ('1', ) """ memo = set() @@ -415,6 +519,26 @@ def sublayers(self, include_sublayers=True): Returns: list of Layer : a list of sub layers. + + Examples: + .. code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + self._dropout = paddle.nn.Dropout(p=0.5) + + def forward(self, input): + temp = self._linear(input) + temp = self._dropout(temp) + return temp + + mylayer = MyLayer() + print(mylayer.sublayers()) # [, ] + """ ret = [ layer @@ -438,14 +562,13 @@ def named_parameters(self, prefix='', include_sublayers=True): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - fc2 = fluid.Linear(3, 10, bias_attr=False) - model = fluid.dygraph.Sequential(fc1, fc2) - for name, param in model.named_parameters(): - print(name, param) + fc1 = paddle.nn.Linear(10, 3) + fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = paddle.nn.Sequential(fc1, fc2) + for name, param in model.named_parameters(): + print(name, param) """ params_set = set() @@ -483,14 +606,13 @@ def named_sublayers(self, Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - fc2 = fluid.Linear(3, 10, bias_attr=False) - model = fluid.dygraph.Sequential(fc1, fc2) - for prefix, layer in model.named_sublayers(): - print(prefix, layer) + fc1 = paddle.nn.Linear(10, 3) + fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = paddle.nn.Sequential(fc1, fc2) + for prefix, layer in model.named_sublayers(): + print(prefix, layer) """ if layers_set is None: @@ -510,11 +632,11 @@ def named_sublayers(self, layers_set=layers_set): yield p, l - def register_buffer(self, name, variable, persistable=True): + def register_buffer(self, name, tensor, persistable=True): """ - Registers a variable as buffer into the layer. + Registers a tensor as buffer into the layer. - `buffer` is a non-parameteric variable and will not be updated by optimizer, + `buffer` is a non-trainable tensor and will not be updated by optimizer, but is necessary for evaluation and inference. For example, the mean and variance in BatchNorm layers. The registered buffer is persistable by default, and will be saved into `state_dict` alongside parameters. 
If set persistable=False, it registers @@ -525,7 +647,7 @@ def register_buffer(self, name, variable, persistable=True): Parameters: name (string): name of the buffer. The buffer can be accessed from this layer using the given name - variable (Variable): the variable to be registered as buffer. + tensor (Tensor): the tensor to be registered as buffer. persistable (bool): whether the buffer is part of this layer's state_dict. @@ -536,16 +658,15 @@ def register_buffer(self, name, variable, persistable=True): .. code-block:: python import numpy as np - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - linear = fluid.Linear(10, 3) - value = np.array([0]).astype("float32") - buffer = fluid.dygraph.to_variable(value) - linear.register_buffer("buf_name", buffer, persistable=True) - - # get the buffer by attribute. - print(linear.buf_name) + linear = paddle.nn.Linear(10, 3) + value = np.array([0]).astype("float32") + buffer = paddle.to_tensor(value) + linear.register_buffer("buf_name", buffer, persistable=True) + + # get the buffer by attribute. + print(linear.buf_name) """ @@ -565,12 +686,12 @@ def register_buffer(self, name, variable, persistable=True): raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif variable is not None and not type(variable) == core.VarBase: + elif tensor is not None and not type(tensor) == core.VarBase: raise TypeError( "The registered buffer should be a core.VarBase, but received {}.". - format(type(variable).__name__)) + format(type(tensor).__name__)) else: - self._buffers[name] = variable + self._buffers[name] = tensor if persistable: self._non_persistable_buffer_names_set.discard(name) else: @@ -584,7 +705,21 @@ def buffers(self, include_sublayers=True): include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True Returns: - list of :ref:`api_guide_Variable_en` : a list of buffers. + list of Tensor : a list of buffers. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + linear = paddle.nn.Linear(10, 3) + value = np.array([0]).astype("float32") + buffer = paddle.to_tensor(value) + linear.register_buffer("buf_name", buffer, persistable=True) + + print(linear.buffers()) # == print([linear.buf_name]) + """ ret = [ buffer @@ -595,7 +730,7 @@ def buffers(self, include_sublayers=True): def named_buffers(self, prefix='', include_sublayers=True): """ - Returns an iterator over all buffers in the Layer, yielding tuple of name and Variable. + Returns an iterator over all buffers in the Layer, yielding tuple of name and Tensor. Parameters: prefix(str, optional): Prefix to prepend to all buffer names. Default: ''. @@ -603,31 +738,30 @@ def named_buffers(self, prefix='', include_sublayers=True): If True, also include the named buffers from sublayers. Default: True. Yields: - (string, Variable): Tuple of name and Variable + (string, Tensor): Tuple of name and tensor Examples: .. 
code-block:: python import numpy as np - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - fc1 = fluid.Linear(10, 3) - buffer1 = fluid.dygraph.to_variable(np.array([0]).astype("float32")) - # register a variable as buffer by specific `persistable` - fc1.register_buffer("buf_name_1", buffer1, persistable=True) + fc1 = paddle.nn.Linear(10, 3) + buffer1 = paddle.to_tensor(np.array([0]).astype("float32")) + # register a tensor as buffer by specific `persistable` + fc1.register_buffer("buf_name_1", buffer1, persistable=True) - fc2 = fluid.Linear(3, 10) - buffer2 = fluid.dygraph.to_variable(np.array([1]).astype("float32")) - # register a buffer by assigning an attribute with Variable. - # The `persistable` can only be False by this way. - fc2.buf_name_2 = buffer2 + fc2 = paddle.nn.Linear(3, 10) + buffer2 = paddle.to_tensor(np.array([1]).astype("float32")) + # register a buffer by assigning an attribute with Tensor. + # The `persistable` can only be False by this way. + fc2.buf_name_2 = buffer2 - model = fluid.dygraph.Sequential(fc1, fc2) + model = paddle.nn.Sequential(fc1, fc2) - # get all named buffers - for name, buffer in model.named_buffers(): - print(name, buffer) + # get all named buffers + for name, buffer in model.named_buffers(): + print(name, buffer) """ buffers_set = set() @@ -654,19 +788,18 @@ def clear_gradients(self): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - with fluid.dygraph.guard(): - value = np.arange(26).reshape(2, 13).astype("float32") - a = fluid.dygraph.to_variable(value) - linear = fluid.Linear(13, 5, dtype="float32") - adam = fluid.optimizer.Adam(learning_rate=0.01, - parameter_list=linear.parameters()) - out = linear(a) - out.backward() - adam.minimize(out) - linear.clear_gradients() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, + parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + linear.clear_gradients() """ for p in self.parameters(): @@ -726,6 +859,32 @@ def add_sublayer(self, name, sublayer): sublayer(Layer): an instance of Layer. Returns: Layer: the sublayer passed in. + + Examples: + .. code-block:: python + + import paddle + + class MySequential(paddle.nn.Layer): + def __init__(self, *layers): + super(MySequential, self).__init__() + if len(layers) > 0 and isinstance(layers[0], tuple): + for name, layer in layers: + self.add_sublayer(name, layer) + else: + for idx, layer in enumerate(layers): + self.add_sublayer(str(idx), layer) + + def forward(self, input): + for layer in self._sub_layers.values(): + input = layer(input) + return input + + fc1 = paddle.nn.Linear(10, 3) + fc2 = paddle.nn.Linear(3, 10, bias_attr=False) + model = MySequential(fc1, fc2) + for prefix, layer in model.named_sublayers(): + print(prefix, layer) """ assert isinstance(sublayer, core.Layer) @@ -742,6 +901,25 @@ def add_parameter(self, name, parameter): parameter(Parameter): an instance of Parameter. Returns: Parameter: the parameter passed in. + Examples: + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self._linear = paddle.nn.Linear(1, 1) + w_tmp = self.create_parameter([1,1]) + self.add_parameter("w_tmp", w_tmp) + + def forward(self, input): + return self._linear(input) + + mylayer = MyLayer() + for name, param in mylayer.named_parameters(): + print(name, param) # will print w_tmp,_linear.weight,_linear.bias + """ if '_parameters' not in self.__dict__: raise RuntimeError( @@ -871,24 +1049,23 @@ def __dir__(self): Return a list. Get all parameters, buffers(non-parameter variables), sublayers, method and attr of Layer. Examples: - import paddle.fluid as fluid - import numpy as np - - fluid.dygraph.enable_dygraph() + .. code-block:: python + import paddle + import numpy as np - class Mylayer(fluid.dygraph.Layer): - def __init__(self): - super(Mylayer, self).__init__() - self.linear1 = fluid.dygraph.Linear(10, 10) - self.linear2 = fluid.dygraph.Linear(5, 5) - self.conv2d = fluid.dygraph.Conv2D(3, 2, 3) - self.embedding = fluid.dygraph.Embedding(size=[128, 16]) - self.h_0 = fluid.dygraph.to_variable(np.zeros([10, 10]).astype('float32')) + class Mylayer(paddle.nn.Layer): + def __init__(self): + super(Mylayer, self).__init__() + self.linear1 = paddle.nn.Linear(10, 10) + self.linear2 = paddle.nn.Linear(5, 5) + self.conv2d = paddle.nn.Conv2d(3, 2, 3) + self.embedding = paddle.nn.Embedding(128, 16) + self.h_0 = paddle.to_tensor(np.zeros([10, 10]).astype('float32')) - mylayer = Mylayer() - print(dir(mylayer)) - # only parts are shown, because of list have too much content - # ['__call__', '__class__', ... , 'conv2d', 'embedding', 'h_0', 'linear1', 'linear2', ... , 'sublayers', 'train'] + mylayer = Mylayer() + print(dir(mylayer)) + # only parts are shown, because of list have too much content + # ['__call__', '__class__', ... , 'conv2d', 'embedding', 'h_0', 'linear1', 'linear2', ... , 'sublayers', 'train'] """ method = dir(self.__class__) @@ -918,12 +1095,12 @@ def state_dict(self, Examples: .. code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + import paddle - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") + emb = paddle.nn.Embedding(10, 10) + + state_dict = emb.state_dict() + paddle.save( state_dict, "paddle_dy.pdparams") ''' @@ -967,16 +1144,12 @@ def set_state_dict(self, .. code-block:: python import paddle - - paddle.disable_static() - + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.save(state_dict, "paddle_dy.pdparams") - para_state_dict = paddle.load("paddle_dy.pdparams") - emb.set_state_dict(para_state_dict) ''' From 7ca66f1e0639b969da150fc6a6b787f501248018 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 8 Oct 2020 16:21:03 +0800 Subject: [PATCH 23/91] modify Sequential doc, test=develop (#27608) --- python/paddle/fluid/dygraph/container.py | 35 ++++++++++++------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py index 8a8787da3a543..dc43062aa9f17 100644 --- a/python/paddle/fluid/dygraph/container.py +++ b/python/paddle/fluid/dygraph/container.py @@ -34,27 +34,26 @@ class Sequential(Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np data = np.random.uniform(-1, 1, [30, 10]).astype('float32') - with fluid.dygraph.guard(): - data = fluid.dygraph.to_variable(data) - # create Sequential with iterable Layers - model1 = fluid.dygraph.Sequential( - fluid.Linear(10, 1), fluid.Linear(1, 2) - ) - model1[0] # access the first layer - res1 = model1(data) # sequential execution - - # create Sequential with name Layer pairs - model2 = fluid.dygraph.Sequential( - ('l1', fluid.Linear(10, 2)), - ('l2', fluid.Linear(2, 3)) - ) - model2['l1'] # access l1 layer - model2.add_sublayer('l3', fluid.Linear(3, 3)) # add sublayer - res2 = model2(data) # sequential execution + data = paddle.to_tensor(data) + # create Sequential with iterable Layers + model1 = paddle.nn.Sequential( + paddle.nn.Linear(10, 1), paddle.nn.Linear(1, 2) + ) + model1[0] # access the first layer + res1 = model1(data) # sequential execution + + # create Sequential with name Layer pairs + model2 = paddle.nn.Sequential( + ('l1', paddle.nn.Linear(10, 2)), + ('l2', paddle.nn.Linear(2, 3)) + ) + model2['l1'] # access l1 layer + model2.add_sublayer('l3', paddle.nn.Linear(3, 3)) # add sublayer + res2 = model2(data) # sequential execution """ From aaa3ee65e4c13bca5d66b647ad2120b2e37b4a90 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 8 Oct 2020 16:21:54 +0800 Subject: [PATCH 24/91] modify doc for unique_name.guard unique_name.generate unique_name.switch, test=develop (#27605) --- python/paddle/fluid/unique_name.py | 39 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index 406baa9d7d65c..9565dd74d83e2 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -97,9 +97,9 @@ def generate(key): .. code-block:: python - import paddle.fluid as fluid - name1 = fluid.unique_name.generate('fc') - name2 = fluid.unique_name.generate('fc') + import paddle + name1 = paddle.utils.unique_name.generate('fc') + name2 = paddle.utils.unique_name.generate('fc') print(name1, name2) # fc_0, fc_1 """ return generator(key) @@ -154,19 +154,18 @@ def switch(new_generator=None, new_para_name_checker=None): .. code-block:: python - import paddle.fluid as fluid - name1 = fluid.unique_name.generate('fc') - name2 = fluid.unique_name.generate('fc') + import paddle + name1 = paddle.utils.unique_name.generate('fc') + name2 = paddle.utils.unique_name.generate('fc') print(name1, name2) # fc_0, fc_1 - pre_generator, pre_dygraph_name_checker = fluid.unique_name.switch() # switch to a new anonymous namespace. - name2 = fluid.unique_name.generate('fc') + pre_generator, pre_dygraph_name_checker = paddle.utils.unique_name.switch() # switch to a new anonymous namespace. + name2 = paddle.utils.unique_name.generate('fc') print(name2) # fc_0 - fluid.unique_name.switch(pre_generator, pre_dygraph_name_checker) # switch back to pre_generator. - name3 = fluid.unique_name.generate('fc') + paddle.utils.unique_name.switch(pre_generator, pre_dygraph_name_checker) # switch back to pre_generator. + name3 = paddle.utils.unique_name.generate('fc') print(name3) # fc_2, since pre_generator has generated fc_0, fc_1. - """ global generator old_generator = generator @@ -204,17 +203,17 @@ def guard(new_generator=None): .. 
code-block:: python - import paddle.fluid as fluid - with fluid.unique_name.guard(): - name_1 = fluid.unique_name.generate('fc') - with fluid.unique_name.guard(): - name_2 = fluid.unique_name.generate('fc') + import paddle + with paddle.utils.unique_name.guard(): + name_1 = paddle.utils.unique_name.generate('fc') + with paddle.utils.unique_name.guard(): + name_2 = paddle.utils.unique_name.generate('fc') print(name_1, name_2) # fc_0, fc_0 - with fluid.unique_name.guard('A'): - name_1 = fluid.unique_name.generate('fc') - with fluid.unique_name.guard('B'): - name_2 = fluid.unique_name.generate('fc') + with paddle.utils.unique_name.guard('A'): + name_1 = paddle.utils.unique_name.generate('fc') + with paddle.utils.unique_name.guard('B'): + name_2 = paddle.utils.unique_name.generate('fc') print(name_1, name_2) # Afc_0, Bfc_0 """ if isinstance(new_generator, six.string_types): From c826bcb2ec03933687cf7faf5c3c09334b0726b2 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 8 Oct 2020 16:23:33 +0800 Subject: [PATCH 25/91] modify doc for ParameterList and LayerList (#27609) * modify doc for ParameterList and LayerList, test=develop * add empty line after code-block, test=develop --- python/paddle/fluid/dygraph/container.py | 90 ++++++++++++------------ 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py index dc43062aa9f17..bfcb43f5f677c 100644 --- a/python/paddle/fluid/dygraph/container.py +++ b/python/paddle/fluid/dygraph/container.py @@ -98,15 +98,15 @@ class ParameterList(Layer): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle import numpy as np - class MyLayer(fluid.Layer): + class MyLayer(paddle.nn.Layer): def __init__(self, num_stacked_param): super(MyLayer, self).__init__() # create ParameterList with iterable Parameters - self.params = fluid.dygraph.ParameterList( - [fluid.layers.create_parameter( + self.params = paddle.nn.ParameterList( + [paddle.create_parameter( shape=[2, 2], dtype='float32')] * num_stacked_param) def forward(self, x): @@ -118,27 +118,26 @@ def forward(self, x): "Y": p}, outputs={"Out": tmp}, attrs={"x_num_col_dims": 1, - "y_num_col_dims": 1}) + "y_num_col_dims": 1}) x = tmp return x data_np = np.random.uniform(-1, 1, [5, 2]).astype('float32') - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(data_np) - num_stacked_param = 4 - model = MyLayer(num_stacked_param) - print(len(model.params)) # 4 - res = model(x) - print(res.shape) # [5, 2] - - replaced_param = fluid.layers.create_parameter(shape=[2, 3], dtype='float32') - model.params[num_stacked_param - 1] = replaced_param # replace last param - res = model(x) - print(res.shape) # [5, 3] - model.params.append(fluid.layers.create_parameter(shape=[3, 4], dtype='float32')) # append param - print(len(model.params)) # 5 - res = model(x) - print(res.shape) # [5, 4] + x = paddle.to_tensor(data_np) + num_stacked_param = 4 + model = MyLayer(num_stacked_param) + print(len(model.params)) # 4 + res = model(x) + print(res.shape) # [5, 2] + + replaced_param = paddle.create_parameter(shape=[2, 3], dtype='float32') + model.params[num_stacked_param - 1] = replaced_param # replace last param + res = model(x) + print(res.shape) # [5, 3] + model.params.append(paddle.create_parameter(shape=[3, 4], dtype='float32')) # append param + print(len(model.params)) # 5 + res = model(x) + print(res.shape) # [5, 4] """ def __init__(self, parameters=None): @@ -182,14 +181,15 @@ class LayerList(Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid + + import paddle import numpy as np - class MyLayer(fluid.Layer): + class MyLayer(paddle.nn.Layer): def __init__(self): super(MyLayer, self).__init__() - self.linears = fluid.dygraph.LayerList( - [fluid.dygraph.Linear(10, 10) for i in range(10)]) + self.linears = paddle.nn.LayerList( + [paddle.nn.Linear(10, 10) for i in range(10)]) def forward(self, x): # LayerList can act as an iterable, or be indexed using ints @@ -238,13 +238,13 @@ def append(self, sublayer): Examples: .. code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)]) - another = fluid.dygraph.Linear(10, 10) - linears.append(another) - print(len(linears)) # 11 + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another = paddle.nn.Linear(10, 10) + linears.append(another) + print(len(linears)) # 11 """ self.add_sublayer(str(len(self)), sublayer) return self @@ -259,13 +259,13 @@ def insert(self, index, sublayer): Examples: .. code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)]) - another = fluid.dygraph.Linear(10, 10) - linears.insert(3, another) - print(linears[3] is another) # True + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another = paddle.nn.Linear(10, 10) + linears.insert(3, another) + print(linears[3] is another) # True """ assert isinstance(index, int) and \ 0 <= index < len(self._sub_layers), \ @@ -283,14 +283,14 @@ def extend(self, sublayers): Examples: .. code-block:: python - import paddle.fluid as fluid - - with fluid.dygraph.guard(): - linears = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(10)]) - another_list = fluid.dygraph.LayerList([fluid.dygraph.Linear(10, 10) for i in range(5)]) - linears.extend(another_list) - print(len(linears)) # 15 - print(another_list[0] is linears[10]) # True + + import paddle + + linears = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(10)]) + another_list = paddle.nn.LayerList([paddle.nn.Linear(10, 10) for i in range(5)]) + linears.extend(another_list) + print(len(linears)) # 15 + print(another_list[0] is linears[10]) # True """ offset = len(self) for i, sublayer in enumerate(sublayers): From bcc34724c2c569df88dabd4900ad693d9aa8e56a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 9 Oct 2020 10:18:37 +0800 Subject: [PATCH 26/91] polish api code & example (#27696) --- python/paddle/fluid/framework.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 61ffb60b1105d..a1cf11364f8fb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -380,31 +380,35 @@ def cuda_places(device_ids=None): For multi-card tasks, please use `FLAGS_selected_gpus` environment variable to set the visible GPU device. The next version will fix the problem with `CUDA_VISIBLE_DEVICES` environment variable. - This function creates a list of :code:`fluid.CUDAPlace` objects. + This function creates a list of :code:`paddle.CUDAPlace` objects. If :code:`device_ids` is None, environment variable of :code:`FLAGS_selected_gpus` would be checked first. 
For example, if :code:`FLAGS_selected_gpus=0,1,2`, the returned list would - be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)]. + be [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)]. If :code:`FLAGS_selected_gpus` is not set, all visible gpu places would be returned according to the :code:`CUDA_VISIBLE_DEVICES` environment variable. If :code:`device_ids` is not None, it should be the device ids of GPUs. For example, if :code:`device_ids=[0,1,2]`, the returned list would be - [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)]. + [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)]. Parameters: device_ids (list or tuple of int, optional): list of GPU device ids. Returns: - list of fluid.CUDAPlace: Created GPU place list. + list of paddle.CUDAPlace: Created GPU place list. Examples: .. code-block:: python - import paddle.fluid as fluid - cuda_places = fluid.cuda_places() + import paddle + import paddle.static as static + + paddle.enable_static() + + cuda_places = static.cuda_places() """ assert core.is_compiled_with_cuda(), \ @@ -418,7 +422,7 @@ def cuda_places(device_ids=None): def cpu_places(device_count=None): """ - This function creates a list of :code:`fluid.CPUPlace` objects, and returns the created list. + This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. If :code:`device_count` is None, the device count would be determined by environment variable :code:`CPU_NUM`. @@ -431,13 +435,17 @@ def cpu_places(device_count=None): device_count (int, optional): device number. Default: None. Returns: - list of fluid.CPUPlace: Created list of CPU places. + list of paddle.CPUPlace: Created list of CPU places. Examples: .. code-block:: python - import paddle.fluid as fluid - cpu_places = fluid.cpu_places() + import paddle + import paddle.static as static + + paddle.enable_static() + + cpu_places = static.cpu_places() """ if device_count is None: From 6076d0a5dfc6020f8a04aa9444a6d2ad3efdd715 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 9 Oct 2020 10:29:28 +0800 Subject: [PATCH 27/91] modify doc for bilinear_tensor_product, test=develop (#27613) --- python/paddle/fluid/layers/nn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 733d8b5d29f1a..dbcd91eedbdf1 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13263,10 +13263,11 @@ def bilinear_tensor_product(x, Examples: .. 
code-block:: python - import paddle.fluid as fluid - layer1 = fluid.data("t1", shape=[-1, 5], dtype="float32") - layer2 = fluid.data("t2", shape=[-1, 4], dtype="float32") - tensor = fluid.layers.bilinear_tensor_product(x=layer1, y=layer2, size=1000) + import paddle + paddle.enable_static() + layer1 = paddle.static.data("t1", shape=[-1, 5], dtype="float32") + layer2 = paddle.static.data("t2", shape=[-1, 4], dtype="float32") + tensor = paddle.static.nn.bilinear_tensor_product(x=layer1, y=layer2, size=1000) """ helper = LayerHelper('bilinear_tensor_product', **locals()) dtype = helper.input_dtype('x') From 5345a588b797598b848d5a3ec8ea60f74a0be818 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Fri, 9 Oct 2020 10:53:33 +0800 Subject: [PATCH 28/91] update readme for 1.8.5,test=document_fix (#27764) --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d14d0ef001481..580ebca8ef308 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ pip install paddlepaddle # Linux GPU cuda10cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu==1.8.4.post97 +pip install paddlepaddle-gpu==1.8.5.post97 ``` It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website. diff --git a/README_cn.md b/README_cn.md index e4544a3eff6e5..ee8cfbef1cef9 100644 --- a/README_cn.md +++ b/README_cn.md @@ -30,7 +30,7 @@ pip install paddlepaddle # Linux GPU cuda10cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda9cudnn7 -pip install paddlepaddle-gpu==1.8.4.post97 +pip install paddlepaddle-gpu==1.8.5.post97 ``` 更多安装信息详见官网 [安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html) From 9089841b6e7ec632bd5ed51ceb4fad8748e0aac3 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Fri, 9 Oct 2020 11:38:35 +0800 Subject: [PATCH 29/91] Fix bilateral inference shape bug (#26822) * fix bilateral bug --- paddle/fluid/operators/bilateral_slice_op.cc | 31 +++++++++++-------- python/paddle/fluid/contrib/layers/nn.py | 8 +++-- .../unittests/test_bilateral_slice_op.py | 17 +++++++--- 3 files changed, 36 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc index b742b4c0deea8..b00604155d67e 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cc +++ b/paddle/fluid/operators/bilateral_slice_op.cc @@ -50,20 +50,25 @@ class BilateralSliceOp : public framework::OperatorWithKernel { int64_t input_chans = input_dims[1]; int64_t output_chans; - if (has_offset) { - PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0, - platform::errors::InvalidArgument( - "Slicing with affine offset, coefficients grid " - "should have n_out*(n_in+1) channels, but got %d", - coeffs_chans)); - output_chans = coeffs_chans / (input_chans + 1); + if ((!ctx->IsRuntime()) && ((coeffs_chans < 0) || (input_chans < 0))) { + output_chans = -1; } else { - PADDLE_ENFORCE_EQ((coeffs_chans % input_chans), 0, - platform::errors::InvalidArgument( - "Slicing without affine offset, coefficients grid " - "should have n_out*n_in channels, but got %d .", - coeffs_chans)); - output_chans = coeffs_chans / input_chans; + if (has_offset) { + PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0, + platform::errors::InvalidArgument( + "Slicing with affine offset, coefficients grid " + "should have 
n_out*(n_in+1) channels, but got %d", + coeffs_chans)); + output_chans = coeffs_chans / (input_chans + 1); + } else { + PADDLE_ENFORCE_EQ( + (coeffs_chans % input_chans), 0, + platform::errors::InvalidArgument( + "Slicing without affine offset, coefficients grid " + "should have n_out*n_in channels, but got %d .", + coeffs_chans)); + output_chans = coeffs_chans / input_chans; + } } std::vector output_dims; diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index ac6493b1c2969..d0543bb90dd14 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1525,10 +1525,10 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): grid = fluid.data(name='grid', shape=[None, 12, 8, 10, 6], dtype='float32') # without offset - output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=False) + output = fluid.contrib.bilateral_slice(x, guide, grid, has_offset=False) # has offset - output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=True) + output = fluid.contrib.bilateral_slice(x, guide, grid, has_offset=True) """ helper = LayerHelper("bilateral_slice", **locals()) @@ -1541,7 +1541,9 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): out = helper.create_variable_for_type_inference(x.dtype) inputs = {'X': x, 'Guide': guide, 'Grid': grid} - + if paddle.fluid.in_dygraph_mode(): + attrs = ('has_offset', has_offset) + return getattr(core.ops, "bilateral_slice")(x, grid, guide, *attrs) helper.append_op( type='bilateral_slice', inputs=inputs, diff --git a/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py index 51e447dba725c..c0d622d7ea187 100644 --- a/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py @@ -178,16 +178,25 @@ def initTestCase(self): self.data_type = 'float32' -class TestBilateralSliceApi(TestBilateralSliceOp): +class TestBilateralSliceApi(unittest.TestCase): def test_api(self): x = paddle.fluid.data( name='x', shape=[None, 3, 25, 15], dtype='float32') guide = paddle.fluid.data( name='guide', shape=[None, 25, 15], dtype='float32') grid = paddle.fluid.data( - name='grid', shape=[None, 12, 8, 5, 3], dtype='float32') - paddle.fluid.contrib.layers.bilateral_slice(x, guide, grid, - self.has_offset) + name='grid', shape=[None, None, 8, 5, 3], dtype='float32') + paddle.fluid.contrib.layers.bilateral_slice(x, guide, grid, False) + + if not paddle.fluid.is_compiled_with_cuda(): + return + + with paddle.fluid.dygraph.guard(): + x1 = paddle.rand([3, 1, 50, 30]) + guide1 = paddle.rand([3, 50, 30]) + grid1 = paddle.rand([3, 2, 2, 5, 3]) + + paddle.fluid.contrib.bilateral_slice(x1, guide1, grid1, False) if __name__ == "__main__": From b9c7c66ea52952553a797ac86ff9045f0db9f3fd Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Fri, 9 Oct 2020 11:39:16 +0800 Subject: [PATCH 30/91] add type promotion (#27756) --- .../fluid/tests/unittests/test_kldiv_loss_op.py | 15 +++++++++++++++ python/paddle/nn/functional/loss.py | 10 ++++++++++ 2 files changed, 25 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py index 3a3b7071e04dc..aaba571e1a6b9 100644 --- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py @@ -115,5 +115,20 @@ def 
test_kl_loss_static_api(self): pred_loss = paddle.nn.functional.kl_div(input, label) +class TestKLDivLossTypePromotion(unittest.TestCase): + def test_kl_div_promotion(self): + + with paddle.fluid.dygraph.guard(): + x1 = paddle.rand([5, 20], dtype='float32') + target1 = paddle.rand([5, 20], dtype='float64') + + kldiv_criterion = paddle.nn.KLDivLoss() + pred_loss1 = kldiv_criterion(x1, target1) + + x2 = paddle.rand([5, 20], dtype='float64') + target2 = paddle.rand([5, 20], dtype='float32') + pred_loss2 = paddle.nn.functional.kl_div(x2, target2) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 76722f26007c4..05daf24ca24ab 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -800,6 +800,16 @@ def kl_div(input, label, reduction='mean', name=None): # shape=[5, 20] """ + # ugly type promotion + if fluid.data_feeder.convert_dtype( + input.dtype) == 'float32' and fluid.data_feeder.convert_dtype( + label.dtype) == 'float64': + input = fluid.layers.cast(input, 'float64') + elif fluid.data_feeder.convert_dtype( + input.dtype) == 'float64' and fluid.data_feeder.convert_dtype( + label.dtype) == 'float32': + label = fluid.layers.cast(label, 'float64') + if paddle.in_dynamic_mode(): out = core.ops.kldiv_loss(input, label, 'reduction', reduction) return out From d84eb9b33f8751e50c73c07ed0d88379d9a406e9 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Fri, 9 Oct 2020 11:39:52 +0800 Subject: [PATCH 31/91] keep network mode unchange when use summary api (#27754) * keep summary mode unchange * add no grad decorator --- python/paddle/hapi/model_summary.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index 3ead3fc295c0b..c46a53e910df0 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -106,6 +106,12 @@ def forward(self, inputs): warnings.warn( "Your model was created in static mode, this may not get correct summary information!" 
) + in_train_mode = False + else: + in_train_mode = net.training + + if in_train_mode: + net.eval() def _is_shape(shape): for item in shape: @@ -143,9 +149,13 @@ def _check_input(input_size): result, params_info = summary_string(net, _input_size, dtypes) print(result) + if in_train_mode: + net.train() + return params_info +@paddle.no_grad() def summary_string(model, input_size, dtypes=None): def _all_is_numper(items): for item in items: From 365c2c9c89152e546d0556b71f44dd8b1f003e5c Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Fri, 9 Oct 2020 11:45:19 +0800 Subject: [PATCH 32/91] fix error message showing in UpdateLossScalingOp (#27596) --- paddle/fluid/operators/amp/update_loss_scaling_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index fca3c531b4055..8bd76a9886c62 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -103,7 +103,7 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](float decr_ratio) { PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true, platform::errors::InvalidArgument( - "'incr_ratio' should be between 0 and 1, but " + "'decr_ratio' should be between 0 and 1, but " "the received is %f", decr_ratio)); }); From 994438b10956965a6034e541aa2578ef3dc41ac6 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 9 Oct 2020 12:28:26 +0800 Subject: [PATCH 33/91] change clip grad api, test=develop (#27767) --- python/paddle/fluid/clip.py | 158 +++++++----------- python/paddle/fluid/framework.py | 10 +- python/paddle/fluid/param_attr.py | 29 ++-- .../tests/unittests/test_gradient_clip.py | 66 ++------ python/paddle/nn/__init__.py | 20 +-- python/paddle/nn/clip.py | 12 +- 6 files changed, 121 insertions(+), 174 deletions(-) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 0e7a9dbea2561..505d6fef8fb53 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -26,8 +26,8 @@ from .dygraph import base as imperative_base __all__ = [ - 'set_gradient_clip', 'ErrorClipByValue', 'GradientClipByValue', - 'GradientClipByNorm', 'GradientClipByGlobalNorm' + 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', + 'ClipGradByNorm', 'ClipGradByGlobalNorm' ] @@ -115,16 +115,9 @@ def error_clip_callback(block, context): error_clip._append_clip_op(block, grad_n) -class GradientClipBase(object): - def __init__(self, need_clip=None): - if need_clip is not None and not callable(need_clip): - raise TypeError( - "The type of need_clip must be funciton, and it can filter out " - "parameter that does't need gradient clip. This function must return " - "True or False, and True means that clipping is required. Please refer to " - "API documention of GradientClipByGlobalNorm / GradientClipByNorm " - "/GradientClipByValue.") - self._need_clip_func = need_clip +class ClipGradBase(object): + def __init__(self): + super(ClipGradBase, self).__init__() def __str__(self): raise NotImplementedError() @@ -144,7 +137,7 @@ def __call__(self, params_grads): if getattr(p, 'gradient_clip_attr', None) is not None: warnings.warn( "'set_gradient_clip' will be ineffective, because you have " - "set 'grad_clip' in 'optimizer'. So, 'set_gradient_clip' " + "set 'need_clip' in 'ParamAttr'. 
So, 'set_gradient_clip' " "is redundant and you can remove it.") break return self._static_clip(params_grads) @@ -156,7 +149,7 @@ def _create_operators(self, param, grad): raise NotImplementedError() -class GradientClipByValue(GradientClipBase): +class ClipGradByValue(ClipGradBase): """ Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max]. @@ -164,19 +157,20 @@ class GradientClipByValue(GradientClipBase): - Any values greater than max are set to ``max``. - The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). + + Note: + ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. Args: max (float): The maximum value to clip by. min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max`` automatically. In this case, ``max`` must be greater than 0. - need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. Examples: .. code-block:: python @@ -184,29 +178,20 @@ class GradientClipByValue(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByValue(min=-1, max=1) - - # clip a part of parameters in network: (e.g. 
linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByValue(min=-1, max=1, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByValue(min=-1, max=1) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, max, min=None, need_clip=None): - super(GradientClipByValue, self).__init__(need_clip) + def __init__(self, max, min=None): + super(ClipGradByValue, self).__init__() if min is None: assert (max > 0.0) min = -max @@ -214,7 +199,7 @@ def __init__(self, max, min=None, need_clip=None): self.min = float(min) def __str__(self): - return "Gradient Clip By Value, min = %f, max=%f" % (self.min, self.max) + return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max) @imperative_base.no_grad def _dygraph_clip(self, params_grads): @@ -222,7 +207,7 @@ def _dygraph_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.clip(x=g, min=self.min, max=self.max) @@ -236,8 +221,7 @@ def _static_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -256,7 +240,7 @@ def _create_operators(self, param, grad): return param, new_grad -class GradientClipByNorm(GradientClipBase): +class ClipGradByNorm(ClipGradBase): """ Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` . @@ -264,8 +248,8 @@ class GradientClipByNorm(GradientClipBase): - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done. - The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). @@ -287,11 +271,12 @@ class GradientClipByNorm(GradientClipBase): .. math:: norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}} + Note: + ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + Args: clip_norm(float): The maximum norm value. - need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. Examples: .. 
code-block:: python @@ -299,29 +284,20 @@ class GradientClipByNorm(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByNorm(clip_norm=1.0) - - # clip a part of parameters in network: (e.g. linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByNorm(clip_norm=1.0, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, clip_norm, need_clip=None): - super(GradientClipByNorm, self).__init__(need_clip) + def __init__(self, clip_norm): + super(ClipGradByNorm, self).__init__() self.clip_norm = float(clip_norm) def __str__(self): @@ -333,7 +309,7 @@ def _dygraph_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.clip_by_norm(x=g, max_norm=self.clip_norm) @@ -347,8 +323,7 @@ def _static_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -367,7 +342,7 @@ def _create_operators(self, param, grad): return param, new_grad -class GradientClipByGlobalNorm(GradientClipBase): +class ClipGradByGlobalNorm(ClipGradBase): """ Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in :math:`t\_list` , and limit it to ``clip_norm`` . @@ -376,8 +351,8 @@ class GradientClipByGlobalNorm(GradientClipBase): - If the global norm is less than or equal to ``clip_norm`` , nothing will be done. - The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters in ``Program`` . If ``need_clip`` - is not None, then only part of gradients can be selected for gradient clipping. + The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``. + If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped. Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer`` (for example: :ref:`api_paddle_optimizer_SGD`). @@ -394,12 +369,13 @@ class GradientClipByGlobalNorm(GradientClipBase): global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2} + Note: + ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. 
+ Args: clip_norm (float): The maximum norm value. - group_name (str, optional): The group name for this clip. Default value is ``default_group`` - need_clip (function, optional): Type: function. This function accepts a ``Parameter`` and returns ``bool`` - (True: the gradient of this ``Parameter`` need to be clipped, False: not need). Default: None, - and gradients of all parameters in the network will be clipped. + group_name (str, optional): The group name for this clip. Default value is ``default_group``. Examples: .. code-block:: python @@ -407,29 +383,20 @@ class GradientClipByGlobalNorm(GradientClipBase): import paddle x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32') - linear = paddle.nn.Linear(10, 10) + linear = paddle.nn.Linear(in_features=10, out_features=10, + weight_attr=paddle.ParamAttr(need_clip=True), + bias_attr=paddle.ParamAttr(need_clip=False)) out = linear(x) loss = paddle.mean(out) loss.backward() - # clip all parameters in network: - clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0) - - # clip a part of parameters in network: (e.g. linear_0.w_0) - # pass a function(fileter_func) to need_clip, and fileter_func receive a ParamBase, and return bool - # def fileter_func(ParamBase): - # # It can be easily filtered by ParamBase.name(name can be set in paddle.ParamAttr, and the default name is linear_0.w_0, linear_0.b_0) - # return ParamBase.name == "linear_0.w_0" - # # Note: linear.weight and linear.bias can return the weight and bias of dygraph.Linear, respectively, and can be used to filter - # return ParamBase.name == linear.weight.name - # clip = paddle.nn.GradientClipByGlobalNorm(clip_norm=1.0, need_clip=fileter_func) - + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip) sdg.step() """ - def __init__(self, clip_norm, group_name="default_group", need_clip=None): - super(GradientClipByGlobalNorm, self).__init__(need_clip) + def __init__(self, clip_norm, group_name="default_group"): + super(ClipGradByGlobalNorm, self).__init__() self.clip_norm = float(clip_norm) self.group_name = group_name @@ -443,7 +410,7 @@ def _dygraph_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: continue merge_grad = g if g.type == core.VarDesc.VarType.SELECTED_ROWS: @@ -469,7 +436,7 @@ def _dygraph_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func(p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue new_grad = layers.elementwise_mul(x=g, y=clip_var) @@ -484,8 +451,7 @@ def _static_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: continue merge_grad = g with p.block.program._optimized_guard([p, g]): @@ -518,8 +484,7 @@ def _static_clip(self, params_grads): for p, g in params_grads: if g is None: continue - if self._need_clip_func is not None and not self._need_clip_func( - p): + if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue @@ -670,9 +635,9 @@ def network(): "This method can reduce the mistakes, please " "refer to documention of 'optimizer'.") - if not isinstance(clip, GradientClipBase): + if not isinstance(clip, ClipGradBase): raise 
TypeError( - "'clip' should be an instance of GradientClipBase's derived class") + "'clip' should be an instance of ClipGradBase's derived class") if program is None: program = framework.default_main_program() @@ -708,7 +673,7 @@ def append_gradient_clip_ops(param_grads): clip_attr = getattr(p, 'gradient_clip_attr', None) if clip_attr is None: return param_grads - if not isinstance(clip_attr, GradientClipBase): + if not isinstance(clip_attr, ClipGradBase): raise TypeError( "clip attribute should be an instance of GradientClipBase") @@ -754,6 +719,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): op._set_attr('op_role_var', correct_p_g) -ClipByValue = GradientClipByValue -ClipByNorm = GradientClipByNorm -ClipByGlobalNorm = GradientClipByGlobalNorm +GradientClipBase = ClipGradBase +GradientClipByValue = ClipGradByValue +GradientClipByNorm = ClipGradByNorm +GradientClipByGlobalNorm = ClipGradByGlobalNorm diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a1cf11364f8fb..52c1e5d5e16c1 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -5123,6 +5123,8 @@ class Parameter(Variable): be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. + need_clip (bool): Whether the parameter gradient need to be cliped + in optimizer. Default is True. """ def __init__(self, @@ -5162,6 +5164,8 @@ def __init__(self, self.do_model_average = kwargs.get('do_model_average', None) + self.need_clip = kwargs.get('need_clip', True) + self.is_distributed = False def __str__(self): @@ -5194,7 +5198,7 @@ def to_string(self, throw_on_error, with_details=False): if with_details: res_str = Variable.to_string(self, throw_on_error, True) additional_attr = ("trainable", "optimize_attr", "regularizer", - "do_model_average") + "do_model_average", "need_clip") for attr_name in additional_attr: res_str += "%s: %s\n" % (attr_name, cpt.to_text(getattr(self, attr_name))) @@ -5226,6 +5230,8 @@ class ParamBase(core.VarBase): be applied on the ParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this ParamBase. + need_clip (bool): Whether the parameter gradient need to be cliped + in optimizer. Default is True. """ @dygraph_only @@ -5265,6 +5271,8 @@ def __init__(self, shape, dtype, **kwargs): self.do_model_average = kwargs.get('do_model_average', None) + self.need_clip = kwargs.get('need_clip', True) + self.is_distributed = False # self.block = default_main_program().global_block() diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 83f54fc8208db..bf04239370693 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -36,8 +36,8 @@ class ParamAttr(object): Note: ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. - It is recommended to set ``grad_clip`` in ``optimizer`` to clip gradient. - There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` . Parameters: @@ -57,6 +57,7 @@ class ParamAttr(object): trainable (bool): Whether this parameter is trainable. Default True. 
do_model_average (bool): Whether this parameter should do model average when model average is enabled. Default False. + need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. Examples: .. code-block:: python @@ -78,7 +79,8 @@ def __init__(self, learning_rate=1.0, regularizer=None, trainable=True, - do_model_average=True): + do_model_average=True, + need_clip=True): if sys.version_info.major == 2: check_type(name, "name", (str, type(None), unicode), "ParamAttr") @@ -87,6 +89,7 @@ def __init__(self, check_type(learning_rate, "learning_rate", (float, int), "ParamAttr") check_type(trainable, "trainable", (bool), "ParamAttr") check_type(do_model_average, "do_model_average", (bool), "ParamAttr") + check_type(need_clip, "need_clip", (bool), "ParamAttr") check_type(initializer, "initializer", (Initializer, type(None)), "ParamAttr") check_type(regularizer, "regularizer", @@ -101,6 +104,7 @@ def __init__(self, self.regularizer = regularizer self.trainable = trainable self.do_model_average = do_model_average + self.need_clip = need_clip def _set_default_initializer(self, initializer): """ @@ -197,7 +201,8 @@ def _to_kwargs(self, with_initializer=False): }, 'regularizer': self.regularizer, 'trainable': self.trainable, - 'do_model_average': self.do_model_average + 'do_model_average': self.do_model_average, + 'need_clip': self.need_clip } if with_initializer: kwargs['initializer'] = self.initializer @@ -219,9 +224,9 @@ class WeightNormParamAttr(ParamAttr): `_. Note: - ``gradient_clip`` of ``WeightNormParamAttr`` HAS BEEN DEPRECATED since 2.0. - It is recommended to use ``minimize(loss, grad_clip=clip)`` to clip gradient. - There are three clipping strategies: :ref:`api_fluid_clip_GradientClipByGlobalNorm` , + ``gradient_clip`` of ``ParamAttr`` HAS BEEN DEPRECATED since 2.0. + Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope. + There are three clipping strategies: :ref:`api_paddle_nn_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` . @@ -248,6 +253,7 @@ class WeightNormParamAttr(ParamAttr): trainable(bool, optional): Whether this parameter is trainable. Default True. do_model_average(bool, optional): Whether this parameter should do model average. Default False. + need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True. Examples: .. code-block:: python @@ -267,7 +273,8 @@ class WeightNormParamAttr(ParamAttr): learning_rate=1.0, regularizer=paddle.regularizer.L2Decay(0.1), trainable=True, - do_model_average=False)) + do_model_average=False, + need_clip=True)) """ # List to record the parameters reparameterized by weight normalization. 
@@ -283,12 +290,14 @@ def __init__(self, learning_rate=1.0, regularizer=None, trainable=True, - do_model_average=False): + do_model_average=False, + need_clip=True): super(WeightNormParamAttr, self).__init__( name=name, initializer=initializer, learning_rate=learning_rate, regularizer=regularizer, trainable=trainable, - do_model_average=do_model_average) + do_model_average=do_model_average, + need_clip=need_clip) self.dim = dim diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index cc54e680c7525..f258e830b5fe5 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -185,12 +185,7 @@ def func(params_grads): # invoke 'set_gradient_clip' in a wrong order def test_wrong_API_order(self): def backward_func(cost): - # no clip gradient - def fileter_func(param): - return param.name == "fc.w_0" - - clip = fluid.clip.GradientClipByGlobalNorm( - clip_norm=5.0, need_clip=fileter_func) + clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) fluid.clip.set_gradient_clip(clip) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01, grad_clip=clip) @@ -205,11 +200,7 @@ def fileter_func(param): # if grad is None or not need clip def test_none_grad(self): - def fileter_func(param): - return param.name == "x" - - clip = fluid.clip.GradientClipByGlobalNorm( - self.clip_norm, need_clip=fileter_func) + clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm) x = fluid.default_main_program().global_block().create_parameter( name="x", shape=[2, 3], dtype="float32") y = fluid.default_main_program().global_block().create_parameter( @@ -228,11 +219,6 @@ def fileter_func(param): # raise typeError def test_tpyeError(self): - # the type of need_clip must be an funciton - with self.assertRaises(TypeError): - clip = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm, need_clip="test") - # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class with self.assertRaises(TypeError): sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1, @@ -264,26 +250,22 @@ def test_gradient_clip(self): # if grad is None or not need clip def test_none_grad(self): - def fileter_func(param): - return param.name == "z" - - clip = fluid.clip.GradientClipByNorm( - self.clip_norm, need_clip=fileter_func) + clip = fluid.clip.GradientClipByNorm(self.clip_norm) x = fluid.default_main_program().global_block().create_parameter( - name="x", shape=[2, 3], dtype="float32") + name="x", shape=[2, 3], dtype="float32", need_clip=False) y = fluid.default_main_program().global_block().create_parameter( - name="y", shape=[2, 3], dtype="float32") + name="y", shape=[2, 3], dtype="float32", need_clip=False) # (x, None) should not be returned params_grads = [(x, None), (x, y)] params_grads = clip(params_grads) self.assertTrue( len(clip(params_grads)) == 1, - "ClipByNorm: when grad is None, it shouldn't be returned by gradient clip!" + "ClipGradByNorm: when grad is None, it shouldn't be returned by gradient clip!" 
) self.assertTrue( params_grads[0][1].name == 'y', - "ClipByNorm: grad should not be clipped when filtered out!") + "ClipGradByNorm: grad should not be clipped when filtered out!") class TestGradientClipByValue(TestGradientClip): @@ -312,26 +294,22 @@ def test_gradient_clip(self): # if grad is None or not need clip def test_none_grad(self): - def fileter_func(param): - return param.name == "z" - - clip = fluid.clip.GradientClipByValue( - self.max, self.min, need_clip=fileter_func) + clip = fluid.clip.GradientClipByValue(self.max, self.min) x = fluid.default_main_program().global_block().create_parameter( - name="x", shape=[2, 3], dtype="float32") + name="x", shape=[2, 3], dtype="float32", need_clip=False) y = fluid.default_main_program().global_block().create_parameter( - name="y", shape=[2, 3], dtype="float32") + name="y", shape=[2, 3], dtype="float32", need_clip=False) # (x, None) should not be returned params_grads = [(x, None), (x, y)] params_grads = clip(params_grads) self.assertTrue( len(clip(params_grads)) == 1, - "ClipByValue: when grad is None, it shouldn't be returned by gradient clip!" + "ClipGradByValue: when grad is None, it shouldn't be returned by gradient clip!" ) self.assertTrue( params_grads[0][1].name == 'y', - "ClipByValue: grad should not be clipped when filtered out!") + "ClipGradByValue: grad should not be clipped when filtered out!") class TestDygraphGradientClip(unittest.TestCase): @@ -355,13 +333,9 @@ def check_clip_result(self, loss, optimizer): class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip): def setUp(self): - # only clip gradient of x (ParamBase) - def fileter_func(param): - return param.name == "x" - self.clip_norm = 0.8 self.clip1 = fluid.clip.GradientClipByGlobalNorm( - clip_norm=self.clip_norm, need_clip=fileter_func) + clip_norm=self.clip_norm) self.clip2 = fluid.clip.GradientClipByGlobalNorm( clip_norm=self.clip_norm) @@ -401,13 +375,8 @@ def check_clip_result(self, loss, optimizer): class TestDygraphGradientClipByNorm(TestDygraphGradientClip): def setUp(self): - # only clip gradient of linear_0.w_0 (ParamBase) - def fileter_func(param): - return param.name == "linear_0.w_0" - self.clip_norm = 0.8 - self.clip = fluid.clip.GradientClipByNorm( - clip_norm=self.clip_norm, need_clip=fileter_func) + self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) def check_clip_result(self, loss, optimizer): # if grad is None @@ -435,14 +404,9 @@ def check_clip_result(self, loss, optimizer): class TestDygraphGradientClipByValue(TestDygraphGradientClip): def setUp(self): - # only clip gradient of linear_0.w_0 (ParamBase) - def fileter_func(param): - return param.name == "linear_0.w_0" - self.max = 0.2 self.min = 0.1 - self.clip = fluid.clip.GradientClipByValue( - max=self.max, min=self.min, need_clip=fileter_func) + self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) def check_clip_result(self, loss, optimizer): # if grad is None diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 2452f196987b8..82fec5c0faa2e 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -31,9 +31,9 @@ __all__ += weight_norm_hook.__all__ # TODO: define alias in nn directory -from .clip import GradientClipByGlobalNorm #DEFINE_ALIAS -from .clip import GradientClipByNorm #DEFINE_ALIAS -from .clip import GradientClipByValue #DEFINE_ALIAS +from .clip import ClipGradByGlobalNorm #DEFINE_ALIAS +from .clip import ClipGradByNorm #DEFINE_ALIAS +from .clip import ClipGradByValue #DEFINE_ALIAS # from 
.clip import set_gradient_clip #DEFINE_ALIAS from .clip import clip #DEFINE_ALIAS from .clip import clip_by_norm #DEFINE_ALIAS @@ -51,13 +51,13 @@ # from .decode import dynamic_decode #DEFINE_ALIAS from .decode import gather_tree #DEFINE_ALIAS # from .input import Input #DEFINE_ALIAS -from .layer.activation import ELU -from .layer.activation import GELU -from .layer.activation import Tanh -from .layer.activation import Hardshrink -from .layer.activation import Hardtanh -from .layer.activation import PReLU -from .layer.activation import ReLU +from .layer.activation import ELU #DEFINE_ALIAS +from .layer.activation import GELU #DEFINE_ALIAS +from .layer.activation import Tanh #DEFINE_ALIAS +from .layer.activation import Hardshrink #DEFINE_ALIAS +from .layer.activation import Hardtanh #DEFINE_ALIAS +from .layer.activation import PReLU #DEFINE_ALIAS +from .layer.activation import ReLU #DEFINE_ALIAS from .layer.activation import ReLU6 #DEFINE_ALIAS from .layer.activation import SELU #DEFINE_ALIAS from .layer.activation import LeakyReLU #DEFINE_ALIAS diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py index a50dad628cf32..9fd1241bd83e0 100644 --- a/python/paddle/nn/clip.py +++ b/python/paddle/nn/clip.py @@ -13,18 +13,18 @@ # limitations under the License. # TODO: define the functions to clip gradient of parameter -from ..fluid.clip import GradientClipByGlobalNorm #DEFINE_ALIAS -from ..fluid.clip import GradientClipByNorm #DEFINE_ALIAS -from ..fluid.clip import GradientClipByValue #DEFINE_ALIAS +from ..fluid.clip import ClipGradByGlobalNorm #DEFINE_ALIAS +from ..fluid.clip import ClipGradByNorm #DEFINE_ALIAS +from ..fluid.clip import ClipGradByValue #DEFINE_ALIAS from ..fluid.layers import clip #DEFINE_ALIAS from ..fluid.layers import clip_by_norm #DEFINE_ALIAS __all__ = [ # 'ErrorClipByValue', - 'GradientClipByGlobalNorm', - 'GradientClipByNorm', - 'GradientClipByValue', + 'ClipGradByGlobalNorm', + 'ClipGradByNorm', + 'ClipGradByValue', # 'set_gradient_clip', 'clip', 'clip_by_norm' From ec7d11a4922f2013304a2203b07db727100e3816 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Fri, 9 Oct 2020 12:47:30 +0800 Subject: [PATCH 34/91] refine fused_elemwise_activation error message (#27734) --- .../fused/fused_elemwise_activation_op.h | 52 ++++++++++++------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index 2c0c5f9ec0afa..c61b9a9e48854 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -276,7 +276,8 @@ static void RunFunctors(const framework::ExecutionContext &ctx, ctx, paddle::operators::math::MulFunctor(), paddle::operators::math::SigmoidFunctor(), in_x, in_y, outputs); } else { - PADDLE_THROW("%s has not been implemented.", funcs_str); + PADDLE_THROW(platform::errors::InvalidArgument( + "%s has not been implemented.", funcs_str)); } } @@ -374,7 +375,8 @@ static void RunGradFunctors( paddle::operators::math::SigmoidGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else { - PADDLE_THROW("%s has not been implemented.", funcs_str); + PADDLE_THROW(platform::errors::InvalidArgument( + "%s has not been implemented.", funcs_str)); } } @@ -386,16 +388,21 @@ class FusedElemwiseActivationKernel : public framework::OpKernel { "X", "FusedElemwiseActivation"); auto &in_y = 
GET_DATA_SAFELY(ctx.Input("Y"), "Input", "Y", "FusedElemwiseActivation"); - PADDLE_ENFORCE(ctx.HasOutput("Out"), "The output(Out) should not be empty"); + + PADDLE_ENFORCE_EQ(ctx.HasOutput("Out"), true, + platform::errors::InvalidArgument( + "The output(Out) should not be empty")); auto output = ctx.Output("Out"); std::vector outputs; outputs.emplace_back(output); if (ctx.Attr("save_intermediate_out")) { - PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"), - "The save_intermediate_out is enable, so the " - "IntermediateOut should not be empty."); + PADDLE_ENFORCE_EQ(ctx.HasOutput("IntermediateOut"), true, + platform::errors::InvalidArgument( + "The save_intermediate_out is enable, so the " + "IntermediateOut should not be empty.")); + auto intermediate_out = ctx.Output("IntermediateOut"); outputs.emplace_back(intermediate_out); } else { @@ -411,13 +418,18 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto in_y = ctx.Input("Y"); - PADDLE_ENFORCE(in_y != nullptr, "Input(Y) should not be nullptr."); + PADDLE_ENFORCE_NE(in_y, nullptr, platform::errors::InvalidArgument( + "Input(Y) should not be nullptr.")); auto in_out = ctx.Input("Out"); - PADDLE_ENFORCE(in_out != nullptr, "Input(Out) should not be nullptr."); + PADDLE_ENFORCE_NE( + in_out, nullptr, + platform::errors::InvalidArgument("Input(Out) should not be nullptr.")); auto in_out_grad = ctx.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE(in_out_grad != nullptr, - "Input(Out@Grad) should not be nullptr."); + PADDLE_ENFORCE_NE(in_out_grad, nullptr, + platform::errors::InvalidArgument( + "Input(Out@Grad) should not be nullptr.")); + framework::Tensor *in_x = const_cast(ctx.Input("X")); framework::Tensor *x_grad = @@ -437,24 +449,28 @@ class FusedElemwiseActivationGradKernel : public framework::OpKernel { // recompute. in_intermediate_out = const_cast( ctx.Input("IntermediateOut")); - PADDLE_ENFORCE(in_intermediate_out != nullptr, - "The option of 'save_intermediate_out' is opened, " - "so the number of 'Out' should be two."); + PADDLE_ENFORCE_NE(in_intermediate_out, nullptr, + platform::errors::InvalidArgument( + "The option of 'save_intermediate_out' is opened," + " so the number of 'Out' should be two.")); } else { if (!InputXCanBeAbsent(functor_list)) { - PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be null."); + PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument( + "Input(X) should not be null.")); } } // Get in_x if (ctx.HasInput("X")) { - PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be nullptr."); + PADDLE_ENFORCE_NE(in_x, nullptr, platform::errors::InvalidArgument( + "Input(X) should not be null.")); } else { // If functor_list contains elementwise_add, the backward doesn't use // in_x, in_y and in_out. - PADDLE_ENFORCE(InputXCanBeAbsent(functor_list), - "Only when the compoundfunctor contains " - "elementwise_add_grad, the 'X' could be absent."); + PADDLE_ENFORCE_EQ(InputXCanBeAbsent(functor_list), true, + platform::errors::InvalidArgument( + "Only when the compoundfunctor contains " + "elementwise_add_grad, the 'X' could be absent.")); in_x = const_cast(in_out_grad); } From 7ecbc465c1d5bcacbd0b1fab91ae4db1292fe934 Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Fri, 9 Oct 2020 13:21:11 +0800 Subject: [PATCH 35/91] reimplement paddle.nn.functional.sigmoid_focal_loss (#27748) * reimplement paddle.nn.functional.sigmoid_focal_loss. test=develop * fix reduction error message. 
test=develop * fix exp. test=develop * reset the shape of logit. test=develop * delete disable_static in example. test=develop --- .../unittests/test_sigmoid_focal_loss.py | 165 ++++++++++++++++++ python/paddle/nn/functional/loss.py | 163 ++++++++++++++++- 2 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py new file mode 100644 index 0000000000000..71e119739e777 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py @@ -0,0 +1,165 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +import numpy as np +import unittest +from op_test import OpTest +from test_sigmoid_focal_loss_op import sigmoid_focal_loss_forward + + +def call_sfl_functional(logit, + label, + normalizer, + alpha=0.25, + gamma=2.0, + reduction='sum'): + res = paddle.nn.functional.sigmoid_focal_loss( + logit, label, normalizer, alpha=alpha, gamma=gamma, reduction=reduction) + return res + + +def test_static(place, + logit_np, + label_np, + normalizer_np, + alpha=0.25, + gamma=2.0, + reduction='sum'): + paddle.enable_static() + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + logit = paddle.data(name='logit', shape=logit_np.shape, dtype='float64') + label = paddle.data(name='label', shape=label_np.shape, dtype='float64') + feed_dict = {"logit": logit_np, "label": label_np} + + normalizer = None + if normalizer_np is not None: + normalizer = paddle.data( + name='normalizer', shape=normalizer_np.shape, dtype='float64') + feed_dict["normalizer"] = normalizer_np + + res = call_sfl_functional(logit, label, normalizer, alpha, gamma, + reduction) + exe = paddle.static.Executor(place) + static_result = exe.run(prog, feed=feed_dict, fetch_list=[res]) + return static_result + + +def test_dygraph(place, + logit_np, + label_np, + normalizer_np, + alpha=0.25, + gamma=2.0, + reduction='sum'): + paddle.disable_static() + logit = paddle.to_tensor(logit_np) + label = paddle.to_tensor(label_np) + normalizer = None + if normalizer_np is not None: + normalizer = paddle.to_tensor(normalizer_np) + dy_res = call_sfl_functional(logit, label, normalizer, alpha, gamma, + reduction) + dy_result = dy_res.numpy() + paddle.enable_static() + return dy_result + + +def calc_sigmoid_focal_loss(logit_np, + label_np, + normalizer_np, + alpha=0.25, + gamma=2.0, + reduction='sum'): + + loss = np.maximum( + logit_np, + 0) - logit_np * label_np + np.log(1 + np.exp(-np.abs(logit_np))) + + pred = 1 / (1 + np.exp(-logit_np)) + p_t = pred * label_np + (1 - pred) * (1 - label_np) + + if alpha is not None: + alpha_t = alpha * label_np + (1 - alpha) * (1 - label_np) + loss = alpha_t * loss + + if gamma is not None: + loss = loss * ((1 - 
p_t)**gamma) + + if normalizer_np is not None: + loss = loss / normalizer_np + + if reduction == 'mean': + loss = np.mean(loss) + elif reduction == 'sum': + loss = np.sum(loss) + + return loss + + +class TestSigmoidFocalLoss(unittest.TestCase): + def test_SigmoidFocalLoss(self): + logit_np = np.random.uniform( + 0.1, 0.8, size=(2, 3, 4, 10)).astype(np.float64) + label_np = np.random.randint( + 0, 2, size=(2, 3, 4, 10)).astype(np.float64) + normalizer_nps = [ + np.asarray( + [np.sum(label_np > 0)], dtype=label_np.dtype), None + ] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + reductions = ['sum', 'mean', 'none'] + alphas = [0.25, 0.5] + gammas = [3, 0.] + for place in places: + for reduction in reductions: + for alpha in alphas: + for gamma in gammas: + for normalizer_np in normalizer_nps: + static_result = test_static(place, logit_np, + label_np, normalizer_np, + alpha, gamma, reduction) + dy_result = test_dygraph(place, logit_np, label_np, + normalizer_np, alpha, + gamma, reduction) + expected = calc_sigmoid_focal_loss( + logit_np, label_np, normalizer_np, alpha, gamma, + reduction) + self.assertTrue( + np.allclose(static_result, expected)) + self.assertTrue( + np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + + def test_SigmoidFocalLoss_error(self): + paddle.disable_static() + logit = paddle.to_tensor([[0.97], [0.91], [0.03]], dtype='float32') + label = paddle.to_tensor([[1.0], [1.0], [0.0]], dtype='float32') + self.assertRaises( + ValueError, + paddle.nn.functional.sigmoid_focal_loss, + logit=logit, + label=label, + normalizer=None, + reduction="unsupport reduction") + paddle.enable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 05daf24ca24ab..c4b5606dddcf1 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -32,7 +32,6 @@ from ...fluid.layers import rank_loss #DEFINE_ALIAS from ...fluid.layers import reshape from ...fluid.layers import sigmoid_cross_entropy_with_logits #DEFINE_ALIAS -from ...fluid.layers import sigmoid_focal_loss #DEFINE_ALIAS from ...fluid.layers import smooth_l1 #DEFINE_ALIAS from ...fluid.layers import softmax_with_cross_entropy #DEFINE_ALIAS from ...fluid.layers import square_error_cost #DEFINE_ALIAS @@ -1151,3 +1150,165 @@ def cross_entropy(input, out = reshape(out, shape=out_shape) return out + + +def sigmoid_focal_loss(logit, + label, + normalizer=None, + alpha=0.25, + gamma=2.0, + reduction='sum', + name=None): + """ + `Focal Loss `_ is proposed to address the + foreground-background class imbalance for classification tasks. It down-weights + easily-classified examples and thus focuses training on hard examples. For example, + it is used in one-stage object detection where the foreground-background class + imbalance is extremely high. + + This operator measures focal loss function as follows: + + .. math:: + Out = -Labels * alpha * {(1 - \\sigma(Logit))}^{gamma}\\log(\\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\\sigma(Logit)}^{gamma}\\log(1 - \\sigma(Logit)) + + We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\exp(-Logit)}`. + + Then, if :attr:`normalizer` is not None, this operator divides the + normalizer tensor on the loss `Out`: + + .. math:: + Out = \\frac{Out}{normalizer} + + Finally, this operator applies reduce operation on the loss. 
+ If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`. + If :attr:`reduction` set to ``'mean'``, the reduced mean loss is :math:`Out = MEAN(Out)`. + If :attr:`reduction` set to ``'sum'``, the reduced sum loss is :math:`Out = SUM(Out)`. + + Note that the target ``label`` is 0 for the negative class and is 1 for the positive class. + + Args: + logit (Tensor): The input logit tensor. The shape is [N, *], where N is batch_size, + `*` means any number of additional dimensions. The ``logit`` is usually the + output of a convolution layer. Available dtype is float32, float64. + label (Tensor): The target label tensor with the same shape as + ``logit``. The target label whose value should be numbers between 0 and 1. + Available dtype is float32, float64. + normalizer (Tensor, optional): The number normalizes the focal loss. It has to be + a 1-D Tensor whose shape is `[1, ]`. The data type is float32, float64. + For object detection task, it is the the number of positive samples. + If set to None, the focal loss will not be normalized. Default is None. + alpha(int|float, optional): Hyper-parameter to balance the positive and negative example, + it should be between 0 and 1. Default value is set to 0.25. + gamma(int|float, optional): Hyper-parameter to modulate the easy and hard examples. + Default value is set to 2.0. + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default is ``'sum'``. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as ``logit``. The same dtype as ``logit`` tensor. + + Examples: + + .. code-block:: python + + import paddle + + logit = paddle.to_tensor([[0.97, 0.91, 0.03], [0.55, 0.43, 0.71]], dtype='float32') + label = paddle.to_tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32') + one = paddle.to_tensor([1.], dtype='float32') + fg_label = paddle.greater_equal(label, one) + fg_num = paddle.reduce_sum(paddle.cast(fg_label, dtype='float32')) + output = paddle.nn.functional.sigmoid_focal_loss(logit, label, normalizer=fg_num) + print(output.numpy()) # [0.65782464] + + """ + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in sigmoid_focal_loss " + "should be 'sum', 'mean' or 'none', but received %s, which is not allowed." + % reduction) + + if normalizer is not None: + fluid.data_feeder.check_variable_and_dtype(normalizer, 'normalizer', + ['float32', 'float64'], + 'sigmoid_focal_loss') + normalizer_shape = list(normalizer.shape) + normalizer_dims = len(normalizer_shape) + if normalizer_dims > 1: + raise ValueError( + "Expected one dimension of normalizer in sigmoid_focal_loss but got {}.". 
+ format(normalizer_dims)) + + if in_dygraph_mode(): + one = _varbase_creator(dtype=logit.dtype) + core.ops.fill_constant(one, 'value', + float(1.0), 'force_cpu', False, 'dtype', + one.dtype, 'str_value', '1.0', 'shape', + logit.shape) + loss = core.ops.sigmoid_cross_entropy_with_logits(logit, label) + pred = core.ops.sigmoid(logit) + p_t = core.ops.elementwise_add( + core.ops.elementwise_mul(pred, label), + core.ops.elementwise_mul( + core.ops.elementwise_sub(one, pred), + core.ops.elementwise_sub(one, label))) + + alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype) + alpha_t = core.ops.elementwise_add( + core.ops.elementwise_mul(alpha, label), + core.ops.elementwise_mul( + core.ops.elementwise_sub(one, alpha), + core.ops.elementwise_sub(one, label))) + loss = core.ops.elementwise_mul(alpha_t, loss) + + gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype) + gamma_t = core.ops.elementwise_pow( + core.ops.elementwise_sub(one, p_t), gamma) + loss = core.ops.elementwise_mul(gamma_t, loss) + + if normalizer is not None: + loss = core.ops.elementwise_div(loss, normalizer) + + if reduction == "sum": + return core.ops.reduce_sum(loss, 'reduce_all', True) + elif reduction == "mean": + return core.ops.mean(loss) + + return loss + + fluid.data_feeder.check_variable_and_dtype( + logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss') + fluid.data_feeder.check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss') + + bce_name = None + if reduction == 'none' and normalizer is None: + bce_name = name + loss = paddle.nn.functional.binary_cross_entropy_with_logits( + logit, label, reduction='none', name=bce_name) + + pred = fluid.layers.sigmoid(logit) + p_t = pred * label + (1 - pred) * (1 - label) + + alpha_t = alpha * label + (1 - alpha) * (1 - label) + loss = paddle.multiply(alpha_t, loss) + + gamma_t = paddle.pow((1 - p_t), gamma) + loss = paddle.multiply(gamma_t, loss) + + if normalizer is not None: + normalizer_name = name if reduction == 'none' else None + loss = paddle.divide(loss, normalizer, name=normalizer_name) + + if reduction == 'mean': + loss = paddle.mean(loss, name=name) + elif reduction == 'sum': + loss = paddle.sum(loss, name=name) + + return loss From 6da552a22f21452978959a8cca371b082693aec7 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Fri, 9 Oct 2020 15:52:41 +0800 Subject: [PATCH 36/91] Update initializer examples of Bilinear (#27709) --- python/paddle/fluid/initializer.py | 41 +++++++++++++++--------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 7a92adf0a89dc..67c572d4988ce 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -729,31 +729,32 @@ class BilinearInitializer(Initializer): .. 
code-block:: python - import paddle.fluid as fluid import math + + import paddle + import paddle.nn as nn + from paddle.regularizer import L2Decay + factor = 2 C = 2 B = 8 H = W = 32 - w_attr = fluid.param_attr.ParamAttr( - learning_rate=0., - regularizer=fluid.regularizer.L2Decay(0.), - initializer=fluid.initializer.Bilinear()) - x = fluid.data(name="data", shape=[B, 3, H, W], - dtype="float32") - conv_up = fluid.layers.conv2d_transpose( - input=x, - num_filters=C, - output_size=None, - filter_size=2 * factor - factor % 2, - padding=int(math.ceil((factor - 1) / 2.)), - stride=factor, - groups=C, - param_attr=w_attr, - bias_attr=False) - - Where, `num_filters=C` and `groups=C` means this is channel-wise transposed - convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`, + w_attr = paddle.ParamAttr(learning_rate=0., + regularizer=L2Decay(0.), + initializer=nn.initializer.Bilinear()) + data = paddle.rand([B, 3, H, W], dtype='float32') + conv_up = nn.ConvTranspose2d(3, + out_channels=C, + kernel_size=2 * factor - factor % 2, + padding=int( + math.ceil((factor - 1) / 2.)), + stride=factor, + weight_attr=w_attr, + bias_attr=False) + x = conv_up(data) + + Where, `out_channels=C` and `groups=C` means this is channel-wise transposed + convolution. The filter shape will be (C, 1, K, K) where K is `kernel_size`, This initializer will set a (K, K) interpolation kernel for every channel of the filter identically. The resulting shape of the output feature map will be (B, C, factor * H, factor * W). Note that the learning rate and the From baddedfdf1b13a76738cd30f7ee26ab4aac6bc0b Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 9 Oct 2020 15:56:44 +0800 Subject: [PATCH 37/91] fix ut test=develop (#27760) --- python/paddle/fluid/tests/unittests/hdfs_test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 266e42c06199d..29204a000592a 100644 --- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -157,7 +157,7 @@ def _test_download(self, fs): assert fs.need_upload_download() - self.assertTrue(fs.is_exist(dst_file)) + self.assertFalse(fs.is_exist(dst_file)) fs.delete(dst_file) fs.delete(src_file) From 6c1acf34ed2d612c98b8a3e1709486cf5ca15789 Mon Sep 17 00:00:00 2001 From: xiemoyuan <71377852+xiemoyuan@users.noreply.github.com> Date: Fri, 9 Oct 2020 17:15:40 +0800 Subject: [PATCH 38/91] Optimize the error message for OP (#27617) * Optimize the error message for OPs. * Optimize the error message for OPs in details. 
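As a rough illustration of the pattern this commit (and the operator patches that follow) applies, a bare PADDLE_ENFORCE is rewritten into the comparison macro plus a typed error that also reports the value actually received; the operator, shapes, and message text below are hypothetical and not a quote of any single hunk:

    // Before: untyped check whose message gives no context.
    // PADDLE_ENFORCE(x_dims.size() == 2, "Input(X)'s rank must be 2.");

    // After: comparison macro with a typed error carrying the received value.
    PADDLE_ENFORCE_EQ(x_dims.size(), 2,
                      platform::errors::InvalidArgument(
                          "Input(X)'s rank must be 2. Received %d instead.",
                          x_dims.size()));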
--- .../fluid/operators/beam_search_decode_op.cc | 3 +- paddle/fluid/operators/chunk_eval_op.h | 2 +- paddle/fluid/operators/cudnn_lstm_op.cc | 4 +-- paddle/fluid/operators/edit_distance_op.cu | 5 +-- paddle/fluid/operators/edit_distance_op.h | 14 ++++---- paddle/fluid/operators/expand_as_op.cc | 5 +-- paddle/fluid/operators/expand_as_op.h | 30 +++++++++------- paddle/fluid/operators/linear_chain_crf_op.h | 36 ++++++++++++------- 8 files changed, 61 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 3cb3f1d48bfa7..4bf4ba1120df0 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -117,7 +117,8 @@ void BeamSearchDecodeFunctor::apply() const { template <> void BeamSearchDecodeFunctor::apply() const { - PADDLE_THROW("beam search decode op does not support bool!"); + PADDLE_THROW(platform::errors::InvalidArgument( + "beam search decode op does not support bool!")); } class BeamSearchDecodeOp : public framework::OperatorBase { diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h index bee3ab37448e8..555130fe85268 100644 --- a/paddle/fluid/operators/chunk_eval_op.h +++ b/paddle/fluid/operators/chunk_eval_op.h @@ -146,7 +146,7 @@ class ChunkEvalKernel : public framework::OpKernel { tag_end = -1; tag_single = -1; } else { - PADDLE_THROW("Unknown chunk scheme."); + PADDLE_THROW(platform::errors::InvalidArgument("Unknown chunk scheme.")); } other_chunk_type = num_chunk_types = context.Attr("num_chunk_types"); excluded_chunk_types.insert( diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index 82954bc109a74..50486ad041aa4 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -274,8 +274,8 @@ template class NotImpleKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW( - "CPU is not support for this kernel now. Will be add in the future"); + PADDLE_THROW(platform::errors::Unimplemented( + "CPU is not support for this kernel now. 
Will be add in the future")); } }; diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 8d79626aa8785..80490af33a1f9 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -111,8 +111,9 @@ class EditDistanceGPUKernel : public framework::OpKernel { if (normalized) { for (size_t i = 1; i < ref_lod.size(); ++i) { - PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], - "Reference string %d is empty.", i); + PADDLE_ENFORCE_GT(ref_lod[i], ref_lod[i - 1], + platform::errors::InvalidArgument( + "Reference string %d is empty.", i)); } } diff --git a/paddle/fluid/operators/edit_distance_op.h b/paddle/fluid/operators/edit_distance_op.h index 3e1aec7ceeec7..ef290c2eff2be 100644 --- a/paddle/fluid/operators/edit_distance_op.h +++ b/paddle/fluid/operators/edit_distance_op.h @@ -58,8 +58,9 @@ class EditDistanceKernel : public framework::OpKernel { if (normalized) { for (size_t i = 1; i < ref_lod.size(); ++i) { - PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1], - "Reference string %d is empty.", i); + PADDLE_ENFORCE_GT(ref_lod[i], ref_lod[i - 1], + platform::errors::InvalidArgument( + "Reference string %d is empty.", i)); } } auto num_strs = hyp_lod.size() - 1; @@ -106,10 +107,11 @@ class EditDistanceKernel : public framework::OpKernel { } if (normalized) { - PADDLE_ENFORCE(n > 0, - "The reference string (#%d) cannot be empty " - "when Attr(normalized) is enabled.", - n); + PADDLE_ENFORCE_GT(n, 0UL, + platform::errors::InvalidArgument( + "The reference string (#%d) cannot be empty " + "when Attr(normalized) is enabled.", + n)); distance = distance / n; } out[num] = distance; diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 870464efed2b1..25b83ed93f729 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -89,8 +89,9 @@ class ExpandAsGradOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAs"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "ExpandAs"); auto x_dims = ctx->GetInputDim("X"); auto x_grad_name = framework::GradVarName("X"); diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h index b189aa6f12274..cbaeb0c4e4256 100644 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -61,7 +61,10 @@ class ExpandAsKernel : public framework::OpKernel { switch (rank) { REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED) default: - PADDLE_THROW("Only support tensor with rank being between 1 and 6."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support tensor with rank being between 1 and 6. But received " + "tensor X's rank = %d.", + rank)); } } @@ -77,13 +80,19 @@ class ExpandAsKernel : public framework::OpKernel { auto x_dims = in0->dims(); auto y_dims = target_tensor->dims(); for (int i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_NE(x_dims[i], 0, "X(input) should not have 0 dim"); + PADDLE_ENFORCE_NE( + x_dims[i], 0UL, + platform::errors::InvalidArgument( + "X(input) should not have 0 dim. 
But received x_dims[%d] = 0.", + i)); bcast_dims[i] = y_dims[i] / x_dims[i]; bcast_dims_remainder += y_dims[i] % x_dims[i]; } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0, - "X(input) could not be broadcast together with remapped " - "shape(expand tensor's shape)"); + PADDLE_ENFORCE_EQ( + bcast_dims_remainder, 0UL, + platform::errors::InvalidArgument( + "X(input) could not be broadcast together with remapped " + "shape(expand tensor's shape)")); framework::DDim out_dims(in_dims); for (size_t i = 0; i < bcast_dims.size(); ++i) { out_dims[i] *= bcast_dims[i]; @@ -137,7 +146,10 @@ class ExpandAsGradKernel : public framework::OpKernel { switch (dims) { REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) default: - PADDLE_THROW("Only support tensor with rank being between 1 and 6."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support tensor with rank being between 1 and 6. But " + "received tensor's rank = %d.", + dims)); } } } @@ -149,12 +161,6 @@ class ExpandAsGradKernel : public framework::OpKernel { const std::vector& reduce_dims_vec) const { size_t reshape_size = reshape_dims_vec.size(); size_t reduce_size = reduce_dims_vec.size(); - PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(), - "Inconsistent size between template Dims and " - "reshape dimensions."); - PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(), - "Inconsistent size between template Dims and " - "reduce dimensions."); auto* in0 = context.Input(framework::GradVarName("Out")); auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index 488cbc6d517fc..d4f3fc5d7a622 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -27,9 +27,10 @@ static inline T NormalizeL1(T* x, size_t len) { // (This comment is from the old LinearChainCRFLayer.) // Right now, we just bet that sum won't be zero. If this really happens, we // will figure out what should be done then. - PADDLE_ENFORCE(sum, - "The unnormalized probabilities of all possible unfinished " - "sequences must be greater than 0."); + PADDLE_ENFORCE_GT( + sum, 0., platform::errors::InvalidArgument( + "The unnormalized probabilities of all possible unfinished " + "sequences must be greater than 0.")); T s = 1. / sum; for (size_t i = 0; i < len; ++i) x[i] *= s; return sum; @@ -84,13 +85,19 @@ class LinearChainCRFOpKernel : public framework::OpKernel { const Tensor* label_length = ctx.Input("Length"); length_data = label_length->data(); seq_num = label_length->numel(); - PADDLE_ENFORCE_EQ(seq_num, emission_dims[0], - "the size of Input(length) must be equal to " - "emission_dims[0]."); + PADDLE_ENFORCE_EQ( + seq_num, emission_dims[0], + platform::errors::InvalidArgument( + "the size of Input(length) must be equal to " + "emission_dims[0]. But input_size = %d, emission_dims[0] = %d.", + seq_num, emission_dims[0])); auto label_dims = label->dims(); - PADDLE_ENFORCE_EQ(seq_num, label_dims[0], - "the size of Input(length) must be equal to " - "label_dims[0]."); + PADDLE_ENFORCE_EQ( + seq_num, label_dims[0], + platform::errors::InvalidArgument( + "the size of Input(length) must be equal to " + "label_dims[0]. 
But input_size = %d, label_dims[0] = %d.", + seq_num, label_dims[0])); batch_size = emission_dims[0] * emission_dims[1]; tag_num = emission_dims[2]; @@ -102,7 +109,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel { math::set_constant(ctx.device_context(), alpha, 0.0); } else { in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence."); + PADDLE_ENFORCE_NE(in_lod.size(), 0, + platform::errors::InvalidArgument( + "Input(Label) must be a sequence.")); seq_num = in_lod[0].size() - 1; batch_size = emission_dims[0]; tag_num = emission_dims[1]; @@ -204,7 +213,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { const int64_t* lbl = label.data(); PADDLE_ENFORCE_LT( static_cast(*std::max_element(lbl, lbl + seq_length)), tag_num, - "An invalid tag label that execesses the largest tag number."); + platform::errors::InvalidArgument( + "An invalid tag label that execesses the largest tag number.")); // Calculate the nominator part, which depends on the label sequence. ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] + @@ -254,7 +264,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { {emission_dims[0] * emission_dims[1], emission_dims[2]}); } else { in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence."); + PADDLE_ENFORCE_NE(in_lod.size(), 0, + platform::errors::InvalidArgument( + "Input(Label) must be a sequence.")); seq_num = static_cast(in_lod[0].size() - 1); } From 606611d35102c670112fa792f8b24b70e56fc68a Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 9 Oct 2020 11:25:51 +0200 Subject: [PATCH 39/91] [oneDNN] GRU BF16 kernel (#27731) --- .../framework/ir/graph_pattern_detector.cc | 3 +- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 13 +- .../mkldnn/test_fusion_gru_bf16_mkldnn_op.py | 113 ++++++++++++++++++ 3 files changed, 123 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 449881a9f8feb..ed2863e8bf798 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1894,7 +1894,8 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { - std::unordered_set supported_op_types{"conv2d"}; + std::unordered_set supported_op_types = + std::unordered_set({"conv2d", "fusion_gru"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 58ecc6731f00b..e51d94e4b1e05 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -86,7 +86,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Weights for int8 kernel are of a type s8 const auto weights_dt = - is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32; + is_INT8 ? 
dnnl::memory::data_type::s8 : MKLDNNGetDataType(); // oneDNN RNN dimensions const int64_t D = 1; // Directions @@ -226,6 +226,8 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { } // TODO(grygielski) H0 is for now persistable + // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does + // not support in yet) std::shared_ptr AcquireH0Memory(const Tensor* h0) { const std::string h0_key = memory_key_ + "@h0"; auto memory_p = @@ -397,14 +399,14 @@ template class FusionGRUMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const bool is_INT8 = std::is_same::value; + const bool is_bf16 = std::is_same::value; const bool force_fp32_output = ctx.Attr("force_fp32_output"); - // TODO(grygielski) Add option for bfloat - if (!is_INT8 || force_fp32_output) { + // BF16 does not support force output + if (!is_bf16 && force_fp32_output) { RunKernel(ctx); } else { - RunKernel(ctx); + RunKernel(ctx); } } @@ -495,4 +497,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace, ops::FusionGRUMKLDNNKernel, + ops::FusionGRUMKLDNNKernel, ops::FusionGRUMKLDNNKernel); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py new file mode 100644 index 0000000000000..83b636650ab41 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py @@ -0,0 +1,113 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
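# Minimal sketch (an assumption for illustration, not part of this patch) of the
# float32 -> bfloat16 conversion that convert_float_to_uint16 is expected to
# perform for the test below: bfloat16 keeps only the upper 16 bits of the
# IEEE-754 float32 bit pattern, so bf16 data is carried in uint16 arrays.
import numpy as np

def float32_to_bf16_bits(x):
    # Reinterpret the float32 buffer as uint32 and drop the low 16 mantissa bits.
    x = np.ascontiguousarray(x, dtype=np.float32)
    return np.right_shift(x.view(np.uint32), 16).astype(np.uint16)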
+ +from __future__ import print_function + +import unittest +import numpy as np +import struct +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru +from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestFusionGRUBF16MKLDNNOp(OpTest): + def set_confs(self): + self.mkldnn_data_type = False + + def setUp(self): + self.op_type = "fusion_gru" + self.lod = [[2, 4, 3]] + self.M = 3 + self.D = 5 + self.is_reverse = False + self.with_h0 = False + self.use_mkldnn = True + self._cpu_only = True + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' + self.origin_mode = False + self.use_mkldnn = True + self.force_fp32_output = False + self.set_confs() + + T = sum(self.lod[0]) + N = len(self.lod[0]) + + # fp32 X input for reference implementation and + # corressponding bf16 data as input to GRU oneDNN bf16 kernel + x_fp32 = np.random.rand(T, self.M).astype('float32') + x_bf16 = convert_float_to_uint16(x_fp32) + + wx_fp32 = np.random.rand(self.M, 3 * self.D).astype('float32') + wh_fp32 = np.random.rand(self.D, 3 * self.D).astype('float32') + + # bias is fp32 despite other inputs being in bf16 + bias = np.random.rand( + 1, 3 * self.D).astype('float32') if self.with_bias else np.zeros( + (1, 3 * self.D), dtype='float32') + + h0_fp32 = np.random.rand( + N, self.D).astype('float32') if self.with_h0 else np.zeros( + (N, self.D), dtype='float32') + + _, _, _, hidden = fusion_gru( + x_fp32, self.lod, h0_fp32, wx_fp32, wh_fp32, bias, self.is_reverse, + self.origin_mode, ACTIVATION[self.act_state], + ACTIVATION[self.act_gate]) + + hidden_bf16 = convert_float_to_uint16(hidden) + + self.inputs = { + 'X': (x_bf16, self.lod), + 'WeightX': wx_fp32, + 'WeightH': wh_fp32 + } + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0_bf16 + + h0_bf16 = convert_float_to_uint16(h0_fp32) + self.outputs = {'Hidden': (hidden_bf16, self.lod)} + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse, + 'origin_mode': self.origin_mode, + 'force_fp32_output': self.force_fp32_output, + 'use_mkldnn': self.use_mkldnn + } + + +class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUBF16MKLDNNOp): + def set_confs(self): + self.origin_mode = False + + +class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp): + def set_confs(self): + self.with_bias = False + + +if __name__ == "__main__": + unittest.main() From 057e28bc8f56ace065475a8b1a032fe0820ee912 Mon Sep 17 00:00:00 2001 From: smallv0221 <33639025+smallv0221@users.noreply.github.com> Date: Fri, 9 Oct 2020 17:32:07 +0800 Subject: [PATCH 40/91] API(lstm_unit, lstmp, sequence_mask, sequence_enumerate, sequence_conv) error message enhancement (#27572) * API(Compute) error message enhancement on line 44, 50, 53. * lstm_unit error message enhancement. lstmp error message enhancement. sequence_conv error message enhencement. sequence_enumerate error message enhencement. sequence_mask error message enhencement. * Update lstm_unit_op.cc * Update lstm_unit_op.h * error msg enhancement. 
* Update sequence_conv_op.cc * Update lstm_unit_op.cc * Update sequence_conv_op.cc * Update sequence_enumerate_op.cc * Update sequence_enumerate_op.cu * Update sequence_enumerate_op.h * Update sequence_pool_op.h * error message enhencement. * error message enhancement. --- paddle/fluid/operators/lstm_unit_op.cc | 36 +++++++++++-------- paddle/fluid/operators/lstm_unit_op.cu | 10 +++--- paddle/fluid/operators/lstm_unit_op.h | 10 +++--- paddle/fluid/operators/lstmp_op.h | 6 ++-- .../sequence_ops/sequence_conv_op.cc | 18 +++++----- .../sequence_ops/sequence_enumerate_op.cc | 7 ++-- .../sequence_ops/sequence_enumerate_op.cu | 5 ++- .../sequence_ops/sequence_enumerate_op.h | 24 +++++++++---- .../sequence_ops/sequence_mask_op.cc | 6 ++-- .../operators/sequence_ops/sequence_pool_op.h | 14 ++++++-- 10 files changed, 89 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc index c325c0892ed81..917482589fcf3 100644 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -23,23 +23,31 @@ class LstmUnitOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasInput("C_prev"), - "Input(C_prev) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("C"), - "Output(C) of LSTM should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("H"), - "Output(H) of LSTM should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lstm_unit"); + OP_INOUT_CHECK(ctx->HasInput("C_prev"), "Input", "C_prev", "lstm_unit"); + OP_INOUT_CHECK(ctx->HasOutput("C"), "Output", "C", "lstm_unit"); + OP_INOUT_CHECK(ctx->HasOutput("H"), "Output", "H", "lstm_unit"); auto x_dims = ctx->GetInputDim("X"); auto c_prev_dims = ctx->GetInputDim("C_prev"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "Input(X)'s rank must be 2. 
Received %d instead.", x_dims.size())); if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0], - "Batch size of inputs and states must be equal"); + platform::errors::InvalidArgument( + "Batch size of inputs and states must be equal, " + "but received %d (inputs)" + "vs %d (states).", + x_dims[0], c_prev_dims[0])); PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4, - "Dimension of FC should equal to prev state * 4"); + platform::errors::InvalidArgument( + "Dimension of FC should equal to prev state * 4, " + "but received %d (dimension of FC)" + "vs %d (prev state * 4).", + x_dims[1], c_prev_dims[1] * 4)); } int b_size = c_prev_dims[0]; // batch size @@ -85,10 +93,10 @@ class LstmUnitGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")), - "Input(C@GRAD) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")), - "Input(H@GRAD) should not be null"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("C")), "Input", + framework::GradVarName("C"), "lstm_unit"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("H")), "Input", + framework::GradVarName("H"), "lstm_unit"); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("C_prev"), ctx->GetInputDim("C_prev")); diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu index 810b83cb535fe..3949a066e0868 100644 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -93,8 +93,9 @@ template class LstmUnitOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); auto* x_tensor = ctx.Input("X"); auto* c_prev_tensor = ctx.Input("C_prev"); @@ -124,8 +125,9 @@ template class LstmUnitGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); auto x_tensor = ctx.Input("X"); auto c_prev_tensor = ctx.Input("C_prev"); diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h index 3fe7bda39b68d..99ae654d7ef0c 100644 --- a/paddle/fluid/operators/lstm_unit_op.h +++ b/paddle/fluid/operators/lstm_unit_op.h @@ -39,8 +39,9 @@ template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); auto* x_tensor = ctx.Input("X"); auto* c_prev_tensor = ctx.Input("C_prev"); @@ -82,8 +83,9 @@ template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); 
+ PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); auto x_tensor = ctx.Input("X"); auto c_prev_tensor = ctx.Input("C_prev"); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index f0a727f34fec7..a2d1d5295be82 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -91,7 +91,8 @@ class LSTMPKernel : public framework::OpKernel { else if (act_type == math::detail::ActivationType::kReLU) ReluFunctor()(d, x, y); else - PADDLE_THROW("unsupported activation type"); + PADDLE_THROW( + platform::errors::InvalidArgument("unsupported activation type")); } void Compute(const framework::ExecutionContext& ctx) const override { @@ -263,7 +264,8 @@ class LSTMPGradKernel : public framework::OpKernel { else if (act_type == math::detail::ActivationType::kReLU) ReluGradFunctor()(d, x, y, dy, dx); else - PADDLE_THROW("unsupported activation type"); + PADDLE_THROW( + platform::errors::InvalidArgument("unsupported activation type")); } void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index 99e8064d2446f..5f976685c982b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -59,20 +59,22 @@ class SequenceConvOp : public framework::OperatorWithKernel { filter_dims[0], context_length * in_dims[1])); if (ctx->Attrs().Get("paddingTrainable")) { - PADDLE_ENFORCE( - ctx->HasInput("PaddingData"), - "Input(PaddingData) of SequenceConvOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("PaddingData"), "Input", "PaddingData", + "sequence_conv"); framework::DDim padding_dim = ctx->GetInputDim("PaddingData"); int up_pad = std::max(0, -context_start); int down_pad = std::max(0, context_start + context_length - 1); int total_pad = up_pad + down_pad; int input_width = static_cast(in_dims[1]); + bool start_equals_zero = context_start == 0; + bool length_equals_one = context_length == 1; + bool start_length = start_equals_zero && length_equals_one; - if (context_start == 0 && context_length == 1) { - PADDLE_THROW( - "If context_start is 0 and context_length is 1, paddingTrainable " - "should be false."); - } + PADDLE_ENFORCE_EQ( + start_length, false, + platform::errors::InvalidArgument( + "If context_start is 0 and context_length is 1, paddingTrainable " + "should be false.")); PADDLE_ENFORCE_EQ( padding_dim.size(), 2, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index 1dbddfa709d72..758ff01b1e7ec 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -43,8 +43,11 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { "Output LoDTensor of SequenceEnumerate operator."); AddAttr("win_size", "(int) The enumerate sequence window size.") .AddCustomChecker([](const int& win_size) { - PADDLE_ENFORCE(win_size >= 2, - "The window size should be not less than 2."); + PADDLE_ENFORCE_GE(win_size, 2, + platform::errors::InvalidArgument( + "The window size should be not less than 2." 
+ "Received window size is %d", + win_size)); }); AddAttr("pad_value", "(int) The enumerate sequence padding value.") .SetDefault(0); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index d5deb7582c7c0..6d8f60ce932ab 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -58,7 +58,10 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( static_cast(in_dims[0]), in_lod[0].back(), - "The actual input data's size mismatched with LoD information."); + platform::errors::InvalidArgument( + "The actual input data's size mismatched with LoD information." + "Received input data size is %d (actual) vs %d (loD information).", + static_cast(in_dims[0]), in_lod[0].back())); /* Generate enumerate sequence set */ auto stream = context.cuda_device_context().stream(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h index 4807521bc0d92..d104d33caebb3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h @@ -29,21 +29,31 @@ class SequenceEnumerateKernel : public framework::OpKernel { int win_size = context.Attr("win_size"); auto pad_value = static_cast(context.Attr("pad_value")); - PADDLE_ENFORCE_EQ(in->lod().empty(), false, - "Input(X) Tensor of SequenceEnumerateOp does not contain " - "LoD information."); + PADDLE_ENFORCE_EQ( + in->lod().empty(), false, + platform::errors::InvalidArgument( + "Input(X) Tensor of SequenceEnumerateOp does not contain " + "LoD information.")); auto in_dims = in->dims(); auto lod0 = in->lod()[0]; PADDLE_ENFORCE_EQ( static_cast(in_dims[0]), lod0.back(), - "The actual input data's size mismatched with LoD information."); + platform::errors::InvalidArgument( + "The actual input data's size mismatched with LoD information." + "Received input data size is %d (actual) vs %d (loD information).", + static_cast(in_dims[0]), lod0.back())); PADDLE_ENFORCE_EQ( in_dims.size(), 2UL, - "Input(X) of SequenceEnumerate operator's rank should be 2."); + platform::errors::InvalidArgument( + "Input(X) of SequenceEnumerate operator's rank should be 2." + "Received %d instead.", + in_dims.size())); PADDLE_ENFORCE_EQ(in_dims[1], 1, - "Input(X) of SequenceEnumerate operator's 2nd " - "dimension should be 1."); + platform::errors::InvalidArgument( + "Input(X) of SequenceEnumerate operator's 2nd " + "dimension should be 1. 
Received %d instead.", + in_dims[1])); // Generate enumerate sequence set auto in_data = in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index b8912dd4c7960..b06b1f755a22b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -69,8 +69,10 @@ class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker { "= max(Input(X)).") .SetDefault(-1) .AddCustomChecker([](const int& v) { - PADDLE_ENFORCE(v < 0 || v >= 1, - "Attr(maxlen) must be less than 0 or larger than 1"); + PADDLE_ENFORCE_EQ( + v < 0 || v >= 1, true, + platform::errors::InvalidArgument( + "Attr(maxlen) must be less than 0 or larger than 1")); }); AddAttr("out_dtype", "Output data type"); AddComment(R"DOC( diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index 8fe68deca66aa..37f9caf76ceba 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -42,14 +42,22 @@ class SequencePoolKernel : public framework::OpKernel { "Input(X) Tensor of SequencePoolOp " "does not contain LoD information.")); PADDLE_ENFORCE_LE(lod_level, 2UL, - "The lod level of input shall be no more than 2."); + platform::errors::InvalidArgument( + "The lod level of input shall be no more than 2." + "Received lod level is %d.", + lod_level)); PADDLE_ENFORCE_GE( dims[0], /*batch size = */ static_cast(lod[lod_level - 1].size() - 1), - "The first dimension of Input(X) must be large than batch size."); + platform::errors::InvalidArgument( + "The first dimension of Input(X) must be large than batch size." + "But received first dimension of Input(X) is %d, while batch" + "size is %d.", + dims[0], static_cast(lod[lod_level - 1].size() - 1))); if (lod_level > 1UL) { PADDLE_ENFORCE_EQ(lod[0][lod[0].size() - 1], lod[1].size() - 1, - "The input lod information is illegal."); + platform::errors::InvalidArgument( + "The input lod information is illegal.")); framework::LoD out_lod; out_lod.push_back(lod[0]); out->set_lod(out_lod); From 395cb561aaa61be7edd5a17a5d95e5ae02c21d51 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Fri, 9 Oct 2020 17:33:00 +0800 Subject: [PATCH 41/91] refine logsumexp error message and docs (#27713) --- .../operators/reduce_ops/logsumexp_op.cc | 28 +++++++++---------- python/paddle/tensor/math.py | 4 +-- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 7cd164bfd3a3d..9d2639c10301d 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -32,7 +32,7 @@ class LogsumexpOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE(x_rank, 4, platform::errors::InvalidArgument( "The input tensor X's dimensions of logsumexp " - "should be less equal than 4. But received X's " + "should be less or equal than 4. But received X's " "dimensions = %d, X's shape = [%s].", x_rank, x_dims)); auto axis = ctx->Attrs().Get>("axis"); @@ -45,20 +45,18 @@ class LogsumexpOp : public framework::OperatorWithKernel { axis.size())); for (size_t i = 0; i < axis.size(); i++) { - PADDLE_ENFORCE_LT( - axis[i], x_rank, - platform::errors::InvalidArgument( - "axis[%d] should be in the " - "range [-dimension(X), dimension(X)] " - "where dimesion(X) is %d. 
But received axis[i] = %d.", - i, x_rank, axis[i])); - PADDLE_ENFORCE_GE( - axis[i], -x_rank, - platform::errors::InvalidArgument( - "axis[%d] should be in the " - "range [-dimension(X), dimension(X)] " - "where dimesion(X) is %d. But received axis[i] = %d.", - i, x_rank, axis[i])); + PADDLE_ENFORCE_LT(axis[i], x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-D, D), where D is the dimensions of X and " + "D is %d. But received axis[%d] = %d.", + i, x_rank, i, axis[i])); + PADDLE_ENFORCE_GE(axis[i], -x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-D, D), where D is the dimensions of X and " + "D is %d. But received axis[%d] = %d.", + i, x_rank, i, axis[i])); if (axis[i] < 0) { axis[i] += x_rank; } diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 51dc771281393..34d15ed0bab61 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -999,7 +999,7 @@ def logsumexp(x, axis=None, keepdim=False, name=None): This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` . .. math:: - logsumexp(x) = \log\sum exp(x) + logsumexp(x) = \\log\\sum exp(x) Args: x (Tensor): The input Tensor with data type float32, float64. @@ -1030,8 +1030,6 @@ def logsumexp(x, axis=None, keepdim=False, name=None): import paddle - paddle.disable_static() - x = paddle.to_tensor([[-1.5, 0., 2.], [3., 1.2, -2.4]]) out1 = paddle.logsumexp(x) # [3.4691226] out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602] From 0a7bab4e34610aeed59ab62108a381237194f42a Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Fri, 9 Oct 2020 18:03:25 +0800 Subject: [PATCH 42/91] fix error mesage for negative_positive_pair_op and nce_op (#27779) --- paddle/fluid/operators/nce_op.h | 77 +++++++++++-------- .../operators/positive_negative_pair_op.cc | 16 ++-- 2 files changed, 52 insertions(+), 41 deletions(-) diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 1c75424fae7ef..8748078109f16 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -104,25 +104,29 @@ class NCEKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_probs->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistProbs) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " - "= %d.", - dist_probs->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in Input(CustomDistProbs) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " + "= %d.", + dist_probs->numel(), num_total_classes)); PADDLE_ENFORCE_EQ( dist_alias->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistAlias) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " - "= %d.", - dist_alias->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in Input(CustomDistAlias) " + "should be equal to the number of total classes. 
But Received: " + "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " + "= %d.", + dist_alias->numel(), num_total_classes)); PADDLE_ENFORCE_EQ( dist_alias_probs->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistAliasProbs) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistAliasProbs).numel() = %d, " - "Attr(num_total_classes) = %d.", - dist_alias_probs->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in " + "Input(CustomDistAliasProbs) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistAliasProbs).numel() = %d, " + "Attr(num_total_classes) = %d.", + dist_alias_probs->numel(), num_total_classes)); const float *probs_data = dist_probs->data(); const int *alias_data = dist_alias->data(); @@ -140,10 +144,11 @@ class NCEKernel : public framework::OpKernel { for (int x = 0; x < sample_labels->numel(); x++) { PADDLE_ENFORCE_GE(sample_labels_data[x], 0, - "ValueError: Every sample label should be " - "non-negative. But received: " - "Input(SampleLabels)[%d] = %d", - x, sample_labels_data[x]); + platform::errors::InvalidArgument( + "ValueError: Every sample label should be " + "non-negative. But received: " + "Input(SampleLabels)[%d] = %d", + x, sample_labels_data[x])); } auto sample_out = context.Output("SampleLogits"); @@ -311,25 +316,29 @@ class NCEGradKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( dist_probs->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistProbs) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " - "= %d.", - dist_probs->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in Input(CustomDistProbs) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistProbs).numel() = %d, Attr(num_total_classes) " + "= %d.", + dist_probs->numel(), num_total_classes)); PADDLE_ENFORCE_EQ( dist_alias->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistAlias) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " - "= %d.", - dist_alias->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in Input(CustomDistAlias) " + "should be equal to the number of total classes. But Received: " + "Input(CustomDistAlias).numel() = %d, Attr(num_total_classes) " + "= %d.", + dist_alias->numel(), num_total_classes)); PADDLE_ENFORCE_EQ( dist_alias_probs->numel(), num_total_classes, - "ShapeError: The number of elements in Input(CustomDistAliasProbs) " - "should be equal to the number of total classes. But Received: " - "Input(CustomDistAliasProbs).numel() = %d, " - "Attr(num_total_classes) = %d.", - dist_alias_probs->numel(), num_total_classes); + platform::errors::InvalidArgument( + "ShapeError: The number of elements in " + "Input(CustomDistAliasProbs) " + "should be equal to the number of total classes. 
But Received: " + "Input(CustomDistAliasProbs).numel() = %d, " + "Attr(num_total_classes) = %d.", + dist_alias_probs->numel(), num_total_classes)); const float *probs_data = dist_probs->data(); const int *alias_data = dist_alias->data(); diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index e42c4666e110f..75d1b36c7d6a8 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -37,13 +37,15 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { if (ctx->HasInput("AccumulatePositivePair") || ctx->HasInput("AccumulateNegativePair") || ctx->HasInput("AccumulateNeutralPair")) { - PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") && - ctx->HasInput("AccumulateNegativePair") && - ctx->HasInput("AccumulateNeutralPair"), - "All optional inputs(AccumulatePositivePair, " - "AccumulateNegativePair, AccumulateNeutralPair) of " - "PositiveNegativePairOp are required if one of them is " - "specified."); + PADDLE_ENFORCE_EQ( + ctx->HasInput("AccumulatePositivePair") && + ctx->HasInput("AccumulateNegativePair") && + ctx->HasInput("AccumulateNeutralPair"), + true, platform::errors::InvalidArgument( + "All optional inputs(AccumulatePositivePair, " + "AccumulateNegativePair, AccumulateNeutralPair) of " + "PositiveNegativePairOp are required if one of them " + "is specified.")); PADDLE_ENFORCE_EQ( ctx->GetInputDim("AccumulatePositivePair"), scalar_dim, platform::errors::InvalidArgument( From 061240b34af4579244d329445d360637c0992684 Mon Sep 17 00:00:00 2001 From: liu zhengxi <380185688@qq.com> Date: Fri, 9 Oct 2020 19:32:15 +0800 Subject: [PATCH 43/91] remove beam_search and beam_search_decode api in paddle.nn (#27660) --- python/paddle/nn/__init__.py | 4 ++-- python/paddle/nn/decode.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 82fec5c0faa2e..427a2f91c963f 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -44,8 +44,8 @@ # from .control_flow import rnn #DEFINE_ALIAS # from .decode import BeamSearchDecoder #DEFINE_ALIAS # from .decode import Decoder #DEFINE_ALIAS -from .decode import beam_search #DEFINE_ALIAS -from .decode import beam_search_decode #DEFINE_ALIAS +# from .decode import beam_search #DEFINE_ALIAS +# from .decode import beam_search_decode #DEFINE_ALIAS # from .decode import crf_decoding #DEFINE_ALIAS # from .decode import ctc_greedy_decoder #DEFINE_ALIAS # from .decode import dynamic_decode #DEFINE_ALIAS diff --git a/python/paddle/nn/decode.py b/python/paddle/nn/decode.py index f01a5ed15b650..214744217e957 100644 --- a/python/paddle/nn/decode.py +++ b/python/paddle/nn/decode.py @@ -13,16 +13,16 @@ # limitations under the License. 
# TODO: define api to implement decoding algorithm -from ..fluid.layers import beam_search #DEFINE_ALIAS -from ..fluid.layers import beam_search_decode #DEFINE_ALIAS +# from ..fluid.layers import beam_search #DEFINE_ALIAS +# from ..fluid.layers import beam_search_decode #DEFINE_ALIAS from ..fluid.layers import gather_tree #DEFINE_ALIAS __all__ = [ # 'BeamSearchDecoder', # 'Decoder', - 'beam_search', - 'beam_search_decode', + # 'beam_search', + # 'beam_search_decode', # 'crf_decoding', # 'ctc_greedy_decoder', # 'dynamic_decode', From 631c1f3018261c2e52aca97b716d0333afeb858f Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 9 Oct 2020 13:54:24 +0200 Subject: [PATCH 44/91] - Fix to 27398 (#27770) test=develop - compilation fix test=develop --- paddle/fluid/framework/data_transform.cc | 3 +++ paddle/fluid/framework/tensor_util.cc | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 3a40de6988f29..70693a5df2609 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -117,6 +117,9 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, auto *tran_lod_tensor = out_var->GetMutable(); tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_layout(in_lod_tensor.layout()); +#ifdef PADDLE_WITH_MKLDNN + tran_lod_tensor->set_format(in_lod_tensor.format()); +#endif tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { auto &in_selected_rows = in_var.Get(); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index a073dbd733f0b..4fe01aff79e52 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -38,6 +38,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, dst->Resize(src.dims()); dst->set_layout(src.layout()); +#ifdef PADDLE_WITH_MKLDNN + dst->set_format(src.format()); +#endif auto src_place = src.place(); auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(dst_place, src.type()); @@ -237,6 +240,9 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, src.check_memory_size(); dst->Resize(src.dims()); dst->set_layout(src.layout()); +#ifdef PADDLE_WITH_MKLDNN + dst->set_format(src.format()); +#endif auto src_place = src.place(); auto src_ptr = src.data(); auto dst_ptr = dst->mutable_data(dst_place, src.type()); From 836ee3b0c86dfcd609ab3e12a13554b9291cfc3c Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Sat, 10 Oct 2020 08:57:52 +0800 Subject: [PATCH 45/91] Polish the english documentation and example of paddle.nn.Linear and paddle.nn.functional.linear (#27759) --- python/paddle/fluid/dygraph/nn.py | 3 - python/paddle/nn/functional/common.py | 59 +++++++++-------- python/paddle/nn/layer/common.py | 91 ++++++++++++++++----------- 3 files changed, 87 insertions(+), 66 deletions(-) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 05269028acc40..1a488844dec21 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -895,9 +895,6 @@ def forward(self, input): class Linear(layers.Layer): """ - :alias_main: paddle.nn.Linear - :alias: paddle.nn.Linear,paddle.nn.layer.Linear,paddle.nn.layer.common.Linear - :old_api: paddle.fluid.dygraph.Linear Fully-connected linear transformation layer: diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 7d2ed0cdcf83a..81c38c0be6557 100644 --- 
a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1406,46 +1406,53 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): def linear(x, weight, bias=None, name=None): """ - Fully-connected linear transformation op + Fully-connected linear transformation operator. For each input :math:`X` , + the equation is: .. math:: - Out = {XW + b} + Out = XW + b - where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively. + where :math:`W` is the weight and :math:`b` is the bias. - The linear op multiplies input tensor with weight matrix and - produces an output Tensor of shape [N, *, output_dim], - where N is batch size and `*` means any number of additional dimensions and output_dim is the last dim of ``weight``. - If ``bias`` is not None, a bias will be added to the output. + If the weight is a 2-D tensor of shape :math:`[in\_features, out\_features]` , + input should be a multi-dimensional tensor of shape + :math:`[batch\_size, *, in\_features]` , where :math:`*` means any number of + additional dimensions. The linear operator multiplies input tensor with + weight and produces an output tensor of shape :math:`[batch\_size, *, out\_features]` , + If :math:`bias` is not None, the bias should be a 1-D tensor of shape + :math:`[out\_features]` and will be added to the output. - Args: - x(Tensor): Input tensor, its data type is float16, float32 or float64 - weight(Tensor): Weight tensor, its data type is float16, float32 or float64 - bias(Tensor|None, optional): Bias tensor, its data type is float16, float32 or float64. If it is set to None, no bias will be added to the output units. - name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Default: None. + Parameters: + x (Tensor): Input tensor. The data type should be float16, float32 or float64. + weight (Tensor): Weight tensor. The data type should be float16, float32 or float64. + bias (Tensor, optional): Bias tensor. The data type should be float16, float32 or float64. + If it is set to None, no bias will be added to the output units. + name (str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . Returns: - Output tensor + Tensor, the shape is :math:`[batch\_size, *, out\_features]` and the + data type is the same with input :math:`x` . Examples: .. code-block:: python - import numpy as np import paddle - import paddle.nn.functional as F - input = np.ones((3,1,2), dtype=np.float32) - weight = np.ones((2,2), dtype=np.float32) - bias = np.ones((2), dtype=np.float32) - place = paddle.CPUPlace() - paddle.disable_static(place) - input = paddle.to_tensor(input) - weight = paddle.to_tensor(weight) - bias = paddle.to_tensor(bias) - out = F.linear(input, weight, bias) - print(out) #[3 3 3 3 3 3] - + x = paddle.randn((3, 2), dtype="float32") + # x: [[-0.32342386 -1.200079 ] + # [ 0.7979031 -0.90978354] + # [ 0.40597573 1.8095392 ]] + weight = paddle.full(shape=[2, 4], fill_value="0.5", dtype="float32", name="weight") + # weight: [[0.5 0.5 0.5 0.5] + # [0.5 0.5 0.5 0.5]] + bias = paddle.ones(shape=[4], dtype="float32", name="bias") + # bias: [1. 1. 1. 1.] 
+ y = paddle.nn.functional.linear(x, weight, bias) + # y: [[0.23824859 0.23824859 0.23824859 0.23824859] + # [0.9440598 0.9440598 0.9440598 0.9440598 ] + # [2.1077576 2.1077576 2.1077576 2.1077576 ]] """ if in_dygraph_mode(): pre_bias = _varbase_creator(dtype=x.dtype) diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 433443fee1765..05cbd96863c28 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -15,7 +15,6 @@ # TODO: define the common classes to build a neural network from ...fluid.dygraph import BilinearTensorProduct #DEFINE_ALIAS from ...fluid.dygraph import Pool2D #DEFINE_ALIAS -from ...fluid.dygraph import Linear #DEFINE_ALIAS from ...fluid.dygraph import Flatten #DEFINE_ALIAS from ...fluid.dygraph import layers from .. import functional as F @@ -50,56 +49,74 @@ class Linear(layers.Layer): """ - - Fully-connected linear transformation layer: + + Fully-connected linear transformation layer. For each input :math:`X` , + the equation is: .. math:: - Out = {XW + b} + Out = XW + b - where :math:`X` is the input Tensor, :math:`W` and :math:`b` are weight and bias respectively. + where :math:`W` is the weight and :math:`b` is the bias. - Linear layer takes only one ``Tensor`` input. - The Linear layer multiplies input tensor with weight matrix and - produces an output Tensor of shape [N, *, `output_dim`], - where N is batch size and `*` means any number of additional dimensions. - If ``bias_attr`` is not None, a bias variable will be created and added to the output. + Linear layer takes only one multi-dimensional tensor as input with the + shape :math:`[batch\_size, *, in\_features]` , where :math:`*` means any + number of additional dimensions. It multiplies input tensor with the weight + (a 2-D tensor of shape :math:`[in\_features, out\_features]` ) and produces + an output tensor of shape :math:`[batch\_size, *, out\_features]` . + If :math:`bias\_attr` is not False, the bias (a 1-D tensor of + shape :math:`[out\_features]` ) will be created and added to the output. Parameters: - in_features(int): The number of input units in this layer. - out_features(int): The number of output units in this layer. - weight_attr(ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable - weights(Parameter) of this layer. Default: None. - bias_attr(ParamAttr or list of ParamAttr, optional): The attribute for the bias - of this layer. If it is set to False, no bias will be added to the output units. - If it is set to None, the bias is initialized zero. Default: None. - name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. Default: None. - - Attributes: - **weight** (Parameter): the learnable weights of this layer. + in_features (int): The number of input units. + out_features (int): The number of output units. + weight_attr (ParamAttr, optional): The attribute for the learnable + weight of this layer. The default value is None and the weight will be + initialized to zero. For detailed information, please refer to + paddle.ParamAttr. + bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias + of this layer. If it is set to False, no bias will be added to the output. + If it is set to None or one kind of ParamAttr, a bias parameter will + be created according to ParamAttr. For detailed information, please refer + to paddle.ParamAttr. The default value is None and the bias will be + initialized to zero. 
+ name (str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . - **bias** (Parameter or None): the learnable bias of this layer. + Attribute: + **weight** (Parameter): the learnable weight of this layer. - Returns: - None + **bias** (Parameter): the learnable bias of this layer. + + Shape: + - input: Multi-dimentional tensor with shape :math:`[batch\_size, *, in\_features]` . + - output: Multi-dimentional tensor with shape :math:`[batch\_size, *, out\_features]` . Examples: .. code-block:: python import paddle - from paddle import nn - import numpy as np - - data = np.ones((3,1,2), np.float32) - place = paddle.CPUPlace() - paddle.disable_static(place) - data = paddle.to_tensor(data) - weight_attr=paddle.framework.ParamAttr(name="linear_weight", learning_rate=1.0, - trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0)) - bias_attr=paddle.framework.ParamAttr(name="linear_bias", learning_rate=1.0, - trainable=False, regularizer=None, initializer=paddle.fluid.initializer.ConstantInitializer(value=1.0)) - linear = nn.Linear(2,2,weight_attr=weight_attr, bias_attr=bias_attr) - res = linear(data) # [3 3 3 3 3 3] + + # Define the linear layer. + weight_attr = paddle.ParamAttr( + name="weight", + initializer=paddle.nn.initializer.Constant(value=0.5)) + bias_attr = paddle.ParamAttr( + name="bias", + initializer=paddle.nn.initializer.Constant(value=1.0)) + linear = paddle.nn.Linear(2, 4, weight_attr=weight_attr, bias_attr=bias_attr) + # linear.weight: [[0.5 0.5 0.5 0.5] + # [0.5 0.5 0.5 0.5]] + # linear.bias: [1. 1. 1. 1.] + + x = paddle.randn((3, 2), dtype="float32") + # x: [[-0.32342386 -1.200079 ] + # [ 0.7979031 -0.90978354] + # [ 0.40597573 1.8095392 ]] + y = linear(x) + # y: [[0.23824859 0.23824859 0.23824859 0.23824859] + # [0.9440598 0.9440598 0.9440598 0.9440598 ] + # [2.1077576 2.1077576 2.1077576 2.1077576 ]] """ def __init__(self, From c52b6bbfa9b1f9c99a425c53b711cacc71e23243 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Sat, 10 Oct 2020 09:55:20 +0800 Subject: [PATCH 46/91] add alias for RNN APIs from paddle.nn.layer.rnn explicitly (#27784) --- python/paddle/nn/__init__.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 427a2f91c963f..b1acea2ba5f8f 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -148,10 +148,17 @@ from .layer.norm import BatchNorm1d #DEFINE_ALIAS from .layer.norm import BatchNorm2d #DEFINE_ALIAS from .layer.norm import BatchNorm3d #DEFINE_ALIAS -from .layer.rnn import * -# from .layer.rnn import RNNCell #DEFINE_ALIAS -# from .layer.rnn import GRUCell #DEFINE_ALIAS -# from .layer.rnn import LSTMCell #DEFINE_ALIAS + +from .layer.rnn import RNNCellBase #DEFINE_ALIAS +from .layer.rnn import SimpleRNNCell #DEFINE_ALIAS +from .layer.rnn import LSTMCell #DEFINE_ALIAS +from .layer.rnn import GRUCell #DEFINE_ALIAS +from .layer.rnn import RNN #DEFINE_ALIAS +from .layer.rnn import BiRNN #DEFINE_ALIAS +from .layer.rnn import SimpleRNN #DEFINE_ALIAS +from .layer.rnn import LSTM #DEFINE_ALIAS +from .layer.rnn import GRU #DEFINE_ALIAS + from .layer.transformer import MultiHeadAttention from .layer.transformer import TransformerEncoderLayer from .layer.transformer import TransformerEncoder From b53970ee36a365df859e461300d6c3cb3e1b4125 Mon Sep 17 00:00:00 2001 From: LutaoChu <30695251+LutaoChu@users.noreply.github.com> 
Date: Sat, 10 Oct 2020 10:16:09 +0800 Subject: [PATCH 47/91] Fix cross, cumsum op docs test=document_fix Fix cross, cumsum op docs --- python/paddle/tensor/linalg.py | 18 +++++++----------- python/paddle/tensor/math.py | 20 +++++++++----------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 2dcdf1603a737..2745464995f5d 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -687,27 +687,24 @@ def t(input, name=None): def cross(x, y, axis=None, name=None): """ - :alias_main: paddle.cross - :alias: paddle.cross,paddle.tensor.cross,paddle.tensor.linalg.cross - Computes the cross product between two tensors along an axis. + Inputs must have the same shape, and the length of their axes should be equal to 3. If `axis` is not given, it defaults to the first axis found with the length 3. Args: - x (Variable): The first input tensor variable. - y (Variable): The second input tensor variable. + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. axis (int, optional): The axis along which to compute the cross product. It defaults to the first axis found with the length 3. - name (str, optional): The default value is None. Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: A Tensor with same data type as `x`. + Tensor. A Tensor with same data type as `x`. Examples: .. code-block:: python + import paddle - paddle.disable_static() x = paddle.to_tensor([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], @@ -715,14 +712,13 @@ def cross(x, y, axis=None, name=None): y = paddle.to_tensor([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) + z1 = paddle.cross(x, y) - print(z1.numpy()) # [[-1. -1. -1.] # [ 2. 2. 2.] # [-1. -1. -1.]] z2 = paddle.cross(x, y, axis=1) - print(z2.numpy()) # [[0. 0. 0.] # [0. 0. 0.] # [0. 0. 0.]] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 34d15ed0bab61..2a370422eed7e 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1622,39 +1622,37 @@ def kron(x, y, name=None): def cumsum(x, axis=None, dtype=None, name=None): """ - The cumulative sum of the elements along a given axis. The first element of the result is the same of the first element of the input. + The cumulative sum of the elements along a given axis. + + **Note**: + The first element of the result is the same of the first element of the input. Args: - x (Tensor): Input of cumsum operator, the Tensor needed to be cumsumed. + x (Tensor): The input tensor needed to be cumsumed. axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array. dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the result of cumsum operator, output of cumsum operator. + Tensor, the result of cumsum operator. Examples: .. 
code-block:: python import paddle - import numpy as np - - paddle.disable_static() - data_np = np.arange(12).reshape(3, 4) - data = paddle.to_tensor(data_np) + + data = paddle.arange(12) + data = paddle.reshape(data, (3, 4)) y = paddle.cumsum(data) - print(y.numpy()) # [ 0 1 3 6 10 15 21 28 36 45 55 66] y = paddle.cumsum(data, axis=0) - print(y.numpy()) # [[ 0 1 2 3] # [ 4 6 8 10] # [12 15 18 21]] y = paddle.cumsum(data, axis=-1) - print(y.numpy()) # [[ 0 1 3 6] # [ 4 9 15 22] # [ 8 17 27 38]] From d3b98f0d84a38b2bfd541a6967f61634e63a918d Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Sat, 10 Oct 2020 10:57:34 +0800 Subject: [PATCH 48/91] Fix dynamic parallel train mode for hapi (#27787) * fix dynamic parallel for hapi * fix code style --- python/paddle/hapi/model.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 8505544a71f58..459d6cd3284e9 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -638,19 +638,14 @@ def train_batch(self, inputs, labels=None): if self._nranks > 1: outputs = self.ddp_model.forward(* [to_variable(x) for x in inputs]) - losses = self.model._loss(*(to_list(outputs) + labels)) - losses = to_list(losses) - final_loss = fluid.layers.sum(losses) - final_loss = self.ddp_model.scale_loss(final_loss) - final_loss.backward() - self.ddp_model.apply_collective_grads() else: outputs = self.model.network.forward( * [to_variable(x) for x in inputs]) - losses = self.model._loss(*(to_list(outputs) + labels)) - losses = to_list(losses) - final_loss = fluid.layers.sum(losses) - final_loss.backward() + + losses = self.model._loss(*(to_list(outputs) + labels)) + losses = to_list(losses) + final_loss = fluid.layers.sum(losses) + final_loss.backward() self.model._optimizer.minimize(final_loss) self.model.network.clear_gradients() From c4b1faa469a4cfa6d352cfd1ea4cb89273e22a7a Mon Sep 17 00:00:00 2001 From: Li Fuchen Date: Sat, 10 Oct 2020 11:02:47 +0800 Subject: [PATCH 49/91] modified sample code of add_position_encoding to use paddle.randn, test=document_fix (#27679) --- python/paddle/fluid/layers/nn.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index dbcd91eedbdf1..90f7cbe395047 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13187,12 +13187,10 @@ def add_position_encoding(input, alpha, beta, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle import paddle.nn.functional as F - tensor = np.random.randn(16, 32, 64) - tensor = paddle.to_tensor(tensor) + tensor = paddle.randn([16, 32, 64]) position_tensor = F.add_position_encoding( input=tensor, alpha=1.0, beta=1.0) From b8d2a021f0fa6e750dac4857ba7603a7b4fd440e Mon Sep 17 00:00:00 2001 From: Qi Li Date: Sat, 10 Oct 2020 12:17:06 +0800 Subject: [PATCH 50/91] fix ut error of test_recognize_digits, test=develop (#27791) --- paddle/fluid/train/CMakeLists.txt | 43 +++++++------------ .../train/test_train_recognize_digits.cc | 13 +++--- 2 files changed, 24 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index d587081fbac8a..ad4bc20f9f0b1 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -4,37 +4,26 @@ function(train_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(arg_list "") - if(train_test_ARGS) - foreach(arg ${train_test_ARGS}) - list(APPEND arg_list "_${arg}") - endforeach() + if (NOT APPLE AND NOT WIN32) + cc_test(test_train_${TARGET_NAME} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_shared + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) else() - list(APPEND arg_list "_") + cc_test(test_train_${TARGET_NAME}${arg} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) + endif() + set_tests_properties(test_train_${TARGET_NAME} + PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) + if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_train_${TARGET_NAME} + PROPERTIES TIMEOUT 150) endif() - foreach(arg ${arg_list}) - string(REGEX REPLACE "^_$" "" arg "${arg}") - if (NOT APPLE AND NOT WIN32) - cc_test(test_train_${TARGET_NAME}${arg} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_fluid_shared - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) - else() - cc_test(test_train_${TARGET_NAME}${arg} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_fluid_api - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) - endif() - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) - if(NOT WIN32 AND NOT APPLE) - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES TIMEOUT 150) - endif() - endforeach() endfunction(train_test) if(WITH_TESTING) - train_test(recognize_digits ARGS mlp conv) + train_test(recognize_digits) endif() diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc index e7b698e1a34e2..fb993439bb8e4 100644 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -32,16 +32,15 @@ DEFINE_string(dirname, "", "Directory of the train model."); namespace paddle { -void Train() { - CHECK(!FLAGS_dirname.empty()); +void Train(std::string model_dir) { framework::InitDevices(false); const auto cpu_place = platform::CPUPlace(); framework::Executor executor(cpu_place); framework::Scope scope; auto train_program = inference::Load( - &executor, &scope, FLAGS_dirname + "__model_combined__.main_program", - FLAGS_dirname + "__params_combined__"); + &executor, &scope, model_dir + "__model_combined__.main_program", + model_dir + "__params_combined__"); std::string loss_name = ""; for (auto op_desc : train_program->Block(0).AllOps()) { @@ -87,6 
+86,10 @@ void Train() { EXPECT_LT(last_loss, first_loss); } -TEST(train, recognize_digits) { Train(); } +TEST(train, recognize_digits) { + CHECK(!FLAGS_dirname.empty()); + Train(FLAGS_dirname + "recognize_digits_mlp.train.model/"); + Train(FLAGS_dirname + "recognize_digits_conv.train.model/"); +} } // namespace paddle From 00d401ec1a316f1eb080b6617cbf7a90a178cd11 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Sat, 10 Oct 2020 12:51:52 +0800 Subject: [PATCH 51/91] Add api of constant in paddle.nn.initializer (#27786) * Add api of constant in paddle.nn.initializer * Add api of constant in paddle.nn.initializer * Add api of constant in paddle.nn.initializer * Add api of constant in paddle.nn.initializer * Add api of constant in paddle.nn.initializer * Add api of constant in paddle.nn.initializer * Add api of constant in paddle.nn.initializer --- .../tests/unittests/test_initializer_nn.py | 108 ++++++++++++++++++ python/paddle/nn/initializer/__init__.py | 8 +- python/paddle/nn/initializer/constant.py | 46 ++++++++ 3 files changed, 159 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_initializer_nn.py create mode 100644 python/paddle/nn/initializer/constant.py diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py new file mode 100644 index 0000000000000..6ad19658fd203 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest + +import paddle +import paddle.nn as nn +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.nn.initializer as initializer +from paddle.fluid.core import VarDesc + +DELTA = 0.00001 + + +def check_cast_op(op): + return op.type == 'cast' and \ + op.attr('in_dtype') == VarDesc.VarType.FP32 and \ + op.attr('out_dtype') == VarDesc.VarType.FP16 + + +class TestConstantInitializer(unittest.TestCase): + def static_test_constant_initializer_common(self, + init_inst, + dtype="float32", + value_target=0.0): + paddle.enable_static() + program = framework.Program() + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype=dtype, + shape=[5, 10], + lod_level=0, + name="param", + initializer=init_inst) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'fill_constant') + self.assertAlmostEqual(init_op.attr('value'), value_target, delta=DELTA) + paddle.disable_static() + return block + + def test_constant_initializer_default_value_static(self, dtype="float32"): + """Test the constant initializer with default value in static graph + """ + block = self.static_test_constant_initializer_common( + init_inst=initializer.Constant(), dtype=dtype, value_target=0.0) + return block + + def test_constant_initializer_default_value_dygraph(self, dtype="float32"): + """Test constant initializer with supplied value in dygraph + """ + with fluid.dygraph.guard(): + linear = nn.Linear(2, 4, weight_attr=nn.initializer.Constant()) + mat_target = np.ones((2, 4), dtype=dtype) * 0.0 + mat_linear = linear.weight.numpy() + mismatch = np.sum( + (mat_target - mat_linear) * (mat_target - mat_linear)) + self.assertAlmostEqual(mismatch, 0.0, delta=DELTA) + + def test_constant_initializer_static(self, dtype="float32"): + """Test constant initializer with supplied value in static graph + """ + block = self.static_test_constant_initializer_common( + init_inst=initializer.Constant(2.3), dtype=dtype, value_target=2.3) + return block + + def test_constant_initializer_dygraph(self, dtype="float32"): + """Test constant initializer with supplied value in dygraph + """ + with fluid.dygraph.guard(): + linear = nn.Linear( + 2, 4, weight_attr=nn.initializer.Constant(value=2.0)) + mat_target = np.ones((2, 4), dtype=dtype) * 2.0 + mat_linear = linear.weight.numpy() + mismatch = np.sum( + (mat_target - mat_linear) * (mat_target - mat_linear)) + self.assertAlmostEqual(mismatch, 0.0, delta=DELTA) + + def test_constant_initializer_fp16(self): + """Test constant initializer with float16 + """ + block = self.test_constant_initializer_default_value_static("float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_constant_initializer_static("float16") + self.assertTrue(check_cast_op(block.ops[1])) + self.test_constant_initializer_default_value_dygraph("float16") + self.test_constant_initializer_dygraph("float16") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py index 489f324868a3e..db0f5dbff2b80 100644 --- a/python/paddle/nn/initializer/__init__.py +++ b/python/paddle/nn/initializer/__init__.py @@ -13,21 +13,23 @@ # limitations under the License. 
# TODO: define the initializers to create a Parameter in neural network - from ...fluid.initializer import Bilinear #DEFINE_ALIAS -from ...fluid.initializer import Constant #DEFINE_ALIAS from ...fluid.initializer import MSRA #DEFINE_ALIAS from ...fluid.initializer import Normal #DEFINE_ALIAS from ...fluid.initializer import TruncatedNormal #DEFINE_ALIAS from ...fluid.initializer import Uniform #DEFINE_ALIAS from ...fluid.initializer import Xavier #DEFINE_ALIAS +from . import constant +from .constant import Constant #DEFINE_ALIAS + __all__ = [ 'Bilinear', - 'Constant', 'MSRA', 'Normal', 'TruncatedNormal', 'Uniform', 'Xavier', ] + +__all__ += constant.__all__ diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py new file mode 100644 index 0000000000000..6d21ddae0d16b --- /dev/null +++ b/python/paddle/nn/initializer/constant.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: define the initializers of Constant in neural network +from ...fluid.initializer import ConstantInitializer + +__all__ = ['Constant'] + + +class Constant(ConstantInitializer): + """Implement the constant initializer. + + Args: + value (float32): constant value to initialize the parameter + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + + data = paddle.rand([30, 10, 2], dtype='float32') + linear = nn.Linear(2, + 4, + weight_attr=nn.initializer.Constant(value=2.0)) + res = linear(data) + print(linear.weight.numpy()) + #result is [[2. 2. 2. 2.],[2. 2. 2. 
2.]] + + """ + + def __init__(self, value=0.0): + if value is None: + raise ValueError("value must not be none.") + super(Constant, self).__init__(value=value, force_cpu=False) From 0d27591642cb1529b089f6d541dd1ce143a4c0e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Sat, 10 Oct 2020 13:12:33 +0800 Subject: [PATCH 52/91] save operator version infomation to program desc, test=develop (#27668) --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/framework.proto | 32 +++-------- paddle/fluid/framework/op_compatible_info.cc | 35 ------------ paddle/fluid/framework/op_compatible_info.h | 8 --- .../framework/op_compatible_info_test.cc | 6 -- paddle/fluid/framework/op_version_proto.cc | 15 +++++ paddle/fluid/framework/op_version_proto.h | 55 +++++++++++++++++++ paddle/fluid/framework/op_version_registry.cc | 2 +- paddle/fluid/framework/op_version_registry.h | 15 ++++- .../framework/op_version_registry_test.cc | 2 +- paddle/fluid/framework/program_desc.cc | 4 +- paddle/fluid/framework/program_desc.h | 2 +- .../fluid/inference/api/analysis_predictor.cc | 39 ------------- .../fluid/inference/api/analysis_predictor.h | 7 --- paddle/fluid/pybind/pybind.cc | 12 ++-- python/paddle/fluid/io.py | 4 +- 16 files changed, 109 insertions(+), 133 deletions(-) create mode 100644 paddle/fluid/framework/op_version_proto.cc create mode 100644 paddle/fluid/framework/op_version_proto.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index bb5e2e1369a84..d31943289d7a1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -123,7 +123,9 @@ cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc device_context) -cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost) +cc_library(op_version_proto SRCS op_version_proto.cc DEPS framework_proto boost) + +cc_library(op_version_registry SRCS op_version_registry.cc DEPS op_version_proto framework_proto boost) cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 29312370b3448..c33d71b3b0a9c 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -179,29 +179,15 @@ message BlockDesc { optional int32 forward_block_idx = 5 [ default = -1 ]; } -// CompatibleInfo is used to determine if a feature is compatible and -// provides the information. -message CompatibleInfo { - enum Type { - COMPATIBLE = 0; - DEFINITELY_NOT = 1; - POSSIBLE = 2; - BUG_FIX = 3; - PRECISION_CHANGE = 4; - } - required string version = 1; - required Type type = 2; -} - -// In some cases, Paddle Fluid may perform operator definition iterations, -// and the operator uses OpCompatibleMap for compatibility testing. -message OpCompatibleMap { - message OpCompatiblePair { +// In some cases, Paddle may perform operator definition iterations, +// and the operator uses OpVersionMap for compatibility testing. 
+message OpVersion { required int32 version = 1; } +message OpVersionMap { + message OpVersionPair { required string op_name = 1; - required CompatibleInfo compatible_info = 2; + required OpVersion op_version = 2; } - repeated OpCompatiblePair pair = 1; - optional string default_required_version = 2; + repeated OpVersionPair pair = 1; } // Please refer to @@ -210,8 +196,8 @@ message OpCompatibleMap { // TODO(panyx0718): A model can have multiple programs. Need a // way to distinguish them. Maybe ID or name? message ProgramDesc { - reserved 2; // For backward compatibility. + reserved 2, 3; // For backward compatibility. repeated BlockDesc blocks = 1; optional Version version = 4; - optional OpCompatibleMap op_compatible_map = 3; + optional OpVersionMap op_version_map = 5; } diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc index 826e14dedb76d..93826fc97b196 100644 --- a/paddle/fluid/framework/op_compatible_info.cc +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -182,40 +182,5 @@ OpCompatibleType OpCompatibleMap::IsRequireMiniVersion( } } -bool OpCompatibleMap::ConvertToProto(proto::OpCompatibleMap* desc) const { - desc->Clear(); - desc->set_default_required_version(default_required_version_); - for (auto pair : op_compatible_map_) { - const CompatibleInfo& info = pair.second; - auto* pair_desc = desc->add_pair(); - pair_desc->set_op_name(pair.first); - auto* info_desc = pair_desc->mutable_compatible_info(); - info_desc->set_version(info.required_version_); - info_desc->set_type( - static_cast(info.compatible_type_)); - } - return true; -} - -bool OpCompatibleMap::ReadFromProto(const proto::OpCompatibleMap& desc) { - std::string version = desc.default_required_version(); - if (version.empty()) { - LOG(INFO) << "The default operator required version is missing." - " Please update the model version."; - return false; - } - op_compatible_map_.clear(); - default_required_version_ = desc.default_required_version(); - for (int i = 0; i < desc.pair_size(); ++i) { - const auto& pair_desc = desc.pair(i); - auto info_desc = pair_desc.compatible_info(); - CompatibleInfo info(info_desc.version(), - static_cast(info_desc.type())); - std::pair pair(pair_desc.op_name(), info); - op_compatible_map_.insert(pair); - } - return true; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h index 01fbdef99cbbc..6f86b8b64ed21 100644 --- a/paddle/fluid/framework/op_compatible_info.h +++ b/paddle/fluid/framework/op_compatible_info.h @@ -58,14 +58,6 @@ class OpCompatibleMap { OpCompatibleType IsRequireMiniVersion(std::string op_name, std::string current_version) const; - // Convert the entire OpCompatibleMap to Proto, which can be serialized - // to the model file as part of the ProgramDesc. - bool ConvertToProto(proto::OpCompatibleMap* desc) const; - - // Read and reset the entire object from proto, which can be read from - // the model file as part of the program. 
- bool ReadFromProto(const proto::OpCompatibleMap& desc); - const std::string& GetDefaultRequiredVersion() const { return default_required_version_; } diff --git a/paddle/fluid/framework/op_compatible_info_test.cc b/paddle/fluid/framework/op_compatible_info_test.cc index 98f3f5071ad28..cf210ed8ab2d5 100644 --- a/paddle/fluid/framework/op_compatible_info_test.cc +++ b/paddle/fluid/framework/op_compatible_info_test.cc @@ -28,12 +28,6 @@ TEST(test_op_compatible_info, test_op_compatible) { auto comp_map = OpCompatibleMap(); comp_map.InitOpCompatibleMap(); - // Ensure save-load consistency. - auto program_desc = ProgramDesc(); - proto::OpCompatibleMap* proto_map = program_desc.OpCompatibleMap(); - comp_map.ConvertToProto(proto_map); - comp_map.ReadFromProto(*proto_map); - ASSERT_NE(comp_map.GetDefaultRequiredVersion(), std::string()); ASSERT_NE(comp_map.GetOpCompatibleInfo("sequence_pad").required_version_, std::string()); diff --git a/paddle/fluid/framework/op_version_proto.cc b/paddle/fluid/framework/op_version_proto.cc new file mode 100644 index 0000000000000..696e322380740 --- /dev/null +++ b/paddle/fluid/framework/op_version_proto.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_version_proto.h" diff --git a/paddle/fluid/framework/op_version_proto.h b/paddle/fluid/framework/op_version_proto.h new file mode 100644 index 0000000000000..1a876f43d2f00 --- /dev/null +++ b/paddle/fluid/framework/op_version_proto.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { +namespace compatible { +namespace pb { + +class OpVersion { + public: + explicit OpVersion(proto::OpVersion* desc) : desc_{desc} {} + void SetVersionID(uint32_t version) { desc_->set_version(version); } + + private: + proto::OpVersion* desc_; +}; + +class OpVersionMap { + public: + explicit OpVersionMap(proto::OpVersionMap* desc) : desc_{desc} {} + OpVersion operator[](const std::string& key) { + for (int i = 0; i < desc_->pair_size(); ++i) { + if (desc_->pair(i).op_name() == key) { + return OpVersion(desc_->mutable_pair(i)->mutable_op_version()); + } + } + auto* pair = desc_->add_pair(); + pair->set_op_name(key); + return OpVersion(pair->mutable_op_version()); + } + + private: + proto::OpVersionMap* desc_; +}; + +} // namespace pb +} // namespace compatible +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_version_registry.cc b/paddle/fluid/framework/op_version_registry.cc index 11b7224e68340..9a67c160f0233 100644 --- a/paddle/fluid/framework/op_version_registry.cc +++ b/paddle/fluid/framework/op_version_registry.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index fea043a0ff311..5ddaf1bd8d8ce 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -159,12 +160,14 @@ class OpVersionRegistrar { op_version_map_.insert({op_type, OpVersion()}); return op_version_map_[op_type]; } + const std::unordered_map& GetVersionMap() { + return op_version_map_; + } uint32_t GetVersionID(const std::string& op_type) const { auto it = op_version_map_.find(op_type); if (it == op_version_map_.end()) { return 0; } - return it->second.GetVersionID(); } @@ -175,6 +178,14 @@ class OpVersionRegistrar { OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete; }; +inline void SaveOpVersions( + const std::unordered_map& src, + pb::OpVersionMap* dst) { + for (const auto& pair : src) { + (*dst)[pair.first].SetVersionID(pair.second.GetVersionID()); + } +} + class OpVersionComparator { public: virtual bool operator()() = 0; diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index d6b18751cefe5..2b173c9571588 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index d37a16a3e7d9f..0faa870f50565 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -39,8 +39,8 @@ proto::ProgramDesc *ProgramDesc::Proto() { return &desc_; } -proto::OpCompatibleMap *ProgramDesc::OpCompatibleMap() { - return desc_.mutable_op_compatible_map(); +proto::OpVersionMap *ProgramDesc::OpVersionMap() { + return desc_.mutable_op_version_map(); } int64_t ProgramDesc::Version() const { return desc_.version().version(); } diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 5cafc9111da67..8b1aac95fc288 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -58,7 +58,7 @@ class ProgramDesc { proto::ProgramDesc *Proto(); - proto::OpCompatibleMap *OpCompatibleMap(); + proto::OpVersionMap *OpVersionMap(); int64_t Version() const; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6c68b385bcbc0..98bee2d4bb471 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -192,11 +192,6 @@ bool AnalysisPredictor::PrepareProgram( // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), // still need to create other persistable variables. // So in both case, create persistable variables at first. - if (!CheckOperatorCompatible()) { - LOG(WARNING) << "WARNING: Results may be DIFF! " - "Please use the corresponding version of the model and " - "prediction library, and do not use the develop branch."; - } executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); // if enable_ir_optim_ is false, @@ -998,40 +993,6 @@ std::string AnalysisPredictor::GetSerializedProgram() const { return inference_program_->Proto()->SerializeAsString(); } -bool AnalysisPredictor::CheckOperatorCompatible() { - if (!inference_program_) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Inference program version check failed because the program does not " - "exist.")); - return false; - } - bool res = true; - op_compatible_map_.ReadFromProto(*inference_program_->OpCompatibleMap()); - const auto &version = framework::DumpVersion(framework::kCurProgramVersion); - LOG(INFO) << "MODEL VERSION: " - << framework::DumpVersion(inference_program_->Version()); - LOG(INFO) << "PREDICTOR VERSION: " << version; - std::set op_types; - for (size_t i = 0; i < inference_program_->Size(); ++i) { - const auto &block = inference_program_->Block(i); - for (const auto *op : block.AllOps()) { - op_types.insert(op->Type()); - } - } - for (const auto type : op_types) { - auto compatible_type = - op_compatible_map_.IsRequireMiniVersion(type, version); - if (compatible_type != framework::OpCompatibleType::compatible) { - if (!framework::kCurProgramVersion) { - LOG(WARNING) << " - Version incompatible (" - << static_cast(compatible_type) << ") " << type; - } - res = false; - } - } - return res; -} - // Add SaveOptimModel void AnalysisPredictor::SaveOptimModel(const std::string &dir) { // save model diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index c4a7173b0104b..269f2fd80bb47 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -335,13 +335,6 @@ class AnalysisPredictor : public PaddlePredictor { /// 
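With the predictor-side OpCompatibleMap check removed, version information now travels with the model itself: the op versions registered in OpVersionRegistrar are copied into the new op_version_map field when a program is serialized. A minimal Python sketch of that save path, mirroring the pybind.cc and io.py hunks later in this patch (the tiny program built here is purely illustrative and not part of the change):

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    # Any program works for illustration; a real model would be built here.
    x = fluid.data(name='x', shape=[None, 4], dtype='float32')
    y = fluid.layers.fc(input=x, size=2)

    prog = fluid.default_main_program()
    prog.desc._set_version()
    # New binding added in pybind.cc below: copies the registered op
    # versions into the ProgramDesc's op_version_map field.
    paddle.fluid.core.save_op_version_info(prog.desc)
    serialized = prog.desc.serialize_to_string()
    print(len(serialized))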
AnalysisPredictor::ZeroCopyRun() now. /// void MkldnnPostReset(); - /// - /// \brief Compute compatibility based on model version information and - /// operator version information - /// - /// \return Compatible information - /// - bool CheckOperatorCompatible(); #if PADDLE_WITH_TENSORRT /// diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b303ddde1366e..8c75db01dd221 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,9 +36,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" @@ -432,10 +432,12 @@ PYBIND11_MODULE(core_noavx, m) { return map_output; }); - m.def("save_op_compatible_info", [](framework::ProgramDesc &desc) { - framework::OpCompatibleMap op_compatible_map; - op_compatible_map.InitOpCompatibleMap(); - return op_compatible_map.ConvertToProto(desc.OpCompatibleMap()); + m.def("save_op_version_info", [](framework::ProgramDesc &desc) { + framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()}; + framework::compatible::SaveOpVersions( + framework::compatible::OpVersionRegistrar::GetInstance() + .GetVersionMap(), + &pb_vmap); }); m.def( diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index fe5b683bdeaa3..bb55aeb70d1f2 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1346,7 +1346,7 @@ def save_inference_model(dirname, append_fetch_ops(main_program, fetch_var_names) main_program.desc._set_version() - paddle.fluid.core.save_op_compatible_info(main_program.desc) + paddle.fluid.core.save_op_version_info(main_program.desc) with open(model_basename, "wb") as f: f.write(main_program.desc.serialize_to_string()) else: @@ -1720,7 +1720,7 @@ def get_tensor(var): main_program = program.clone() program.desc.flush() main_program.desc._set_version() - paddle.fluid.core.save_op_compatible_info(program.desc) + paddle.fluid.core.save_op_version_info(program.desc) with open(model_path + ".pdmodel", "wb") as f: f.write(program.desc.serialize_to_string()) From f6ad2375beb86beaae6ede882a5758c1716e741b Mon Sep 17 00:00:00 2001 From: Double_V Date: Sat, 10 Oct 2020 14:26:00 +0800 Subject: [PATCH 53/91] fix pool3d bug, test=develop (#27718) * fix pool3d bug, test=develop * fix unitest, test=develop * fix test and fix pool2d bug, test=develop --- paddle/fluid/operators/math/pooling.cc | 348 +++++++++++------- .../fluid/tests/unittests/test_pool2d_op.py | 43 ++- .../fluid/tests/unittests/test_pool3d_op.py | 40 +- 3 files changed, 278 insertions(+), 153 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 40cea7483f397..fec738378a64c 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -60,19 +60,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; 
pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -81,9 +87,10 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } + pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -137,19 +144,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -158,9 +171,9 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -178,19 +191,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { @@ -201,10 +220,9 @@ class Pool2dFunctor { &ele); } } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; - + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width * output_channels + pw * output_channels + c] = ele; @@ -262,23 +280,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -346,23 +370,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -391,23 +421,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -672,34 +708,43 @@ class Pool3dFunctor { int dstart, dend; int hstart, hend; int wstart, wend; + for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = + std::min(dstart + ksize_depth, input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); @@ -712,10 +757,9 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[output_idx] = ele; } @@ -767,7 +811,6 @@ class Pool3dFunctor { int dstart, dend; int hstart, hend; int wstart, wend; - if (!channel_last) { const int input_stride = input_depth * input_height * input_width; const int output_stride = output_depth * output_height * output_width; @@ -777,29 +820,40 @@ class Pool3dFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } + int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); for (int d = dstart; d < dend; ++d) { @@ -811,10 +865,10 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[output_idx] = ele; } @@ -835,28 +889,38 @@ class Pool3dFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -871,10 +935,10 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); int output_idx = ((pd * output_height + ph) * output_width + pw) * @@ -943,34 +1007,42 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = + std::min(dstart + ksize_depth, input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -1046,34 +1118,44 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -1108,34 +1190,44 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index a12a328b653b2..5e8828c3e9126 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -102,14 +102,21 @@ def avg_pool2D_forward_naive(x, c_start = adaptive_start_index(j, W, ksize[1]) c_end = adaptive_end_index(j, W, ksize[1]) else: - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + r_start = i * strides[0] - paddings[0] + r_end = i * strides[0] + ksize[0] - paddings[0] + c_start = j * strides[1] - paddings[1] + c_end = j * strides[1] + ksize[1] - paddings[1] + field_size = (r_end - r_start) * (c_end - c_start) + r_start = np.max((r_start, 0)) + r_end = np.min((r_end, H)) + c_start = np.max((c_start, 0)) + c_end = np.min((c_end, W)) + x_masked = x[:, :, r_start:r_end, c_start:c_end] - field_size = ((r_end - r_start) * (c_end - c_start)) \ - if (exclusive or adaptive) else (ksize[0] * ksize[1]) + if (exclusive or adaptive): + field_size = (r_end - r_start) * (c_end - c_start) + if data_type == np.int8 or data_type == np.uint8: out[:, :, i, j] = (np.rint( np.sum(x_masked, axis=(2, 3)) / @@ -207,22 +214,34 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): in_w_start = adaptive_start_index(j, W, ksize[1]) in_w_end = adaptive_end_index(j, W, ksize[1]) else: - in_w_start = np.max((j * strides[1] - pad_w_left, 0)) - in_w_end = np.min((j * strides[1] + ksize[1] - pad_w_left, W)) + in_h_start = i * strides[0] - pad_h_up + in_w_start = j * strides[1] - pad_w_left + in_h_end = i * strides[0] + ksize[0] - pad_h_up + in_w_end = j * strides[1] + ksize[1] - pad_w_left + + field_size = (in_h_end - in_h_start) * (in_w_end - in_w_start) + in_h_start = np.max((in_h_start, 0)) + in_w_start = np.max((in_w_start, 0)) + in_h_end = np.min((in_h_end, H)) + in_w_end = np.min((in_w_end, W)) if data_format == 'NCHW': x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end] if pool_type == 'avg': - field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \ - if (exclusive or adaptive) else (ksize[0] * ksize[1]) + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) + +# if (exclusive or adaptive) else (ksize[0] * ksize[1]) out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size elif pool_type == 'max': out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) elif data_format == 'NHWC': x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :] if pool_type == 'avg': - field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \ - if (exclusive or adaptive) else (ksize[0] * ksize[1]) + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size elif pool_type == 'max': out[:, i, j, :] = np.max(x_masked, axis=(1, 2)) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index 
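The behavioural change in both the kernels and the test references above is how the averaging divisor is computed when padding is present: the field size is first taken over the padded window and only shrunk to the in-bounds window when exclusive (or adaptive) is set. A small self-contained 1-D sketch of that rule (function name and values are illustrative only, not part of the patch); the pool3d test below applies the same idea per depth/height/width:

    import numpy as np

    def avg_pool1d_ref(x, ksize, stride, pad, exclusive):
        L = x.shape[0]
        out_len = (L + 2 * pad - ksize) // stride + 1
        out = np.zeros(out_len)
        for i in range(out_len):
            start = i * stride - pad
            end = start + ksize
            field_size = end - start            # counts padded positions
            start_c, end_c = max(start, 0), min(end, L)
            if exclusive:
                field_size = end_c - start_c    # counts in-bounds positions only
            out[i] = x[start_c:end_c].sum() / field_size
        return out

    x = np.array([1., 2., 3., 4.])
    print(avg_pool1d_ref(x, ksize=2, stride=2, pad=1, exclusive=False))  # [0.5 2.5 2. ]
    print(avg_pool1d_ref(x, ksize=2, stride=2, pad=1, exclusive=True))   # [1.  2.5 4. ]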
3d139e9b90c10..eab7126c7a422 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -116,32 +116,44 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): if adaptive: d_start = adaptive_start_index(k, D, ksize[0]) d_end = adaptive_end_index(k, D, ksize[0]) - else: - d_start = np.max((k * strides[0] - pad_d_forth, 0)) - d_end = np.min((k * strides[0] + ksize[0] - pad_d_forth, D)) for i in range(H_out): if adaptive: h_start = adaptive_start_index(i, H, ksize[1]) h_end = adaptive_end_index(i, H, ksize[1]) - else: - h_start = np.max((i * strides[1] - pad_h_up, 0)) - h_end = np.min((i * strides[1] + ksize[1] - pad_h_up, H)) for j in range(W_out): if adaptive: w_start = adaptive_start_index(j, W, ksize[2]) w_end = adaptive_end_index(j, W, ksize[2]) else: - w_start = np.max((j * strides[2] - pad_w_left, 0)) - w_end = np.min((j * strides[2] + ksize[2] - pad_w_left, W)) + d_start = k * strides[0] - pad_d_forth + d_end = np.min((k * strides[0] + ksize[0] - pad_d_forth, + D + pad_d_back)) + h_start = i * strides[1] - pad_h_up + h_end = np.min( + (i * strides[1] + ksize[1] - pad_h_up, H + pad_h_down)) + w_start = j * strides[2] - pad_w_left + w_end = np.min((j * strides[2] + ksize[2] - pad_w_left, + W + pad_w_right)) + + field_size = (d_end - d_start) * (h_end - h_start) * ( + w_end - w_start) + w_start = np.max((w_start, 0)) + d_start = np.max((d_start, 0)) + h_start = np.max((h_start, 0)) + w_end = np.min((w_end, W)) + d_end = np.min((d_end, D)) + h_end = np.min((h_end, H)) if data_format == 'NCDHW': x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start: w_end] if pool_type == 'avg': - field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] + if (exclusive or adaptive): + field_size = (d_end - d_start) * ( + h_end - h_start) * (w_end - w_start) + out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size elif pool_type == 'max': @@ -151,8 +163,10 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): x_masked = x[:, d_start:d_end, h_start:h_end, w_start: w_end, :] if pool_type == 'avg': - field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] + if (exclusive or adaptive): + field_size = (d_end - d_start) * ( + h_end - h_start) * (w_end - w_start) + out[:, k, i, j, :] = np.sum(x_masked, axis=(1, 2, 3)) / field_size elif pool_type == 'max': @@ -564,7 +578,7 @@ def init_exclusive(self): self.exclusive = False def init_paddings(self): - self.paddings = [1, 2, 1, 1, 1, 0] + self.paddings = [2, 2, 1, 1, 0, 0] @unittest.skipIf(not core.is_compiled_with_cuda(), From 65c06141b63d6d6827e1463c1499afd3065cabfc Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sat, 10 Oct 2020 14:54:58 +0800 Subject: [PATCH 54/91] disable_fuse_all_reduce (#27746) * disable_fuse_all_reduce * fix test * fix ut --- .../tests/unittests/test_buffer_shared_memory_reuse_pass.py | 5 +++++ ...shared_memory_reuse_pass_and_fuse_optimization_op_pass.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py index 9dd617f90b65d..7bdfa3d2dfd74 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py @@ -34,6 +34,7 
@@ class InplaceTestBase(unittest.TestCase): def initParameter(self): self.use_cuda = True self.fuse_all_optimizer_ops = False + self.fuse_all_reduce_ops = False def setUp(self): paddle.enable_static() @@ -93,6 +94,7 @@ def check_single_card_fetch_var(self): build_strategy.memory_optimize = memory_optimize build_strategy.enable_inplace = enable_inplace build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops + build_strategy.fuse_all_reduce_ops = self.fuse_all_reduce_ops compiled_prog = fluid.CompiledProgram(prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, @@ -146,6 +148,7 @@ def check_multi_card_fetch_var(self): build_strategy.memory_optimize = memory_optimize build_strategy.enable_inplace = enable_inplace build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops + build_strategy.fuse_all_reduce_ops = self.fuse_all_reduce_ops compiled_program = fluid.CompiledProgram( prog).with_data_parallel( loss_name=loss.name, @@ -175,6 +178,7 @@ class CUDAInplaceTest(InplaceTestBase): def initParameter(self): self.use_cuda = True self.fuse_all_optimizer_ops = False + self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() @@ -187,6 +191,7 @@ class CPUInplaceTest(InplaceTestBase): def initParameter(self): self.use_cuda = False self.fuse_all_optimizer_ops = False + self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py index 0b14cab4a7846..e9e62bee00680 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py @@ -20,6 +20,7 @@ class CUDAInplaceTestWithFuseOptimizationOps(InplaceTestBase): def initParameter(self): self.use_cuda = True self.fuse_all_optimizer_ops = True + self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() @@ -32,6 +33,7 @@ class CPUInplaceTestWithFuseOptimizationOps(InplaceTestBase): def initParameter(self): self.use_cuda = False self.fuse_all_optimizer_ops = True + self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() From 5098891fdf573a9a2db5fedacbefa059c9def8ce Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Sat, 10 Oct 2020 15:34:54 +0800 Subject: [PATCH 55/91] add softmax xpu kernel (#27700) --- paddle/fluid/operators/softmax_op_xpu.cc | 99 +++++++++++++++++++ .../unittests/xpu/test_softmax_op_xpu.py | 93 +++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 paddle/fluid/operators/softmax_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc new file mode 100644 index 0000000000000..29740000aeb4c --- /dev/null +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class SoftmaxXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + const int rank = x->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + PADDLE_ENFORCE_EQ(axis == -1 || axis == rank - 1, true, + platform::errors::InvalidArgument( + "xpu softmax kernel only support last dimension of x " + "(axis==-1 or axis==x_dims-1), but received axis: " + "%d, x's shape: %s.", + axis, x->dims())); + + // allocate memory on device. + out->mutable_data(context.GetPlace()); + + const int n = SizeToAxis(axis, x->dims()); + const int d = SizeFromAxis(axis, x->dims()); + + auto& dev_ctx = context.template device_context(); + int r = xpu::softmax2d_forward(dev_ctx.x_context(), x->data(), + out->data(), n, d, d <= 2048); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(softmax2d_forward) return wrong " + "value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +template +class SoftmaxGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Input("Out"); + auto* dout = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + const int rank = dx->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + + // allocate memory on device. + dx->mutable_data(context.GetPlace()); + + const int n = SizeToAxis(axis, dx->dims()); + const int d = SizeFromAxis(axis, dx->dims()); + + auto& dev_ctx = context.template device_context(); + int r = + xpu::softmax2d_backward(dev_ctx.x_context(), out->data(), + dout->data(), dx->data(), n, d); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(softmax2d_backward) return wrong " + "value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + softmax, ops::SoftmaxXPUKernel); +REGISTER_OP_XPU_KERNEL( + softmax_grad, + ops::SoftmaxGradXPUKernel); + +#endif // PADDLE_WITH_XPU diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py new file mode 100644 index 0000000000000..92842fbc2e65a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
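The kernel above only supports softmax over the last dimension, and the OpTest-based file that follows exercises it through the operator tests. As a quick end-to-end check, here is a hedged sketch that is only meaningful on a Baidu Kunlun (XPU) build; on any other build it just prints a note:

    import numpy as np
    import paddle

    paddle.enable_static()

    def run_softmax(place):
        with paddle.static.program_guard(paddle.static.Program()):
            x = paddle.static.data('x', [2, 8], 'float32')
            out = paddle.nn.functional.softmax(x, axis=-1)  # last-dim only on XPU
            exe = paddle.static.Executor(place)
            x_np = np.random.rand(2, 8).astype('float32')
            return exe.run(feed={'x': x_np}, fetch_list=[out])[0]

    if paddle.is_compiled_with_xpu():
        print(run_softmax(paddle.XPUPlace(0)))
    else:
        print("not an XPU build; the new kernel is not exercised")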
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import sys +import unittest +sys.path.append("..") +from op_test import OpTest + +paddle.enable_static() +np.random.seed(10) + + +def stable_softmax(x): + """Compute the softmax of vector x in a numerically stable way.""" + # clip to shiftx, otherwise, when calc loss with + # log(exp(shiftx)), may get log(0)=INF + shiftx = (x - np.max(x)).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + +def ref_softmax(x, axis=None, dtype=None): + x_t = x.copy() + if dtype is not None: + x_t = x_t.astype(dtype) + if axis is None: + axis = -1 + return np.apply_along_axis(stable_softmax, axis, x_t) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftmaxOp(OpTest): + def setUp(self): + self.op_type = "softmax" + self.dtype = np.float32 + self.shape = [2, 3, 4, 5] + self.axis = -1 + self.set_attrs() + + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out = np.apply_along_axis(stable_softmax, self.axis, x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis, 'use_xpu': True} + + def set_attrs(self): + pass + + def test_check_output(self): + self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) + + def test_check_grad(self): + self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out') + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftmaxAxis3(TestXPUSoftmaxOp): + def set_attrs(self): + self.axis = 3 + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftmax2D(TestXPUSoftmaxOp): + def set_attrs(self): + self.shape = [10, 12] + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftmax3D(TestXPUSoftmaxOp): + def set_attrs(self): + self.shape = [4, 5, 6] + + +if __name__ == "__main__": + unittest.main() From 0025e0d87b71ee2301fdbbf7e63229c9480ee240 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Sat, 10 Oct 2020 15:50:10 +0800 Subject: [PATCH 56/91] refine APIs: brelu, hardsigmoid, hardswish, maxout (#27658) --- paddle/fluid/operators/maxout_op.cc | 12 + paddle/fluid/operators/maxout_op.h | 7 + python/paddle/fluid/layers/nn.py | 57 ++-- .../tests/unittests/test_activation_op.py | 257 ++++++++++++------ .../fluid/tests/unittests/test_layers.py | 29 -- .../fluid/tests/unittests/test_maxout_op.py | 153 +++++++---- python/paddle/nn/__init__.py | 3 + python/paddle/nn/functional/__init__.py | 5 +- python/paddle/nn/functional/activation.py | 213 +++++++++++++-- python/paddle/nn/layer/activation.py | 209 ++++++++++++-- 10 files changed, 685 insertions(+), 260 deletions(-) diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index 7db2e9421b5ca..6d8d18a3d126e 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -83,6 +83,18 @@ class MaxOutOp : public framework::OperatorWithKernel { "Attr(groups) of Op(maxout) should be " "larger than 1. 
But received %d.", groups)); + PADDLE_ENFORCE_EQ( + axis == 1 || axis == -1 || axis == 3, true, + platform::errors::InvalidArgument( + "axis only supported 1, -1 or 3, but recevied axis is: %d", axis)); + PADDLE_ENFORCE_EQ(in_x_dims.size(), 4, + platform::errors::InvalidArgument( + "x's dims should be 4, but received x's dims is: %d", + in_x_dims.size())); + + if (axis < 0) { + axis += in_x_dims.size(); + } PADDLE_ENFORCE_EQ( in_x_dims[axis] % groups, 0, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h index ec3897e4044ad..64b538fc5d5bd 100644 --- a/paddle/fluid/operators/maxout_op.h +++ b/paddle/fluid/operators/maxout_op.h @@ -31,6 +31,9 @@ class MaxOutKernel : public framework::OpKernel { Tensor* out = context.Output("Out"); int groups = context.template Attr("groups"); int axis = context.template Attr("axis"); + if (axis < 0) { + axis += in_x->dims().size(); + } math::MaxOutFunctor maxout_forward; maxout_forward(context.template device_context(), *in_x, out, @@ -49,6 +52,10 @@ class MaxOutGradKernel : public framework::OpKernel { Tensor* in_x_grad = context.Output(framework::GradVarName("X")); int groups = context.template Attr("groups"); int axis = context.template Attr("axis"); + if (axis < 0) { + axis += in_x->dims().size(); + } + auto& device_ctx = context.template device_context(); math::SetConstant zero; if (in_x_grad) { diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 90f7cbe395047..8cb0404c18cad 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9592,10 +9592,6 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): @templatedoc() def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): """ - :alias_main: paddle.nn.functional.hard_sigmoid - :alias: paddle.nn.functional.hard_sigmoid,paddle.nn.functional.activation.hard_sigmoid - :old_api: paddle.fluid.layers.hard_sigmoid - ${comment} Parameters: x (${x_type}): ${x_comment} @@ -9613,9 +9609,15 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + data = fluid.layers.fill_constant(shape=[3, 2], value=0.5, dtype='float32') # [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] result = fluid.layers.hard_sigmoid(data) # [[0.6, 0.6], [0.6, 0.6], [0.6, 0.6]] """ + if in_dygraph_mode(): + return core.ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hard_sigmoid') @@ -9802,10 +9804,6 @@ def prelu(x, mode, param_attr=None, name=None): @templatedoc() def brelu(x, t_min=0.0, t_max=24.0, name=None): """ - :alias_main: paddle.nn.functional.brelu - :alias: paddle.nn.functional.brelu,paddle.nn.functional.activation.brelu - :old_api: paddle.fluid.layers.brelu - ${comment} Args: x(${x_type}): ${x_comment} @@ -9821,7 +9819,9 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle import numpy as np + paddle.enable_static() input_brelu = np.array([[-1,6],[1,15.6]]) with fluid.dygraph.guard(): @@ -9831,6 +9831,9 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): #[[ 1. 6.] #[ 1. 
10.]] """ + if in_dygraph_mode(): + return core.ops.brelu(x, 't_min', t_min, 't_max', t_max) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'brelu') helper = LayerHelper('brelu', **locals()) @@ -12564,13 +12567,10 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): return out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.maxout") @templatedoc() def maxout(x, groups, name=None, axis=1): """ - :alias_main: paddle.nn.functional.maxout - :alias: paddle.nn.functional.maxout,paddle.nn.functional.activation.maxout - :old_api: paddle.fluid.layers.maxout - ${comment} Args: @@ -12592,31 +12592,16 @@ def maxout(x, groups, name=None, axis=1): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + input = fluid.data( name='data', shape=[None, 256, 32, 32], dtype='float32') out = fluid.layers.maxout(input, groups=2) """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') - - helper = LayerHelper("maxout", **locals()) - if axis not in [1, -1, 3]: - raise ValueError( - "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received " - "Attr(axis): %s." % str(axis)) - if axis == -1: - axis = 3 - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op( - type="maxout", - inputs={"X": x}, - attrs={"groups": groups, - "axis": axis}, - outputs={"Out": out}) - return out + return paddle.nn.functional.maxout(**locals()) def space_to_depth(x, blocksize, name=None): @@ -14877,10 +14862,6 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): @templatedoc() def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): """ - :alias_main: paddle.nn.functional.hard_swish - :alias: paddle.nn.functional.hard_swish,paddle.nn.functional.activation.hard_swish - :old_api: paddle.fluid.layers.hard_swish - This operator implements the hard_swish activation function. Hard_swish is proposed in MobileNetV3, and performs better in computational stability and efficiency compared to swish function. For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf @@ -14911,7 +14892,9 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle import numpy as np + paddle.enable_static() DATATYPE='float32' @@ -14926,6 +14909,10 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): out, = exe.run(feed={'x':x_data}, fetch_list=[y.name]) print(out) # [[0.66666667, 1.66666667,3., 4.]] """ + if in_dygraph_mode(): + return core.ops.hard_swish(x, 'threshold', threshold, 'scale', scale, + 'offset', offset) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hard_swish') diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 68a5fa5e8f367..6b729e6297bed 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -25,10 +25,11 @@ import paddle.nn.functional as F from paddle.fluid import compiler, Program, program_guard +paddle.enable_static() + class TestSqrtOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program(), Program()): # The input type of sqrt op must be Variable or numpy.ndarray. 
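The fluid layers above now forward to their paddle.nn.functional equivalents (maxout is explicitly deprecated in favour of paddle.nn.functional.maxout, while brelu, hard_sigmoid and hard_swish gain dygraph fast paths). A hedged dygraph sketch of the 2.0-style calls that the tests below exercise; shapes and inputs are arbitrary:

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor(
        np.random.uniform(-1, 1, [2, 4, 8, 8]).astype('float32'))

    y = F.hardswish(x)                  # x * min(max(x + 3, 0), 6) / 6
    z = F.maxout(x, groups=2, axis=1)   # channel dim 4 -> 4 / 2 = 2 channels
    print(y.shape, z.shape)             # [2, 4, 8, 8] [2, 2, 8, 8]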
in1 = 1 @@ -45,7 +46,6 @@ def test_errors(self): class TestActivation(OpTest): def setUp(self): - paddle.enable_static() self.op_type = "exp" self.init_dtype() self.init_kernel_type() @@ -74,7 +74,6 @@ def init_kernel_type(self): class TestParameter(object): def test_out_name(self): - paddle.enable_static() with fluid.program_guard(fluid.Program()): np_x = np.array([0.1]) data = fluid.layers.data(name="X", shape=[1]) @@ -96,7 +95,6 @@ def test_dygraph(self): class TestSigmoid(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "sigmoid" self.init_dtype() @@ -118,7 +116,6 @@ def test_check_grad(self): class TestLogSigmoid(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "logsigmoid" self.init_dtype() @@ -192,7 +189,6 @@ def test_errors(self): class TestTanh(TestActivation, TestParameter): def setUp(self): - paddle.enable_static() self.op_type = "tanh" self.init_dtype() np.random.seed(1024) @@ -273,7 +269,6 @@ def test_errors(self): class TestAtan(TestActivation, TestParameter): def setUp(self): - paddle.enable_static() self.op_type = "atan" self.init_dtype() @@ -311,7 +306,6 @@ def test_dygraph(self): class TestSinh(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "sinh" self.init_dtype() @@ -371,7 +365,6 @@ def test_backward(self): class TestSinhOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.sinh, 1) @@ -385,7 +378,6 @@ def test_errors(self): class TestCosh(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "cosh" self.init_dtype() @@ -445,7 +437,6 @@ def test_backward(self): class TestCoshOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. 
self.assertRaises(TypeError, fluid.layers.cosh, 1) @@ -464,7 +455,6 @@ def ref_tanhshrink(x): class TestTanhshrink(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "tanh_shrink" self.init_dtype() @@ -544,7 +534,6 @@ def ref_hardshrink(x, threshold): class TestHardShrink(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "hard_shrink" self.init_dtype() @@ -575,7 +564,6 @@ def set_attrs(self): class TestHardShrinkAPI(unittest.TestCase): # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink def setUp(self): - paddle.enable_static() np.random.seed(1024) self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32') self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ @@ -704,7 +692,6 @@ def ref_softshrink(x, threshold=0.5): class TestSoftshrink(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "softshrink" self.init_dtype() @@ -784,7 +771,6 @@ def test_errors(self): class TestSqrt(TestActivation, TestParameter): def setUp(self): - paddle.enable_static() self.op_type = "sqrt" self.init_dtype() @@ -803,7 +789,6 @@ def test_check_grad(self): class TestRsqrt(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "rsqrt" self.init_dtype() @@ -822,7 +807,6 @@ def test_check_grad(self): class TestAbs(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "abs" self.init_dtype() @@ -846,7 +830,6 @@ def test_check_grad(self): class TestCeil(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "ceil" self.init_dtype() @@ -864,7 +847,6 @@ def test_check_grad(self): class TestFloor(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "floor" self.init_dtype() @@ -884,7 +866,6 @@ def test_check_grad(self): class TestCos(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "cos" self.init_dtype() @@ -903,7 +884,6 @@ def test_check_grad(self): class TestAcos(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "acos" self.init_dtype() @@ -922,7 +902,6 @@ def test_check_grad(self): class TestSin(TestActivation, TestParameter): def setUp(self): - paddle.enable_static() self.op_type = "sin" self.init_dtype() @@ -941,7 +920,6 @@ def test_check_grad(self): class TestAsin(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "asin" self.init_dtype() @@ -960,7 +938,6 @@ def test_check_grad(self): class TestRound(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "round" self.init_dtype() @@ -977,7 +954,6 @@ def test_check_grad(self): class TestRelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "relu" self.init_dtype() @@ -1052,7 +1028,6 @@ def get_alpha(self): return 0.02 def setUp(self): - paddle.enable_static() self.op_type = "leaky_relu" self.init_dtype() alpha = self.get_alpha() @@ -1162,7 +1137,6 @@ def gelu(x, approximate): class TestGeluApproximate(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "gelu" self.init_dtype() approximate = True @@ -1182,7 +1156,6 @@ def test_check_grad(self): class TestGelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "gelu" self.init_dtype() approximate = False @@ -1254,7 +1227,6 @@ def test_errors(self): class TestBRelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "brelu" self.init_dtype() @@ -1279,9 +1251,35 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestBReluOpError(unittest.TestCase): 
+class TestBreluAPI(unittest.TestCase): + # test paddle.fluid.layers.brelu + def setUp(self): + np.random.seed(1024) + self.t_min = 0. + self.t_max = 24. + self.x_np = np.random.uniform(-1, 30, [10, 12]).astype('float32') + self.out_ref = np.copy(self.x_np) + self.out_ref[self.out_ref < self.t_min] = self.t_min + self.out_ref[self.out_ref > self.t_max] = self.t_max + self.out_ref = self.out_ref.astype('float32') + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_fluid_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data('X', [10, 12]) + out = paddle.fluid.layers.brelu(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + self.assertTrue(np.allclose(self.out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.brelu(x) + self.assertTrue(np.allclose(self.out_ref, out.numpy())) + paddle.enable_static() + def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.brelu, 1) @@ -1303,7 +1301,6 @@ def ref_relu6(x, threshold=6.0): class TestRelu6(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "relu6" self.init_dtype() @@ -1378,9 +1375,13 @@ def test_errors(self): F.relu6(x_fp16) +def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0): + return (x * np.minimum(np.maximum(x + offset, 0.), threshold) / + scale).astype(x.dtype) + + class TestHardSwish(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = 'hard_swish' self.init_dtype() @@ -1392,9 +1393,9 @@ def setUp(self): #the same with TestAbs x[np.abs(x + offset) < 0.005] = 0.02 x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02 - out = x * np.minimum(np.maximum(x + offset, 0), threshold) / scale + out = ref_hardswish(x, threshold, scale, offset) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.inputs = {'X': x} self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset} self.outputs = {'Out': out} @@ -1404,23 +1405,65 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestHardSwishOpError(unittest.TestCase): - def test_errors(self): +class TestHardswishAPI(unittest.TestCase): + # test paddle.nn.Hardswish, paddle.nn.functional.hardswish + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.hardswish(x) + m = paddle.nn.Hardswish() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_hardswish(self.x_np) + for r in res: + self.assertTrue(np.allclose(out_ref, r)) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.hardswish(x) + m = paddle.nn.Hardswish() + out2 = m(x) + out_ref = ref_hardswish(self.x_np) + for r in [out1, out2]: + self.assertTrue(np.allclose(out_ref, r.numpy())) paddle.enable_static() - with program_guard(Program()): + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.hard_swish(x) + exe = 
fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_hardswish(self.x_np) + self.assertTrue(np.allclose(out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.hard_swish(x) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.hard_swish, 1) + self.assertRaises(TypeError, F.hardswish, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.hard_swish, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.hardswish, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.hard_swish(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.hardswish(x_fp16) class TestSoftRelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "soft_relu" self.init_dtype() @@ -1447,7 +1490,6 @@ def test_check_grad(self): class TestSoftReluOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.soft_relu, 1) @@ -1466,7 +1508,6 @@ def elu(x, alpha): class TestELU(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "elu" self.init_dtype() @@ -1540,7 +1581,6 @@ def test_errors(self): class TestReciprocal(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "reciprocal" self.init_dtype() @@ -1559,7 +1599,6 @@ def test_check_grad(self): class TestLog(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "log" self.init_dtype() @@ -1587,7 +1626,6 @@ def test_error(self): class TestLog1p(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "log1p" self.init_dtype() @@ -1633,7 +1671,6 @@ def test_api(self): class TestSquare(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "square" self.init_dtype() @@ -1652,7 +1689,6 @@ def test_check_grad(self): class TestPow(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "pow" self.init_dtype() @@ -1672,7 +1708,6 @@ def test_check_grad(self): class TestPow_factor_tensor(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "pow" self.init_dtype() @@ -1750,7 +1785,6 @@ def test_error(self): class TestSTanh(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "stanh" self.init_dtype() @@ -1772,7 +1806,6 @@ def test_check_grad(self): class TestSTanhOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. 
self.assertRaises(TypeError, fluid.layers.stanh, 1) @@ -1793,7 +1826,6 @@ def ref_softplus(x, beta=1, threshold=20): class TestSoftplus(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "softplus" self.init_dtype() @@ -1877,7 +1909,6 @@ def ref_softsign(x): class TestSoftsign(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "softsign" self.init_dtype() @@ -1950,7 +1981,6 @@ def test_errors(self): class TestThresholdedRelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "thresholded_relu" self.init_dtype() @@ -1975,7 +2005,6 @@ def test_check_grad(self): class TestThresholdedReluOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1) @@ -1987,54 +2016,107 @@ def test_errors(self): fluid.layers.thresholded_relu(x_fp16) +def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): + return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype) + + class TestHardSigmoid(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "hard_sigmoid" - self.init_dtype() - - np.random.seed(1024) - X = np.random.uniform(-5, 5, [10, 12]).astype("float32") - slope = 0.2 - offset = 0.5 - lower_threshold = -offset / slope - upper_threshold = (1 - offset) / slope + self.dtype = 'float64' + self.slope = 0.166666666666667 + self.offset = 0.5 + self.set_attrs() - self.delta = 0.005 + x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype) + lower_threshold = -self.offset / self.slope + upper_threshold = (1. - self.offset) / self.slope # Same reason as TestAbs - X[(X - lower_threshold) < self.delta] = lower_threshold - 0.02 - X[(X - upper_threshold) < self.delta] = upper_threshold + 0.02 + delta = 0.005 + x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02 + x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02 - temp = X * slope + offset - out = np.maximum(0.0, np.minimum(1.0, temp)) + out = ref_hardsigmoid(x, self.slope, self.offset) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)} + self.attrs = {'slope': self.slope, 'offset': self.offset} + self.inputs = {'X': x} self.outputs = {'Out': out} - def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad(['X'], 'Out') + def set_attrs(self): + pass -class TestHardSigmoidOpError(unittest.TestCase): - def test_errors(self): +class TestHardSigmoidFP32(TestHardSigmoid): + def set_attrs(self): + self.dtype = 'float32' + + +class TestHardSigmoidSlopeOffset(TestHardSigmoid): + def set_attrs(self): + self.slope = 0.2 + self.offset = 0.4 + + +class TestHardsigmoidAPI(unittest.TestCase): + # test paddle.nn.Hardsigmoid, paddle.nn.functional.hardsigmoid + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.hardsigmoid(x) + m = paddle.nn.Hardsigmoid() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_hardsigmoid(self.x_np) + for r in res: + self.assertTrue(np.allclose(out_ref, r)) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.hardsigmoid(x) + 
m = paddle.nn.Hardsigmoid() + out2 = m(x) + out_ref = ref_hardsigmoid(self.x_np) + for r in [out1, out2]: + self.assertTrue(np.allclose(out_ref, r.numpy())) paddle.enable_static() - with program_guard(Program()): + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.hard_sigmoid(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_hardsigmoid(self.x_np, 0.2, 0.5) + self.assertTrue(np.allclose(out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.hard_sigmoid(x) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1) + self.assertRaises(TypeError, F.hardsigmoid, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.hard_sigmoid, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.hardsigmoid, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.hard_sigmoid(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.hardsigmoid(x_fp16) class TestSwish(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "swish" self.init_dtype() @@ -2055,7 +2137,6 @@ def test_check_grad(self): class TestSwishOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. 
self.assertRaises(TypeError, fluid.layers.swish, 1) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 26073f49bdd3d..e0ec676f1b14c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1657,21 +1657,6 @@ def test_eye_op(self): with self.assertRaises(TypeError): layers.eye(num_rows=3, batch_shape=[-1]) - def test_hard_swish(self): - with self.static_graph(): - t = layers.data(name='t', shape=[3, 3], dtype='float32') - ret = layers.hard_swish(t) - static_ret = self.get_static_graph_result( - feed={'t': np.ones( - [3, 3], dtype='float32')}, fetch_list=[ret])[0] - - with self.dynamic_graph(): - t = np.ones([3, 3], dtype='float32') - dy_ret = layers.hard_swish(base.to_variable(t)) - dy_ret_rlt = dy_ret.numpy() - - self.assertTrue(np.allclose(static_ret, dy_ret_rlt)) - def test_while_loop(self): with self.static_graph(): i = layers.fill_constant(shape=[1], dtype='int64', value=0) @@ -2563,13 +2548,6 @@ def make_l2_normalize(self): output = layers.l2_normalize(x, axis=1) return output - def make_maxout(self): - with program_guard(fluid.default_main_program(), - fluid.default_startup_program()): - data = self._get_data(name='x', shape=[8, 6, 6], dtype="float32") - output = layers.maxout(x=data, groups=2) - return (output) - def make_crop(self): with program_guard(fluid.default_main_program(), fluid.default_startup_program()): @@ -2656,13 +2634,6 @@ def make_prelu(self): name='prelu') return (out) - def make_brelu(self): - with program_guard(fluid.default_main_program(), - fluid.default_startup_program()): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu') - return (out) - def make_soft_relu(self): with program_guard(fluid.default_main_program(), fluid.default_startup_program()): diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py index 6781965b0b4e9..1d38c833773ca 100644 --- a/python/paddle/fluid/tests/unittests/test_maxout_op.py +++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py @@ -16,32 +16,43 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid -from paddle.fluid import Program, program_guard import paddle.fluid.core as core +import paddle.nn.functional as F from op_test import OpTest +paddle.enable_static() +np.random.seed(1) -def maxout_forward_naive(input, groups, channel_axis): - s0, s1, s2, s3 = input.shape - if channel_axis == 3: - return np.ndarray([s0, s1, s2, s3 // groups, groups], \ - buffer = input, dtype=input.dtype).max(axis=(4)) - return np.ndarray([s0, s1 // groups, groups, s2, s3], \ - buffer = input, dtype=input.dtype).max(axis=(2)) + +def maxout_forward_naive(x, groups, channel_axis): + s0, s1, s2, s3 = x.shape + if channel_axis == 1: + return np.ndarray([s0, s1 // groups, groups, s2, s3], \ + buffer = x, dtype=x.dtype).max(axis=2) + return np.ndarray([s0, s1, s2, s3 // groups, groups], \ + buffer = x, dtype=x.dtype).max(axis=4) class TestMaxOutOp(OpTest): def setUp(self): self.op_type = "maxout" - self.init_test_case() - input = np.random.random(self.shape) - output = self.MaxOut_forward_naive(input, self.groups, self.axis) + self.dtype = 'float64' + self.shape = [3, 6, 2, 4] + self.groups = 2 + self.axis = 1 + self.set_attrs() + + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out = maxout_forward_naive(x, self.groups, self.axis) 
- self.inputs = {'X': input} + self.inputs = {'X': x} self.attrs = {'groups': self.groups, 'axis': self.axis} + self.outputs = {'Out': out} - self.outputs = {'Out': output} + def set_attrs(self): + pass def test_check_output(self): self.check_output() @@ -49,65 +60,89 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(['X'], 'Out') - def init_test_case(self): - self.MaxOut_forward_naive = maxout_forward_naive - self.shape = [100, 6, 2, 2] - self.groups = 2 - self.axis = 1 - -class TestMaxOutOpAxis(TestMaxOutOp): - def init_test_case(self): - self.MaxOut_forward_naive = maxout_forward_naive - self.shape = [100, 2, 2, 6] # NHWC format - self.groups = 2 - self.axis = 3 +class TestMaxOutOpAxis0(TestMaxOutOp): + def set_attrs(self): + self.axis = -1 -class TestMaxOutOpAxisAPI(unittest.TestCase): - def test_axis(self): - data1 = fluid.data(name='data1', shape=[3, 6, 2, 2], dtype='float32') - data2 = fluid.data(name='data2', shape=[3, 2, 2, 6], dtype='float32') - out1 = fluid.layers.maxout(data1, groups=2, axis=1) - out2 = fluid.layers.maxout(data2, groups=2, axis=-1) - data1_np = np.random.random((3, 6, 2, 2)).astype("float32") - data2_np = np.transpose(data1_np, [0, 2, 3, 1]) +class TestMaxOutOpAxis1(TestMaxOutOp): + def set_attrs(self): + self.axis = 3 - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - results = exe.run(fluid.default_main_program(), - feed={"data1": data1_np, - "data2": data2_np}, - fetch_list=[out1, out2], - return_numpy=True) - self.assertTrue( - np.allclose(results[0], np.transpose(results[1], (0, 3, 1, 2)))) +class TestMaxOutOpFP32(TestMaxOutOp): + def set_attrs(self): + self.dtype = 'float32' - def test_exception(self): - input = fluid.data(name="input", shape=[2, 4, 6, 6], dtype="float32") - def _attr_axis(): - out = fluid.layers.maxout(input, groups=2, axis=2) +class TestMaxOutOpGroups(TestMaxOutOp): + def set_attrs(self): + self.groups = 3 - self.assertRaises(ValueError, _attr_axis) +class TestMaxoutAPI(unittest.TestCase): + # test paddle.nn.Maxout, paddle.nn.functional.maxout + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [2, 6, 5, 4]).astype(np.float64) + self.groups = 2 + self.axis = 1 + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.maxout(x, self.groups, self.axis) + m = paddle.nn.Maxout(self.groups, self.axis) + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis) + for r in res: + self.assertTrue(np.allclose(out_ref, r)) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.maxout(x, self.groups, self.axis) + m = paddle.nn.Maxout(self.groups, self.axis) + out2 = m(x) + out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis) + for r in [out1, out2]: + self.assertTrue(np.allclose(out_ref, r.numpy())) + + out3 = F.maxout(x, self.groups, -1) + out3_ref = maxout_forward_naive(self.x_np, self.groups, -1) + self.assertTrue(np.allclose(out3_ref, out3.numpy())) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out 
= fluid.layers.maxout(x, groups=self.groups, axis=self.axis) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis) + self.assertTrue(np.allclose(out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.maxout(x, groups=self.groups, axis=self.axis) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() -class TestMaxOutOpError(unittest.TestCase): def test_errors(self): - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.maxout, 1, 2) + self.assertRaises(TypeError, F.maxout, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.maxout, x_int32, 2) - # support the input dtype is float32 - x_fp32 = fluid.data(name='x_fp32', shape=[12, 10], dtype='float32') - fluid.layers.maxout(x_fp32, 2) + x_int32 = paddle.data( + name='x_int32', shape=[2, 4, 6, 8], dtype='int32') + self.assertRaises(TypeError, F.maxout, x_int32) + + x_float32 = paddle.data(name='x_float32', shape=[2, 4, 6, 8]) + self.assertRaises(ValueError, F.maxout, x_float32, 2, 2) if __name__ == '__main__': diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index b1acea2ba5f8f..c788727ab97e5 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -55,6 +55,7 @@ from .layer.activation import GELU #DEFINE_ALIAS from .layer.activation import Tanh #DEFINE_ALIAS from .layer.activation import Hardshrink #DEFINE_ALIAS +from .layer.activation import Hardswish #DEFINE_ALIAS from .layer.activation import Hardtanh #DEFINE_ALIAS from .layer.activation import PReLU #DEFINE_ALIAS from .layer.activation import ReLU #DEFINE_ALIAS @@ -62,6 +63,7 @@ from .layer.activation import SELU #DEFINE_ALIAS from .layer.activation import LeakyReLU #DEFINE_ALIAS from .layer.activation import Sigmoid #DEFINE_ALIAS +from .layer.activation import Hardsigmoid #DEFINE_ALIAS from .layer.activation import LogSigmoid from .layer.activation import Softmax #DEFINE_ALIAS from .layer.activation import Softplus #DEFINE_ALIAS @@ -70,6 +72,7 @@ from .layer.activation import Tanhshrink #DEFINE_ALIAS from .layer.activation import LogSoftmax #DEFINE_ALIAS from .layer.activation import HSigmoid #DEFINE_ALIAS +from .layer.activation import Maxout #DEFINE_ALIAS from .layer.common import BilinearTensorProduct #DEFINE_ALIAS from .layer.common import Pool2D #DEFINE_ALIAS from .layer.common import Pad2D #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index d129194112a8f..d2e1832c6b637 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -29,14 +29,13 @@ __all__ += pooling.__all__ from . 
import loss __all__ += loss.__all__ -from .activation import brelu #DEFINE_ALIAS from .activation import elu #DEFINE_ALIAS from .activation import erf #DEFINE_ALIAS from .activation import gelu #DEFINE_ALIAS from .activation import hardshrink #DEFINE_ALIAS from .activation import hardtanh #DEFINE_ALIAS -from .activation import hard_sigmoid #DEFINE_ALIAS -from .activation import hard_swish #DEFINE_ALIAS +from .activation import hardsigmoid #DEFINE_ALIAS +from .activation import hardswish #DEFINE_ALIAS from .activation import hsigmoid #DEFINE_ALIAS from .activation import leaky_relu #DEFINE_ALIAS from .activation import log_sigmoid #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index f7bbe0c94e03d..2c65acb6f05a4 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -13,11 +13,7 @@ # limitations under the License. # TODO: define activation functions of neural network -from ...fluid.layers import brelu #DEFINE_ALIAS from ...fluid.layers import erf #DEFINE_ALIAS -from ...fluid.layers import hard_sigmoid #DEFINE_ALIAS -from ...fluid.layers import hard_swish #DEFINE_ALIAS -from ...fluid.layers import maxout #DEFINE_ALIAS from ...fluid.layers import soft_relu #DEFINE_ALIAS from ...fluid.layers import swish #DEFINE_ALIAS from ...fluid.layers import sigmoid #DEFINE_ALIAS @@ -25,14 +21,13 @@ from ...tensor.math import tanh #DEFINE_ALIAS __all__ = [ - 'brelu', 'elu', 'erf', 'gelu', 'hardshrink', 'hardtanh', - 'hard_sigmoid', - 'hard_swish', + 'hardsigmoid', + 'hardswish', 'hsigmoid', 'leaky_relu', 'log_sigmoid', @@ -75,10 +70,10 @@ def elu(x, alpha=1.0, name=None): alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same data type and shape as ``x`` . - + Examples: .. code-block:: python @@ -89,7 +84,7 @@ def elu(x, alpha=1.0, name=None): paddle.disable_static() x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) - out = F.elu(x, alpha=0.2) + out = F.elu(x, alpha=0.2) # [[-0.12642411 6. ] # [ 1. 15.6 ]] """ @@ -123,16 +118,16 @@ def gelu(x, approximate=False, name=None): .. math:: gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) - + Parameters: x (Tensor): The input Tensor with data type float32, float64. approximate (bool, optional): Wether to enable approximation. Default is False. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same data type and shape as ``x`` . - + Examples: .. code-block:: python @@ -265,6 +260,109 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): return out +def hardsigmoid(x, name=None): + """ + hardsigmoid activation. + + A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), + which is much faster than sigmoid. + + .. math:: + + hardsigmoid(x)= + \\left\\{ + \\begin{aligned} + &0, & & \\text{if } x \\leq -3 \\\\ + &1, & & \\text{if } x \\geq 3 \\\\ + &x/6 + 1/2, & & \\text{otherwise} + \\end{aligned} + \\right. + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-4., 5., 1.]) + out = F.hardsigmoid(x) # [0., 1., 0.666667] + """ + + if in_dygraph_mode(): + return core.ops.hard_sigmoid(x, 'slope', 0.1666666666666667, 'offset', + 0.5) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'hardsigmoid') + + helper = LayerHelper('hardsigmoid', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='hard_sigmoid', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': 0.1666666666666667, + 'offset': 0.5}) + return out + + +def hardswish(x, name=None): + """ + hardswish activation + + hardswish is proposed in MobileNetV3, and performs better in computational stability + and efficiency compared to swish function. For more details please refer + to: https://arxiv.org/pdf/1905.02244.pdf + + .. math:: + + hardswish(x)= + \\left\\{ + \\begin{aligned} + &0, & & \\text{if } x \\leq -3 \\\\ + &x, & & \\text{if } x \\geq 3 \\\\ + &\\frac{x(x+3)}{6}, & & \\text{otherwise} + \\end{aligned} + \\right. + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-4., 5., 1.]) + out = F.hardswish(x) # [0., 5., 0.666667] + """ + + if in_dygraph_mode(): + return core.ops.hard_swish(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'hardswish') + + helper = LayerHelper('hardswish', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='hard_swish', inputs={'X': x}, outputs={'Out': out}) + return out + + def hsigmoid(input, label, weight, @@ -489,7 +587,7 @@ def prelu(x, weight, name=None): assert len(weight.shape ) == 1, "The dim count of weight shape should be 1 in prelu()." - # NOTE(): The input of this API should be ``N,C,...`` format, + # NOTE(): The input of this API should be ``N,C,...`` format, # which means x.shape[0] is batch_size and x.shape[0] is channel. mode = 'all' if weight.shape[0] > 1: @@ -559,15 +657,15 @@ def log_sigmoid(x, name=None): .. math:: log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}} - + Parameters: x (Tensor): The input Tensor with data type float32, float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same data type and shape as ``x`` . - + Examples: .. code-block:: python @@ -591,6 +689,81 @@ def log_sigmoid(x, name=None): return out +def maxout(x, groups, axis=1, name=None): + """ + maxout activation. + + Assumed the input shape is (N, Ci, H, W). + The output shape is (N, Co, H, W). + Then Co = Ci/groups and the operator formula is as follows: + + .. math:: + + &out_{si+j} = \\max_{k} x_{gsi + sk + j} \\\\ + &g = groups \\\\ + &s = \\frac{input.size}{num\\_channels} \\\\ + &0 \\le i < \\frac{num\\_channels}{groups} \\\\ + &0 \\le j < s \\\\ + &0 \\le k < groups + + Parameters: + x (Tensor): The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C], the data type + of input is float32 or float64. + groups (int, optional): The groups number of maxout. `groups` specifies the + index of channel dimension where maxout will be performed. 
This must be + a factor of number of features. Default is 1. + axis (int, optional): The axis along which to perform maxout calculations. + It should be 1 when data format is NCHW, be -1 or 3 when data format + is NHWC. If ``axis`` < 0, it works the same way as :math:`axis + D` , + where D is the dimensions of ``x`` . ``axis`` only supports 1, 3 or -1. + Default is 1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.rand([1, 2, 3, 4]) + # [[[[0.5002636 0.22272532 0.17402348 0.2874594 ] + # [0.95313174 0.6228939 0.7129065 0.7087491 ] + # [0.02879342 0.88725346 0.61093384 0.38833922]] + # [[0.5231306 0.03807496 0.91661984 0.15602879] + # [0.666127 0.616567 0.30741522 0.24044901] + # [0.7142536 0.7351477 0.31588817 0.23782359]]]] + out = F.maxout(x, groups=2) + # [[[[0.5231306 0.22272532 0.91661984 0.2874594 ] + # [0.95313174 0.6228939 0.7129065 0.7087491 ] + # [0.7142536 0.88725346 0.61093384 0.38833922]]]] + """ + + if in_dygraph_mode(): + return core.ops.maxout(x, 'groups', groups, 'axis', axis) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') + if axis not in [1, -1, 3]: + raise ValueError( + "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received " + "Attr(axis): %s." % str(axis)) + if axis == -1: + axis = 3 + + helper = LayerHelper('maxout', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='maxout', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'groups': groups, + 'axis': axis}) + return out + + def relu6(x, name=None): """ relu6 activation @@ -778,7 +951,7 @@ def softmax(x, axis=-1, dtype=None, name=None): :math:`axis + D` . Default is -1. dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data type of the output tensor. If dtype is specified, ``x`` is casted - to ``dtype`` before the operation is performed. This is useful for + to ``dtype`` before the operation is performed. This is useful for preventing data type overflows. Supported dtype: float32, float64. If ``dtype`` is None, the output Tensor has the same dtype as x. Default is None. @@ -1051,13 +1224,13 @@ def log_softmax(x, axis=-1, dtype=None, name=None): :math:`axis + D` . Default is -1. dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data type of the output tensor. If dtype is specified, ``x`` is casted - to ``dtype`` before the operation is performed. This is useful for + to ``dtype`` before the operation is performed. This is useful for preventing data type overflows. Supported dtype: float32, float64. If ``dtype`` is None, the output Tensor has the same dtype as x. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same shape and data type (use ``dtype`` if it is specified) as x. 
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 585d369c607e5..b3b7bd259c744 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -18,6 +18,7 @@ 'ELU', 'GELU', 'Hardshrink', + 'Hardswish', 'Tanh', 'Hardtanh', 'PReLU', @@ -26,6 +27,7 @@ 'SELU', 'LeakyReLU', 'Sigmoid', + 'Hardsigmoid', 'Softmax', 'Softplus', 'Softshrink', @@ -33,6 +35,7 @@ 'Tanhshrink', 'LogSigmoid', 'LogSoftmax', + 'Maxout', 'HSigmoid', ] @@ -50,18 +53,18 @@ class ELU(layers.Layer): ELU Activation. .. math:: - + ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) Parameters: alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -106,11 +109,11 @@ class GELU(layers.Layer): approximate (bool, optional): Wether to enable approximation. Default is False. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -120,7 +123,7 @@ class GELU(layers.Layer): paddle.disable_static() x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]])) - + m = paddle.nn.GELU() out = m(x) # [-0.158655 0.345731 0.841345 1.39979] @@ -184,6 +187,52 @@ def forward(self, x): return F.hardshrink(x, self._threshold, self._name) +class Hardswish(layers.Layer): + """ + Hardswish activation + + Hardswish is proposed in MobileNetV3, and performs better in computational stability + and efficiency compared to swish function. For more details please refer + to: https://arxiv.org/pdf/1905.02244.pdf + + .. math:: + + Hardswish(x)= + \\left\\{ + \\begin{aligned} + &0, & & \\text{if } x \\leq -3 \\\\ + &x, & & \\text{if } x \\geq 3 \\\\ + &\\frac{x(x+3)}{6}, & & \\text{otherwise} + \\end{aligned} + \\right. + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-4., 5., 1.]) + m = paddle.nn.Hardswish() + out = m(x) # [0., 5., 0.666667] + """ + + def __init__(self, name=None): + super(Hardswish, self).__init__() + self._name = name + + def forward(self, x): + return F.hardswish(x, self._name) + + class Tanh(layers.Layer): """ Tanh Activation. @@ -240,11 +289,11 @@ class Hardtanh(layers.Layer): max (float, optional): The value of max for Hardtanh. Default is 1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -274,7 +323,7 @@ class HSigmoid(layers.Layer): :alias: paddle.nn.HSigmoid,paddle.nn.layer.HSigmoid,paddle.nn.layer.activation.HSigmoid Hierarchical Sigmoid Layer. - + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity and speed up the model training, especially the training of language model. 
Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. @@ -309,7 +358,7 @@ class HSigmoid(layers.Layer): is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and + is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and `path_code` should be passed to its forward method, otherwise `path_table` and `path_code` should not be passed to its forward method. Default: False. is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the @@ -414,19 +463,19 @@ class PReLU(layers.Layer): Parameters: num_parameters (int, optional): Number of `weight` to learn. The supported values are: - 1 - a single parameter `alpha` is used for all input channels; + 1 - a single parameter `alpha` is used for all input channels; Number of channels - a seperate `alpha` is used for each input channel. Default is 1. init (float, optional): Init value of learnable `weight`. Default is 0.25. - weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. + weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. Default is None. For more information, please refer to :ref:`api_fluid_ParamAttr`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. Default dtype is float32. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -487,7 +536,7 @@ class ReLU(layers.Layer): Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -613,11 +662,11 @@ class LeakyReLU(layers.Layer): :math:`x < 0` . Default is 0.01. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -643,11 +692,11 @@ def forward(self, x): class Sigmoid(layers.Layer): """ this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calcluate the `sigmoid` of input x. - + .. math:: Sigmoid(x) = \frac{1}{1 + e^{-x}} - + Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -656,7 +705,7 @@ class Sigmoid(layers.Layer): Returns: A callable object of Sigmoid. - + Examples: .. code-block:: python @@ -680,6 +729,53 @@ def forward(self, x): return F.sigmoid(x, self.name) +class Hardsigmoid(layers.Layer): + """ + This interface is used to construct a callable object of the ``Hardsigmoid`` class. + This layer calcluate the `hardsigmoid` of input x. + + A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), + which is much faster than sigmoid. + + .. math:: + + Hardsigmoid(x)= + \\left\\{ + \\begin{aligned} + &0, & & \\text{if } x \\leq -3 \\\\ + &1, & & \\text{if } x \\geq 3 \\\\ + &x/6 + 1/2, & & \\text{otherwise} + \\end{aligned} + \\right. + + Parameters: + name (str, optional): Name for the operation (optional, default is None). 
For more information, please refer to :ref:`api_guide_Name`. + + Shape: + x: N-D tensor, available dtype is float32, float64. + + Returns: + A callable object of Hardsigmoid. + + Examples: + + .. code-block:: python + + import paddle + + m = paddle.nn.Sigmoid() + x = paddle.to_tensor([-4., 5., 1.]) + out = m(x) # [0., 1, 0.666667] + """ + + def __init__(self, name=None): + super(Hardsigmoid, self).__init__() + self.name = name + + def forward(self, x): + return F.hardsigmoid(x, self.name) + + class Softplus(layers.Layer): """ Softplus Activation @@ -842,7 +938,7 @@ def forward(self, x): class LogSigmoid(layers.Layer): """ LogSigmoid Activation. - + .. math:: LogSigmoid(x) = log \\frac{1}{1 + e^{-x}} @@ -851,11 +947,11 @@ class LogSigmoid(layers.Layer): x (Tensor): The input Tensor with data type float32, or float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -961,7 +1057,7 @@ class Softmax(layers.Layer): :math:`axis + D` . Default is -1. dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data type of the output tensor. If dtype is specified, ``x`` is casted - to ``dtype`` before the operation is performed. This is useful for + to ``dtype`` before the operation is performed. This is useful for preventing data type overflows. Supported dtype: float32, float64. If ``dtype`` is None, the output Tensor has the same dtype as x. Default is None. @@ -1013,7 +1109,7 @@ class LogSoftmax(layers.Layer): .. math:: - Out[i, j] = log(softmax(x)) + Out[i, j] = log(softmax(x)) = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])}) Parameters: @@ -1023,7 +1119,7 @@ class LogSoftmax(layers.Layer): same way as :math:`axis + D` . Default is -1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. @@ -1060,3 +1156,64 @@ def __init__(self, axis=-1, name=None): def forward(self, x): return F.log_softmax(x, self._axis) + + +class Maxout(layers.Layer): + """ + Maxout Activation. + + Assumed the input shape is (N, Ci, H, W). + The output shape is (N, Co, H, W). + Then Co = Ci/groups and the operator formula is as follows: + + .. math:: + + &out_{si+j} = \max_{k} x_{gsi + sk + j} \\\\ + &g = groups \\\\ + &s = \\frac{input.size}{num\\_channels} \\\\ + &0 \\le i < \\frac{num\\_channels}{groups} \\\\ + &0 \\le j < s \\\\ + &0 \\le k < groups + + Parameters: + groups (int, optional): The groups number of maxout. `groups` specifies the + index of channel dimension where maxout will be performed. This must be + a factor of number of features. Default is 1. + axis (int, optional): The axis along which to perform maxout calculations. + It should be 1 when data format is NCHW, be -1 or 3 when data format + is NHWC. If ``axis`` < 0, it works the same way as :math:`axis + D` , + where D is the dimensions of ``x`` . Default is 1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: :math:`(N, C_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.rand([1, 2, 3, 4]) + # [[[[0.5002636 0.22272532 0.17402348 0.2874594 ] + # [0.95313174 0.6228939 0.7129065 0.7087491 ] + # [0.02879342 0.88725346 0.61093384 0.38833922]] + # [[0.5231306 0.03807496 0.91661984 0.15602879] + # [0.666127 0.616567 0.30741522 0.24044901] + # [0.7142536 0.7351477 0.31588817 0.23782359]]]] + m = paddle.nn.Maxout(groups=2) + out = m(x) + # [[[[0.5231306 0.22272532 0.91661984 0.2874594 ] + # [0.95313174 0.6228939 0.7129065 0.7087491 ] + # [0.7142536 0.88725346 0.61093384 0.38833922]]]] + """ + + def __init__(self, groups, axis=1, name=None): + super(Maxout, self).__init__() + self._groups = groups + self._axis = axis + self._name = name + + def forward(self, x): + return F.maxout(x, self._groups, self._axis, self._name) From bf187c7577f588dbc4f759754e93a217e3958eef Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Sat, 10 Oct 2020 15:58:28 +0800 Subject: [PATCH 57/91] Polish the documentation and examples of paddle.static.nn.fc. (#27768) * Reimplement paddle.static.nn.fc, mainly change the parameters and English documentation. * Polish the documentation format. --- .../fluid/tests/unittests/test_fc_op.py | 6 +- python/paddle/static/nn/__init__.py | 3 +- python/paddle/static/nn/common.py | 165 ++++++++++++++++++ 3 files changed, 170 insertions(+), 4 deletions(-) create mode 100644 python/paddle/static/nn/common.py diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index ec30cb70c5790..1272d82dfdd1d 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -149,9 +149,9 @@ def run_program(num_flatten_dims): append_batch_size=False, dtype="float32") - out = fluid.layers.fc(input=x, - size=1, - num_flatten_dims=num_flatten_dims) + out = paddle.static.nn.fc(x=x, + size=1, + num_flatten_dims=num_flatten_dims) place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 31a99f6282718..d50bb33f24001 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -39,7 +39,8 @@ 'switch_case', ] -from ...fluid.layers import fc #DEFINE_ALIAS +from .common import fc #DEFINE_ALIAS + from ...fluid.layers import batch_norm #DEFINE_ALIAS from ...fluid.layers import bilinear_tensor_product #DEFINE_ALIAS from ...fluid.layers import case #DEFINE_ALIAS diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py new file mode 100644 index 0000000000000..59ffacdaebed5 --- /dev/null +++ b/python/paddle/static/nn/common.py @@ -0,0 +1,165 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle.fluid.framework import static_only + +__all__ = ['fc'] + + +@static_only +def fc(x, + size, + num_flatten_dims=1, + weight_attr=None, + bias_attr=None, + activation=None, + name=None): + """ + + Fully-Connected layer can take a tensor or a list of tensor as its inputs. + It creates a 2-D weight tensor for each input tensor, which represents its + weight matrix from each input unit to each output unit. The fully connected + layer multiplies each input tensor with its corresponding weight to produce + an output tensor with shape :math:`[batch\_size, *, size]` , where :math:`*` + means any number of additional dimensions. If a list of tensor is given, + the results of multiple output tensors with shape :math:`[batch\_size, *, size]` + will be summed up. If :attr:`bias_attr` is not False, a 1-D bias tensor will + be created and added to the output. Finally, if :attr:`activation` is not None, + it will be applied to the output as well. + + For a single input tensor :math:`X` , the equation is: + + .. math:: + + Out = Act({XW + b}) + + For a list of input tensor, the equation is: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + + where: + + * :math:`N`: The number of the input tensors. :math:`N` equals to :math:`len(X)` if :math:`X` is list of tensor. + * :math:`X_i`: The i-th input tensor. + * :math:`W_i`: The i-th weight matrix corresponding i-th input tensor. + * :math:`b`: The bias created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + + .. code-block:: text + + # Case 1, input is a single tensor: + x.data = [[[0.1, 0.2], + [0.3, 0.4]]] + x.shape = (1, 2, 2) # 1 is batch_size + + out = paddle.static.nn.fc(x=x, size=1, num_flatten_dims=2) + + # Get the output: + out.data = [[0.83234344], [0.34936576]] + out.shape = (1, 2, 1) + + # Case 2, input is a list of tensor: + x0.data = [[[0.1, 0.2], + [0.3, 0.4]]] + x0.shape = (1, 2, 2) # 1 is batch_size + + x1.data = [[[0.1, 0.2, 0.3]]] + x1.shape = (1, 1, 3) + + out = paddle.static.nn.fc(x=[x0, x1], size=2) + + # Get the output: + out.data = [[0.18669507, 0.1893476]] + out.shape = (1, 2) + + Args: + x (Tensor|list of Tensor): A tensor or a list of tensor. The number of dimensions + of each tensor is at least 2. The data type should be float16, float32 or float64. + size (int): The number of output units in this layer, which also means the feature + size of output tensor. + num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than + two dimensions. If this happens, the multi-dimensional tensor will first be flattened + into a 2-D matrix. The parameter :attr:`num_flatten_dims` determines how the input + tensor is flattened: the first :math:`num\_flatten\_dims` (inclusive, index starts from 1) + dimensions will be flatten to form the first dimension of the final matrix (height of + the matrix), and the rest :math:`rank(x) - num\_flatten\_dims` dimensions are + flattened to form the second dimension of the final matrix (width of the matrix). + For example, assuming that :attr:`x` is a 5-dimensional tensor with a shape + :math:`[2, 3, 4, 5, 6]` , and :attr:`num_flatten_dims` = 3. + Then, the flattened matrix will have a shape :math:`[2 * 3 * 4, 5 * 6] = [24, 30]` . + Default: 1. + weight_attr (ParamAttr, optional): The attribute for the learnable weight. + The default value is None, and the weight will be initialized to zero. + For detailed information, please refer to :attr:`paddle.ParamAttr`. 
+ bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. + If it is set to False, no bias will be added to the output. + If it is set to None or one kind of ParamAttr, a bias parameter will + be created according to ParamAttr. For detailed information, please refer + to :attr:`paddle.ParamAttr`. The default value is None and the bias will be + initialized to zero. + activation (str, optional): Activation to be applied to the output of + this layer, such as tanh, softmax, sigmoid, relu. For more information, + please refer to :ref:`api_guide_activations_en` . Default: None. + name (str, optional): The default value is None. Normally there is no need for user to set + it. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor, its shape is :math:`[batch\_size, *, size]` , and the data type is same with input. + + Raises: + ValueError: If dimensions of the input tensor is less than 2. + + Examples: + .. code-block:: python + + import paddle + paddle.enable_static() + + # When input is a single tensor + x = paddle.static.data(name="x", shape=[1, 2, 2], dtype="float32") + # x: [[[0.1 0.2] + # [0.3 0.4]]] + out = paddle.static.nn.fc( + x=x, + size=1, + num_flatten_dims=2, + weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=0.5)), + bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0))) + # out: [[[1.15] + # [1.35]]] + + # When input is multiple tensors + x0 = paddle.static.data(name="x0", shape=[1, 2, 2], dtype="float32") + # x0: [[[0.1 0.2] + # [0.3 0.4]]] + x1 = paddle.static.data(name="x1", shape=[1, 1, 3], dtype="float32") + # x1: [[[0.1 0.2 0.3]]] + out = paddle.static.nn.fc( + x=[x0, x1], + size=2, + weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=0.5)), + bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0))) + # out: [[1.8 1.8]] + """ + return paddle.fluid.layers.fc(input=x, + size=size, + num_flatten_dims=num_flatten_dims, + param_attr=weight_attr, + bias_attr=bias_attr, + act=activation, + name=name) From 1bada985fcf7931a1cf09c2e8e55344ca5defbb9 Mon Sep 17 00:00:00 2001 From: iducn <45056973+iducn@users.noreply.github.com> Date: Sat, 10 Oct 2020 16:20:43 +0800 Subject: [PATCH 58/91] Modify the output information of the shell script,test=document_fix (#27805) --- tools/get_cpu_info.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh index a1881f551da1c..81eb19dc0661e 100755 --- a/tools/get_cpu_info.sh +++ b/tools/get_cpu_info.sh @@ -36,7 +36,7 @@ if [ $numa_nodes -lt $sockets ]; then fi echo "********** Software Information **********" -echo "OS Version : `cat /proc/version`" +echo "OS Version : `uname -o`" echo "Kernel Release Version : `uname -r`" echo "Kernel Patch Version : `uname -v`" echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" From c425cf182210b70be955ca8b4f6d5c4ddd612b4d Mon Sep 17 00:00:00 2001 From: Wilber Date: Sat, 10 Oct 2020 17:18:30 +0800 Subject: [PATCH 59/91] [API 2.0]Update 2.0 api from fluid to paddle (#27802) --- python/paddle/fluid/executor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 7d067b6347844..f5660c3fc91a1 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -54,11 +54,11 @@ def global_scope(): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy - fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) - numpy.array(fluid.global_scope().find_var("data").get_tensor()) + paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) + numpy.array(paddle.static.global_scope().find_var("data").get_tensor()) """ return g_scope From ad99e638fd0ebcdaf906b000371fbcd72cb74c65 Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Sat, 10 Oct 2020 17:58:17 +0800 Subject: [PATCH 60/91] add double grad op for matmul (#27776) * add matmul doublegrad op * fix compile errors * modify code according to review * delete float16 --- paddle/fluid/operators/matmul_op.cc | 244 +++++++++++++++++- .../fluid/tests/unittests/test_nn_grad.py | 32 +++ 2 files changed, 275 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 809164df2056c..129298edafcf9 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -348,6 +348,181 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, return dim; } +template +class MatMulDoubleGradKernel : public framework::OpKernel { + public: + void MatMul(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + const framework::Tensor &b, bool trans_b, bool flag, + framework::Tensor *out) const { + out->mutable_data(context.GetPlace()); + auto blas = math::GetBlas(context); + auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); + auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); + + int head_number = 1; +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) + head_number = context.Attr("head_number"); +#endif + + if (head_number <= 1 && a.dims().size() == 3 && b.dims().size() <= 2) { + // the transpose_X must be false, if is true, the transpose cost much time + if (!trans_a) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } + } + blas.MatMul(a, mat_dim_a, b, mat_dim_b, + static_cast(context.Attr("alpha")), out, + static_cast(flag)); + } + + void CalcInputGrad(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + bool is_fold_init_dims_a, const framework::Tensor &b, + bool trans_b, bool is_fold_init_dims_b, bool flag, + framework::Tensor *out) const { + if (out == nullptr) return; + bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && + out->dims().size() == 2; + if (!need_combine) { + MatMul(context, a, trans_a, b, trans_b, flag, out); + } else { + auto &ctx = context.template device_context(); + MatMul(context, is_fold_init_dims_a + ? FoldInitDims(a) + : FoldHeadAndLastDims(ctx, a), + trans_a, is_fold_init_dims_b + ? 
FoldInitDims(b) + : FoldHeadAndLastDims(ctx, b), + trans_b, flag, out); + } + } + + void Compute(const framework::ExecutionContext &context) const override { + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = *context.Input("DOut"); + auto *ddx = context.Input("DDX"); + auto *ddy = context.Input("DDY"); + + auto *dx = context.Output("DX"); + auto *dy = context.Output("DY"); + auto *ddout = context.Output("DDOut"); + + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + framework::DDim ddout_dims; + if (ddout) { + ddout_dims = ddout->dims(); + if (ddout_dims != dout.dims()) { + ddout->Resize(dout.dims()); + } + } + + bool ddout_flag = false; + if (ddx) { + auto ddx_mat = *ddx; + if (ddx_mat.dims() != x.dims()) { + ddx_mat.Resize(x.dims()); + } + if (dy) { + if (transpose_x && transpose_y) { + // dy = dout' * ddx' + CalcInputGrad(context, dout, true, true, ddx_mat, true, false, false, + dy); + } else if (transpose_x) { + // dy = ddx * dout + CalcInputGrad(context, ddx_mat, false, false, dout, false, true, + false, dy); + } else if (transpose_y) { + // dy = dout' * ddx + CalcInputGrad(context, dout, true, true, ddx_mat, false, true, false, + dy); + } else { + // dy = ddx' * dout + CalcInputGrad(context, ddx_mat, true, true, dout, false, true, false, + dy); + } + } + + if (ddout) { + CalcInputGrad(context, ddx_mat, transpose_x, true, y, transpose_y, + false, ddout_flag, ddout); + ddout_flag = true; + } + } + + if (ddy) { + auto ddy_mat = *ddy; + if (ddy_mat.dims() != y.dims()) { + ddy_mat.Resize(y.dims()); + } + if (dx) { + if (transpose_x && transpose_y) { + // dx = ddy' * dout' + CalcInputGrad(context, ddy_mat, true, true, dout, true, false, false, + dx); + } else if (transpose_x) { + // dx = ddy * dout' + CalcInputGrad(context, ddy_mat, false, false, dout, true, false, + false, dx); + } else if (transpose_y) { + // dx = dout * ddy + CalcInputGrad(context, dout, false, false, ddy_mat, false, true, + false, dx); + } else { + // dx = dout * ddy' + CalcInputGrad(context, dout, false, false, ddy_mat, true, false, + false, dx); + } + } + + if (ddout) { + CalcInputGrad(context, x, transpose_x, true, ddy_mat, transpose_y, + false, ddout_flag, ddout); + } + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + + if (ddout) { + if (ddout_dims != dout.dims()) { + ddout->Resize(ddout_dims); + } + } + } +}; + class MatMulOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -647,6 +822,61 @@ class MatMulOpGradMaker : public framework::SingleGradOpMaker { retv->SetAttrMap(this->Attrs()); } }; + +class MatMulOpDoubleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "matmul"); + OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul"); + OP_INOUT_CHECK(context->HasInput("DOut"), "Input", "DOut", "matmul"); + + if (context->HasOutput("DX") && 
context->HasInput("DDY")) { + context->ShareDim("X", "DX"); + } + + if (context->HasOutput("DY") && context->HasInput("DDX")) { + context->ShareDim("Y", "DY"); + } + + if (context->HasOutput("DDOut") && + (context->HasInput("DDY") || context->HasInput("DDX"))) { + context->ShareDim("DOut", "DDOut"); + } + } +}; + +template +class MatMulOpDoubleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("matmul_grad_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput("Y", this->Input("Y")); + retv->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + retv->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + retv->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y"))); + + auto ddx = this->OutputGrad(framework::GradVarName("X")); + auto ddy = this->OutputGrad(framework::GradVarName("Y")); + + if (!ddx.empty() || !ddy.empty()) { + retv->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } + retv->SetOutput( + "DX", ddy.empty() ? this->EmptyInputGrad() : this->InputGrad("X")); + retv->SetOutput( + "DY", ddx.empty() ? this->EmptyInputGrad() : this->InputGrad("Y")); + + retv->SetAttrMap(this->Attrs()); + } +}; + } // namespace operators } // namespace paddle @@ -654,7 +884,10 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker, ops::MatMulOpGradMaker, ops::MatMulOpGradMaker); -REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad); +REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad, + ops::MatMulOpDoubleGradMaker, + ops::MatMulOpDoubleGradMaker); +REGISTER_OPERATOR(matmul_grad_grad, ops::MatMulOpDoubleGrad); REGISTER_OP_CPU_KERNEL( matmul, ops::MatMulKernel, ops::MatMulKernel); @@ -663,6 +896,11 @@ REGISTER_OP_CPU_KERNEL( ops::MatMulGradKernel, ops::MatMulGradKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); + #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, @@ -675,4 +913,8 @@ REGISTER_OP_CUDA_KERNEL( ops::MatMulGradKernel, ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); #endif diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 5d1e016287e07..bf1955c5711f5 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -153,6 +153,38 @@ def test_grad(self): self.func(p) +class TestMatmulDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + eps = 0.005 + x_shapes = [[2], [2, 3], [2, 4, 3], [2, 3, 4, 5], [2, 3, 4]] + y_shapes = [[2], [3, 2], [2, 4, 5], [2, 3, 3, 5], [4, 3]] + transpose_xs = [False, True, True, False, False] + transpose_ys = [False, True, False, True, False] + dtypes = [np.float64, np.float64, np.float32, np.float32, np.float64] + typenames = ["float64", "float64", "float32", "float32", "float64"] + for i, (x_shape, y_shape, transpose_x, transpose_y, dtype, typename) \ + in enumerate(zip(x_shapes, y_shapes, transpose_xs, transpose_ys, dtypes, typenames)): + x = layers.create_parameter( + dtype=typename, shape=x_shape, name='x{}'.format(i)) + y = layers.create_parameter( + dtype=typename, shape=y_shape, name='y{}'.format(i)) + out = layers.matmul( + x, y, transpose_x, transpose_y, name='out{}'.format(i)) + + 
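+            # Feed random inputs and let double_grad_check numerically verify
+            # the second-order gradients registered by matmul_grad_grad.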
x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + y_arr = np.random.uniform(-1, 1, y_shape).astype(dtype) + gradient_checker.double_grad_check( + [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestReshapeDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): From a2d08aa916823b38ad71840248df7cd6d6aa7838 Mon Sep 17 00:00:00 2001 From: Wilber Date: Sun, 11 Oct 2020 09:10:42 +0800 Subject: [PATCH 61/91] update for windows compile. (#27813) --- cmake/init.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/init.cmake b/cmake/init.cmake index 902dfb11fc0af..5f36a9adf1ae6 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -28,5 +28,6 @@ endif() if(WIN32) set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Os -DNDEBUG") endif() From 74d3a55072a8b3ec1172fbe951e0a7ce9658568d Mon Sep 17 00:00:00 2001 From: hong19860320 <9973393+hong19860320@users.noreply.github.com> Date: Sun, 11 Oct 2020 21:51:31 +0800 Subject: [PATCH 62/91] Add Swish and ThresholdedReLU for API 2.0 (#27758) --- .../tests/unittests/test_activation_op.py | 153 ++++++++++++++---- python/paddle/nn/__init__.py | 2 + python/paddle/nn/functional/activation.py | 101 ++++++++++-- python/paddle/nn/layer/activation.py | 89 ++++++++-- 4 files changed, 288 insertions(+), 57 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 6b729e6297bed..ac3d0a3a78562 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -1979,22 +1979,24 @@ def test_errors(self): F.softsign(x_fp16) +def ref_thresholded_relu(x, threshold=1.0): + out = (x > threshold) * x + return out + + class TestThresholdedRelu(TestActivation): def setUp(self): self.op_type = "thresholded_relu" self.init_dtype() - threshold = 0.25 - self.delta = 0.005 - np.random.seed(1024) - X = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) - - # Same reason as TestAbs - X[np.abs(X - threshold) < self.delta] = threshold + 0.2 - out = (X > threshold) * X + threshold = 15 - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)} - self.attrs = {'threshold': threshold} + np.random.seed(1024) + x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_thresholded_relu(x, threshold) + self.inputs = {'X': x} + self.attrs = {"threshold": threshold} self.outputs = {'Out': out} def test_check_grad(self): @@ -2003,17 +2005,61 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestThresholdedReluOpError(unittest.TestCase): +class TestThresholdedReluAPI(unittest.TestCase): + # test paddle.nn.ThresholdedReLU, paddle.nn.functional.thresholded_relu + def setUp(self): + self.threshold = 15 + np.random.seed(1024) + self.x_np = np.random.uniform(-20, 20, [10, 12]).astype(np.float64) + self.x_np[np.abs(self.x_np) < 0.005] = 0.02 + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.thresholded_relu(x, self.threshold) + thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold) + out2 = 
thresholded_relu(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_thresholded_relu(self.x_np, self.threshold) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.thresholded_relu(x, self.threshold) + thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold) + out2 = thresholded_relu(x) + out_ref = ref_thresholded_relu(self.x_np, self.threshold) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + paddle.enable_static() + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.thresholded_relu(x, self.threshold) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_thresholded_relu(self.x_np, self.threshold) + self.assertEqual(np.allclose(out_ref, res[0]), True) + def test_errors(self): - with program_guard(Program()): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1) + self.assertRaises(TypeError, F.thresholded_relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.thresholded_relu, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.thresholded_relu, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.thresholded_relu(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.thresholded_relu(x_fp16) def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): @@ -2115,37 +2161,82 @@ def test_errors(self): F.hardsigmoid(x_fp16) +def ref_swish(x): + out = x * expit(x) + return out + + class TestSwish(TestActivation): def setUp(self): self.op_type = "swish" self.init_dtype() np.random.seed(1024) - X = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) - beta = 2.3 - out = X * expit(beta * X) - - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)} - self.attrs = {'beta': beta} + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_swish(x) + self.inputs = {'X': x} + self.attrs = {'slope': 1.0} self.outputs = {'Out': out} def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', max_relative_error=0.008) + self.check_grad(['X'], 'Out') + +class TestSwishAPI(unittest.TestCase): + # test paddle.nn.Swish, paddle.nn.functional.swish + def setUp(self): + np.random.seed(1024) + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.swish(x) + swish = paddle.nn.Swish() + out2 = swish(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_swish(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + 
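+        # Repeat the check in dygraph mode for both F.swish and paddle.nn.Swish.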
paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.swish(x) + swish = paddle.nn.Swish() + out2 = swish(x) + out_ref = ref_swish(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + paddle.enable_static() + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.swish(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_swish(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) -class TestSwishOpError(unittest.TestCase): def test_errors(self): - with program_guard(Program()): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.swish, 1) + self.assertRaises(TypeError, F.swish, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.swish, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.swish, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.swish(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.swish(x_fp16) #------------------ Test Error Activation---------------------- diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index c788727ab97e5..b16e95b7130f9 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -69,7 +69,9 @@ from .layer.activation import Softplus #DEFINE_ALIAS from .layer.activation import Softshrink #DEFINE_ALIAS from .layer.activation import Softsign #DEFINE_ALIAS +from .layer.activation import Swish #DEFINE_ALIAS from .layer.activation import Tanhshrink #DEFINE_ALIAS +from .layer.activation import ThresholdedReLU #DEFINE_ALIAS from .layer.activation import LogSoftmax #DEFINE_ALIAS from .layer.activation import HSigmoid #DEFINE_ALIAS from .layer.activation import Maxout #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 2c65acb6f05a4..53fa9814e6ef0 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -15,9 +15,7 @@ # TODO: define activation functions of neural network from ...fluid.layers import erf #DEFINE_ALIAS from ...fluid.layers import soft_relu #DEFINE_ALIAS -from ...fluid.layers import swish #DEFINE_ALIAS from ...fluid.layers import sigmoid #DEFINE_ALIAS -from ...fluid.layers import thresholded_relu #DEFINE_ALIAS from ...tensor.math import tanh #DEFINE_ALIAS __all__ = [ @@ -787,8 +785,6 @@ def relu6(x, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-1, 0.3, 6.5])) out = F.relu6(x) # [0, 0.3, 6] """ @@ -839,8 +835,6 @@ def selu(x, import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]])) out = F.selu(x) # [[0, 1.050701],[2.101402, 3.152103]] """ @@ -1054,8 +1048,6 @@ def softplus(x, beta=1, threshold=20, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.softplus(x) # [0.513015, 0.598139, 
0.744397, 0.854355] """ @@ -1103,8 +1095,6 @@ def softshrink(x, threshold=0.5, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8])) out = F.softshrink(x) # [-0.4, 0, 0, 0.3] """ @@ -1151,8 +1141,6 @@ def softsign(x, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] """ @@ -1167,6 +1155,47 @@ def softsign(x, name=None): return out +def swish(x, name=None): + """ + swish activation. + + .. math:: + + swish(x) = \\frac{x}{1 + e^{-x}} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + x = paddle.to_tensor(np.array([-2., 0., 1.])) + out = F.swish(x) # [-0.238406, 0., 0.731059] + """ + + if in_dygraph_mode(): + return core.ops.swish(x, 'slop', 1.0) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') + helper = LayerHelper('swish', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='swish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': 1.0}) + return out + + def tanhshrink(x, name=None): """ tanhshrink activation @@ -1190,8 +1219,6 @@ def tanhshrink(x, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """ @@ -1206,6 +1233,52 @@ def tanhshrink(x, name=None): return out +def thresholded_relu(x, threshold=1.0, name=None): + """ + thresholded relu activation. + + .. math:: + + thresholded\\_relu(x) = \\begin{cases} + x, \\text{if } x > threshold \\\\ + 0, \\text{otherwise} + \\end{cases} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + threshold (float, optional): The value of threshold for thresholded_relu. Default is 1.0 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + x = paddle.to_tensor(np.array([2., 0., 1.])) + out = F.thresholded_relu(x) # [2., 0., 0.] + """ + + if in_dygraph_mode(): + return core.ops.thresholded_relu(x, 'threshold', threshold) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'thresholded_relu') + helper = LayerHelper('thresholded_relu', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='thresholded_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}) + return out + + def log_softmax(x, axis=-1, dtype=None, name=None): """ This operator implements the log_softmax layer. 
The calculation process is diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index b3b7bd259c744..cd17f26e09e37 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -32,7 +32,9 @@ 'Softplus', 'Softshrink', 'Softsign', + 'Swish', 'Tanhshrink', + 'ThresholdedReLU', 'LogSigmoid', 'LogSoftmax', 'Maxout', @@ -580,8 +582,6 @@ class ReLU6(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-1, 0.3, 6.5])) m = paddle.nn.ReLU6() out = m(x) # [0, 0.3, 6] @@ -623,8 +623,6 @@ class SELU(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]])) m = paddle.nn.SELU() out = m(x) # [[0, 1.050701],[2.101402, 3.152103]] @@ -801,8 +799,6 @@ class Softplus(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) m = paddle.nn.Softplus() out = m(x) # [0.513015, 0.598139, 0.744397, 0.854355] @@ -845,8 +841,6 @@ class Softshrink(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8])) m = paddle.nn.Softshrink() out = m(x) # [-0.4, 0, 0, 0.3] @@ -883,8 +877,6 @@ class Softsign(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) m = paddle.nn.Softsign() out = m(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] @@ -898,6 +890,41 @@ def forward(self, x): return F.softsign(x, self._name) +class Swish(layers.Layer): + """ + Swish Activation. + + .. math:: + + Swish(x) = \\frac{x}{1 + e^{-x}} + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x = paddle.to_tensor(np.array([-2., 0., 1.])) + m = paddle.nn.Swish() + out = m(x) # [-0.238406, 0., 0.731059] + """ + + def __init__(self, name=None): + super(Swish, self).__init__() + self._name = name + + def forward(self, x): + return F.swish(x, self._name) + + class Tanhshrink(layers.Layer): """ Tanhshrink Activation @@ -920,8 +947,6 @@ class Tanhshrink(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) m = paddle.nn.Tanhshrink() out = m(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] @@ -935,6 +960,46 @@ def forward(self, x): return F.tanhshrink(x, self._name) +class ThresholdedReLU(layers.Layer): + """ + Thresholded ReLU Activation + + .. math:: + + ThresholdedReLU(x) = \\begin{cases} + x, \\text{if } x > threshold \\\\ + 0, \\text{otherwise} + \\end{cases} + + Parameters: + threshold (float, optional): The value of threshold for ThresholdedReLU. Default is 1.0 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x = paddle.to_tensor(np.array([2., 0., 1.])) + m = paddle.nn.ThresholdedReLU() + out = m(x) # [2., 0., 0.] 
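+            # only values strictly greater than the default threshold (1.0)
+            # are kept, so the trailing 1. is mapped to 0.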
+ """ + + def __init__(self, threshold=1.0, name=None): + super(ThresholdedReLU, self).__init__() + self._threshold = threshold + self._name = name + + def forward(self, x): + return F.thresholded_relu(x, self._threshold, self._name) + + class LogSigmoid(layers.Layer): """ LogSigmoid Activation. From 9b49f02441254cdbf6cf503668f43c549ca3eb01 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 11 Oct 2020 23:12:19 +0800 Subject: [PATCH 63/91] Polish jit.save/load design & remove paddle.SaveLoadConfig (#27623) * replace config by kwargs * change save path form dir to prefix * fix failed unittests * revert unittest name change * polish en docs * add more tests for coverage --- python/paddle/__init__.py | 1 - .../contrib/slim/tests/test_imperative_qat.py | 20 +- .../tests/test_imperative_qat_channelwise.py | 20 +- python/paddle/fluid/dygraph/checkpoint.py | 73 +-- python/paddle/fluid/dygraph/io.py | 59 +- python/paddle/fluid/dygraph/jit.py | 602 ++++++------------ python/paddle/fluid/dygraph/static_runner.py | 4 +- .../dygraph_to_static/predictor_utils.py | 5 +- .../unittests/dygraph_to_static/test_bert.py | 24 +- .../unittests/dygraph_to_static/test_bmn.py | 21 +- .../unittests/dygraph_to_static/test_lac.py | 24 +- .../unittests/dygraph_to_static/test_mnist.py | 31 +- .../dygraph_to_static/test_mobile_net.py | 28 +- .../dygraph_to_static/test_resnet.py | 21 +- .../dygraph_to_static/test_resnet_v2.py | 19 +- .../test_save_inference_model.py | 45 +- .../dygraph_to_static/test_se_resnet.py | 26 +- .../dygraph_to_static/test_transformer.py | 1 + .../dygraph_to_static/transformer_util.py | 7 +- .../unittests/test_directory_migration.py | 21 +- .../unittests/test_imperative_save_load.py | 5 - .../tests/unittests/test_jit_save_load.py | 207 +++--- .../test_load_state_dict_from_old_format.py | 32 +- python/paddle/framework/__init__.py | 5 +- python/paddle/framework/io.py | 179 ++++-- 25 files changed, 665 insertions(+), 815 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 3c52bbdcccaf8..0af32da4e690b 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -235,7 +235,6 @@ from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS -from .framework import SaveLoadConfig #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index df505cf2435e7..eb924e13a7e4f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -31,6 +31,7 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.dygraph.nn import Linear from paddle.fluid.log_helper import get_logger +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX paddle.enable_static() @@ -231,10 +232,11 @@ def test_qat_save(self): before_save = lenet(test_img) # save inference quantized model - path = "./mnist_infer_model" + path = "./qat_infer_model/lenet" + save_dir = "./qat_infer_model" paddle.jit.save( layer=lenet, - model_path=path, + path=path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') @@ -245,12 +247,12 @@ def test_qat_save(self): else: place = core.CPUPlace() exe = fluid.Executor(place) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - 
dirname=path, - executor=exe, - model_filename="__model__", - params_filename="__variables__")) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + dirname=save_dir, + executor=exe, + model_filename="lenet" + INFER_MODEL_SUFFIX, + params_filename="lenet" + INFER_PARAMS_SUFFIX) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) @@ -339,7 +341,7 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000): paddle.jit.save( layer=lenet, - model_path="./dynamic_mnist", + path="./dynamic_mnist/model", input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py index 80d388ac0da62..ddf37a0ebf8c2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -31,6 +31,7 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.dygraph.nn import Linear from paddle.fluid.log_helper import get_logger +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX paddle.enable_static() @@ -231,10 +232,11 @@ def test_qat_save(self): before_save = lenet(test_img) # save inference quantized model - path = "./mnist_infer_model" + path = "./qat_infer_model/mnist" + save_dir = "./qat_infer_model" paddle.jit.save( layer=lenet, - model_path=path, + path=path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') @@ -245,12 +247,12 @@ def test_qat_save(self): else: place = core.CPUPlace() exe = fluid.Executor(place) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=path, - executor=exe, - model_filename="__model__", - params_filename="__variables__")) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + dirname=save_dir, + executor=exe, + model_filename="mnist" + INFER_MODEL_SUFFIX, + params_filename="mnist" + INFER_PARAMS_SUFFIX) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) @@ -339,7 +341,7 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000): paddle.jit.save( layer=lenet, - model_path="./dynamic_mnist", + path="./dynamic_mnist/model", input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f4ea4d670e600..fb87ea4455d34 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -24,8 +24,8 @@ import warnings from .. 
import core from .base import guard -from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs -from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.jit import _SaveLoadConfig +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers __all__ = [ 'save_dygraph', @@ -33,35 +33,23 @@ ] -# NOTE(chenweihang): deprecate load_dygraph's argument keep_name_table, -# ensure compatibility when user still use keep_name_table argument -def deprecate_keep_name_table(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - def __warn_and_build_configs__(keep_name_table): - warnings.warn( - "The argument `keep_name_table` has deprecated, please use `SaveLoadConfig.keep_name_table`.", - DeprecationWarning) - config = SaveLoadConfig() - config.keep_name_table = keep_name_table - return config - - # deal with arg `keep_name_table` - if len(args) > 1 and isinstance(args[1], bool): - args = list(args) - args[1] = __warn_and_build_configs__(args[1]) - # deal with kwargs - elif 'keep_name_table' in kwargs: - kwargs['config'] = __warn_and_build_configs__(kwargs[ - 'keep_name_table']) - kwargs.pop('keep_name_table') - else: - # do nothing - pass +def _parse_load_config(configs): + supported_configs = ['model_filename', 'params_filename', 'keep_name_table'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.fluid.load_dygraph` is not supported." + % (key)) - return func(*args, **kwargs) + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.model_filename = configs.get('model_filename', None) + inner_config.params_filename = configs.get('params_filename', None) + inner_config.keep_name_table = configs.get('keep_name_table', None) - return wrapper + return inner_config @dygraph_only @@ -132,12 +120,12 @@ def save_dygraph(state_dict, model_path): pickle.dump(model_dict, f, protocol=2) +# NOTE(chenweihang): load_dygraph will deprecated in future, we don't +# support new loading features for it # TODO(qingqing01): remove dygraph_only to support loading static model. # maybe need to unify the loading interface after 2.0 API is ready. # @dygraph_only -@deprecate_save_load_configs -@deprecate_keep_name_table -def load_dygraph(model_path, config=None): +def load_dygraph(model_path, **configs): ''' :api_attr: imperative @@ -152,10 +140,13 @@ def load_dygraph(model_path, config=None): Args: model_path(str) : The file prefix store the state_dict. (The path should Not contain suffix '.pdparams') - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` - object that specifies additional configuration options, these options - are for compatibility with ``jit.save/io.save_inference_model`` formats. - Default None. + **configs (dict, optional): other save configuration options for compatibility. We do not + recommend using these configurations, if not necessary, DO NOT use them. Default None. + The following options are currently supported: + (1) model_filename (string): The inference model file name of the paddle 1.x ``save_inference_model`` + save format. Default file name is :code:`__model__` . + (2) params_filename (string): The persistable variables file name of the paddle 1.x ``save_inference_model`` + save format. No default file name, save variables separately by default. 
Returns: state_dict(dict) : the dict store the state_dict @@ -196,8 +187,7 @@ def load_dygraph(model_path, config=None): opti_file_path = model_prefix + ".pdopt" # deal with argument `config` - if config is None: - config = SaveLoadConfig() + config = _parse_load_config(configs) if os.path.exists(params_file_path) or os.path.exists(opti_file_path): # Load state dict by `save_dygraph` save format @@ -246,7 +236,6 @@ def load_dygraph(model_path, config=None): persistable_var_dict = _construct_params_and_buffers( model_prefix, programs, - config.separate_params, config.params_filename, append_suffix=False) @@ -255,9 +244,9 @@ def load_dygraph(model_path, config=None): for var_name in persistable_var_dict: para_dict[var_name] = persistable_var_dict[var_name].numpy() - # if __variables.info__ exists, we can recover structured_name - var_info_path = os.path.join(model_prefix, - EXTRA_VAR_INFO_FILENAME) + # if *.info exists, we can recover structured_name + var_info_filename = str(config.params_filename) + ".info" + var_info_path = os.path.join(model_prefix, var_info_filename) if os.path.exists(var_info_path): with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 4a3dacbd1acae..a10adeb14aa7d 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -31,8 +31,10 @@ __all__ = ['TranslatedLayer'] -VARIABLE_FILENAME = "__variables__" -EXTRA_VAR_INFO_FILENAME = "__variables.info__" +INFER_MODEL_SUFFIX = ".pdmodel" +INFER_PARAMS_SUFFIX = ".pdiparams" +INFER_PARAMS_INFO_SUFFIX = ".pdiparams.info" + LOADED_VAR_SUFFIX = "load" PARAMETER_NAME_PREFIX = "param" BUFFER_NAME_PREFIX = "buffer" @@ -424,11 +426,8 @@ def _load_persistable_vars_by_program(model_path, return load_var_dict -def _load_persistable_vars(model_path, - var_info_path, - program_holder, - separate_params=False, - params_filename=None): +def _load_persistable_vars(model_path, var_info_path, program_holder, + params_filename): # 1. load extra var info with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) @@ -464,33 +463,22 @@ def _load_persistable_vars(model_path, new_var = framework._varbase_creator( name=new_name, persistable=True) - # load separate vars - if separate_params is True: - framework._dygraph_tracer().trace_op( - type='load', - inputs={}, - outputs={'Out': new_var}, - attrs={'file_path': os.path.join(model_path, name)}) - new_var.stop_gradient = extra_var_info[name]['stop_gradient'] load_var_dict[new_name] = new_var load_var_list.append(new_var) # 3. load all vars - if separate_params is False: - if params_filename is not None: - var_file_path = os.path.join(model_path, params_filename) - else: - var_file_path = os.path.join(model_path, VARIABLE_FILENAME) - if not os.path.exists(var_file_path): - if len(extra_var_info) != 0: - raise ValueError("The model to be loaded is incomplete.") - else: - framework._dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) + assert params_filename is not None, "params_filename should not be None." 
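+    # With separate_params removed, parameters are expected in one combined
+    # file; the single load_combine op below restores every variable in
+    # load_var_list at once.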
+ var_file_path = os.path.join(model_path, params_filename) + if not os.path.exists(var_file_path): + if len(extra_var_info) != 0: + raise ValueError("The model to be loaded is incomplete.") + else: + framework._dygraph_tracer().trace_op( + type='load_combine', + inputs={}, + outputs={'Out': load_var_list}, + attrs={'file_path': var_file_path}) return load_var_dict @@ -532,14 +520,13 @@ def _construct_program_holders(model_path, model_filename=None): def _construct_params_and_buffers(model_path, programs, - separate_params=False, params_filename=None, append_suffix=True): - var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + var_info_filename = str(params_filename) + ".info" + var_info_path = os.path.join(model_path, var_info_filename) if os.path.exists(var_info_path): var_dict = _load_persistable_vars(model_path, var_info_path, - programs['forward'], separate_params, - params_filename) + programs['forward'], params_filename) else: var_dict = _load_persistable_vars_by_program( model_path, programs['forward'], params_filename) @@ -700,18 +687,16 @@ def _construct(model_path, configs=None): raise ValueError("There is no directory named '%s'" % model_path) model_filename = None params_filename = None - separate_params = False if configs is not None: model_filename = configs.model_filename params_filename = configs.params_filename - separate_params = configs.separate_params # 1. load program desc & construct _ProgramHolder programs = _construct_program_holders(model_path, model_filename) # 2. load layer parameters & buffers - persistable_vars = _construct_params_and_buffers( - model_path, programs, separate_params, params_filename) + persistable_vars = _construct_params_and_buffers(model_path, programs, + params_filename) # 3. construct TranslatedLayer object translated_layer = TranslatedLayer(programs, persistable_vars) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 194ebafb08eef..6cdd13fba82ac 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -29,7 +29,7 @@ from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticFunction, unwrap_decorators -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer +from paddle.fluid.dygraph.io import TranslatedLayer, INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX from paddle.fluid.dygraph.layers import Layer from paddle.fluid.executor import Executor, scope_guard from paddle.fluid.framework import Block, ParamBase, Program, Variable @@ -39,7 +39,7 @@ __all__ = [ 'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level', - 'set_verbosity', 'save', 'load', 'SaveLoadConfig' + 'set_verbosity', 'save', 'load' ] @@ -228,73 +228,7 @@ def decorated(python_func): return decorated -class SaveLoadConfig(object): - """ - The additional configuration options may be used in function - ``paddle.jit.save/load`` and ``paddle.load`` . - - Examples: - 1. Using ``SaveLoadConfig`` when saving model - - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - # use SaveLoadconfig when saving model - model_path = "simplenet.example.model" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - 2. Using ``SaveLoadConfig`` when loading model - - .. code-block:: python - - import paddle - - # enable dygraph mode - paddle.disable_static() - - # use SaveLoadconfig when loading model - model_path = "simplenet.example.model" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - infer_net = paddle.jit.load(model_path, config=config) - # inference - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ - +class _SaveLoadConfig(object): def __init__(self): self._output_spec = None self._model_filename = None @@ -316,335 +250,105 @@ def __init__(self): @property def output_spec(self): - """ - Selects the output targets of the saved model ( ``paddle.jit.TranslatedLayer`` ). - By default, all return variables of original Layer's forward function - are kept as the output of the saved TranslatedLayer. - - The ``output_spec`` type should be list[Variable]. If the provided ``output_spec`` - list is not all output variables, the saved model will be pruned according to the - given ``output_spec`` list. - - .. note:: - The ``output_spec`` is only used when saving model. - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - loss = paddle.tensor.mean(z) - return z, loss - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out, loss = net(x) - loss.backward() - adam.step() - adam.clear_grad() - - # use SaveLoadconfig.output_spec - model_path = "simplenet.example.model.output_spec" - config = paddle.SaveLoadConfig() - config.output_spec = [out] - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - infer_net = paddle.jit.load(model_path) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._output_spec @output_spec.setter def output_spec(self, spec): + if spec is None: + return if not isinstance(spec, list): raise TypeError( - "The SaveLoadConfig.output_spec should be 'list', but received input type is %s." + "The config `output_spec` should be 'list', but received input type is %s." 
% type(input)) for var in spec: if not isinstance(var, core.VarBase): raise TypeError( - "The element in SaveLoadConfig.output_spec list should be 'Variable', but received element's type is %s." + "The element in config `output_spec` list should be 'Variable', but received element's type is %s." % type(var)) self._output_spec = spec @property def model_filename(self): - """ - The name of file to save the translated program of target Layer. - Default filename is :code:`__model__` . - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - # saving with configs.model_filename - model_path = "simplenet.example.model.model_filename" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - # loading with configs.model_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._model_filename @model_filename.setter def model_filename(self, filename): + if filename is None: + return if not isinstance(filename, six.string_types): raise TypeError( - "The SaveLoadConfig.model_filename should be str, but received input's type is %s." + "The config `model_filename` should be str, but received input's type is %s." % type(filename)) if len(filename) == 0: - raise ValueError( - "The SaveLoadConfig.model_filename is empty string.") + raise ValueError("The config `model_filename` is empty string.") self._model_filename = filename @property def params_filename(self): - """ - The name of file to save all persistable variables in target Layer. - Default file name is :code:`__variables__` . - - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - model_path = "simplenet.example.model.params_filename" - config = paddle.SaveLoadConfig() - config.params_filename = "__params__" - - # saving with configs.params_filename - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - # loading with configs.params_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._params_filename @params_filename.setter def params_filename(self, filename): + if filename is None: + return if not isinstance(filename, six.string_types): raise TypeError( - "The SaveLoadConfig.params_filename should be str, but received input's type is %s." + "The config `params_filename` should be str, but received input's type is %s." % type(filename)) if len(filename) == 0: - raise ValueError( - "The SaveLoadConfig.params_filename is empty string.") + raise ValueError("The config `params_filename` is empty string.") self._params_filename = filename - # NOTE: [why not use params_filename=None control params saved separately] - # The new save interface does not recommend parameters to be saved separately. - # Here, the concept should be separated as clearly as possible. - # Setting params_filename=None only means that the saved file name is set - # and without any other meaning. New separate_params control for file saved - # separately can makes the concept clearer. - @property - def separate_params(self): - """ - Configure whether to save the Layer parameters as separete files. - (In order to be compatible with the behavior of ``paddle.static.save_inference_model`` ) - - If True, each parameter will be saved to a file separately, the file name is the parameter name, - and the SaveLoadConfig.params_filename configuration will not take effect. Default False. - - .. note:: - Only used for ``paddle.jit.save`` . - - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - model_path = "simplenet.example.model.separate_params" - config = paddle.SaveLoadConfig() - config.separate_params = True - - # saving with configs.separate_params - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - # [result] the saved model directory contains: - # linear_0.b_0 linear_0.w_0 __model__ __variables.info__ - - # loading with configs.params_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ - return self._separate_params - - @separate_params.setter - def separate_params(self, value): - if not isinstance(value, bool): - raise TypeError( - "The SaveLoadConfig.separate_params should be bool value, but received input's type is %s." - % type(value)) - self._separate_params = value - @property def keep_name_table(self): - """ - Configures whether keep ``structured_name -> parameter_name`` dict in loaded state dict. - This dict is the debugging information saved when call ``paddle.save`` . - It is generally only used for debugging and does not affect the actual training or inference. - By default, it will not be retained in ``paddle.load`` result. Default: False. - - .. note:: - Only used for ``paddle.load`` . - - Examples: - .. code-block:: python - - import paddle - - paddle.disable_static() - - linear = paddle.nn.Linear(5, 1) - - state_dict = linear.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") - - config = paddle.SaveLoadConfig() - config.keep_name_table = True - para_state_dict = paddle.load("paddle_dy.pdparams", config) - - print(para_state_dict) - # the name_table is 'StructuredToParameterName@@' - # {'bias': array([0.], dtype=float32), - # 'StructuredToParameterName@@': - # {'bias': u'linear_0.b_0', 'weight': u'linear_0.w_0'}, - # 'weight': array([[ 0.04230034], - # [-0.1222527 ], - # [ 0.7392676 ], - # [-0.8136974 ], - # [ 0.01211023]], dtype=float32)} - """ return self._keep_name_table @keep_name_table.setter def keep_name_table(self, value): + if value is None: + return if not isinstance(value, bool): raise TypeError( - "The SaveLoadConfig.keep_name_table should be bool value, but received input's type is %s." + "The config `keep_name_table` should be bool value, but received input's type is %s." % type(value)) self._keep_name_table = value +def _parse_save_configs(configs): + supported_configs = ['output_spec'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.jit.save` is not supported." 
+ % (key)) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.output_spec = configs.get('output_spec', None) + + return inner_config + + +def _parse_load_config(configs): + supported_configs = ['model_filename', 'params_filename'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.jit.load` is not supported." + % (key)) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.model_filename = configs.get('model_filename', None) + inner_config.params_filename = configs.get('params_filename', None) + + return inner_config + + def _get_input_var_names(inputs, input_spec): name_none_error = "The %s's name is None. " \ "When using jit.save, please set InputSepc's name in " \ @@ -712,47 +416,88 @@ def _get_output_vars(outputs, output_spec): return result_list -# NOTE(chenweihang): change jit.save/load argument `configs` to `config` -def deprecate_save_load_configs(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - if 'configs' in kwargs: - kwargs['config'] = kwargs['configs'] - kwargs.pop('configs') - return func(*args, **kwargs) +# NOTE(chenweihang): [ Handling of use cases of API paddle.jit.load ] +# `paddle.jit.load` may be used to load saved results of: +# 1. Expected cases: +# - paddle.jit.save +# - paddle.static.save_inference_model +# - paddle.fluid.io.save_inference_model +# 2. Error cases: +# - paddle.save: no .pdmodel for prefix +# - paddle.static.save: no .pdiparams but .pdparams exists +# - paddle.fluid.io.save_params/save_persistables: no __model__ +# TODO(chenweihang): polish error message in above error cases +def _build_load_path_and_config(path, config): + # NOTE(chenweihang): If both [prefix save format] and [directory save format] exist, + # raise error, avoid confusing behavior + prefix_format_path = path + INFER_MODEL_SUFFIX + prefix_format_exist = os.path.exists(prefix_format_path) + directory_format_exist = os.path.isdir(path) + if prefix_format_exist and directory_format_exist: + raise ValueError( + "The %s.pdmodel and %s directory exist at the same time, " + "don't know which one to load, please make sure that the specified target " + "of ``path`` is unique." % (path, path)) + elif not prefix_format_exist and not directory_format_exist: + raise ValueError("The ``path`` (%s) to load model not exists." % path) + else: + if prefix_format_exist: + file_prefix = os.path.basename(path) + model_path = os.path.dirname(path) + if config.model_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``model_filename`` config does " + "not take effect.") + config.model_filename = file_prefix + INFER_MODEL_SUFFIX + if config.params_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``params_filename`` config does " + "not take effect.") + config.params_filename = file_prefix + INFER_PARAMS_SUFFIX + else: + # Compatible with the old save_inference_model format + model_path = path - return wrapper + return model_path, config -@deprecate_save_load_configs @switch_to_static_graph -def save(layer, model_path, input_spec=None, config=None): +def save(layer, path, input_spec=None, **configs): """ - Saves input declarative Layer as :ref:`api_imperative_TranslatedLayer` + Saves input Layer as ``paddle.jit.TranslatedLayer`` format model, which can be used for inference or fine-tuning after loading. 
It will save the translated program and all related persistable - variables of input declarative Layer to given ``model_path``. + variables of input Layer to given ``path``. - The default saved translated program file name is ``__model__``, - and the default saved persistable variables file name is ``__variables__``, - and it also saved some additional variable description information to file - ``__variables.info__``, these additional information is used in fine-tuning. + ``path`` is the prefix of saved objects, and the saved translated program file + suffix is ``.pdmodel``, the saved persistable variables file suffix is ``.pdiparams``, + and here also saved some additional variable description information to a file, + its suffix is ``.pdiparams.info``, these additional information is used in fine-tuning. The saved model can be loaded by follow APIs: - - :ref:`api_imperative_jit_load` - - :ref:`api_fluid_io_load_inference_model` (need pass ``params_filename='__variables__'``) + - ``paddle.jit.load`` + - ``paddle.static.load_inference_model`` - Other C++ inference APIs Args: - layer (Layer): the Layer to be saved. The Layer should be decorated by `@declarative`. - model_path (str): the directory to save the model. - input_spec (list[Variable], optional): Describes the input of the saved model. + layer (Layer): the Layer to be saved. The Layer should be decorated by `@paddle.jit.to_static`. + path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. + input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model. It is the example inputs that will be passed to saved TranslatedLayer's forward function. If None, all input variables of the original Layer's forward function would be the inputs of the saved model. Default None. - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object - that specifies additional configuration options. Default None. + **configs (dict, optional): other save configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) output_spec (list[Tensor]): Selects the output targets of the saved model. + By default, all return variables of original Layer's forward function are kept as the + output of the saved model. If the provided ``output_spec`` list is not all output variables, + the saved model will be pruned according to the given ``output_spec`` list. + Returns: None @@ -804,10 +549,6 @@ def train(layer, loader, loss_fn, opt): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) - # enable dygraph mode - place = paddle.CPUPlace() - paddle.disable_static(place) - # 1. train & save model. # create network @@ -818,7 +559,6 @@ def train(layer, loader, loss_fn, opt): # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) loader = paddle.io.DataLoader(dataset, - places=place, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, @@ -828,11 +568,11 @@ def train(layer, loader, loss_fn, opt): train(layer, loader, loss_fn, adam) # save - model_path = "linear.example.model" - paddle.jit.save(layer, model_path) + path = "example_model/linear" + paddle.jit.save(layer, path) """ - # 1. input check + # 1. 
input build & check prog_translator = ProgramTranslator() if not prog_translator.enable_to_static: raise RuntimeError( @@ -843,9 +583,17 @@ def train(layer, loader, loss_fn, opt): "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s." % type(layer)) - configs = config - if configs is None: - configs = SaveLoadConfig() + # path check + file_prefix = os.path.basename(path) + if file_prefix == "": + raise ValueError( + "The input path MUST be format of dirname/file_prefix " + "[dirname\\file_prefix in Windows system], but received " + "file_prefix is empty string.") + + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) # avoid change user given input_spec inner_input_spec = None @@ -866,6 +614,9 @@ def train(layer, loader, loss_fn, opt): "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s." % type(var)) + # parse configs + configs = _parse_save_configs(configs) + # 2. get program from Layer # TODO(chenweihang): add support for other method, not only forward if isinstance(layer.forward, StaticFunction): @@ -927,9 +678,12 @@ def train(layer, loader, loss_fn, opt): # 5. save inference model from paddle.fluid.io import save_inference_model - # VARIABLE_FILENAME keep nameing style consistent with '__model__' - if configs.params_filename is None: - configs.params_filename = VARIABLE_FILENAME + # construct new save_inference_model arguments + model_path = dirname + # NOTE(chenweihang): because prefix contains model and params filename, + # so we don't support set model_filename & params_filename + model_filename = file_prefix + INFER_MODEL_SUFFIX + params_filename = file_prefix + INFER_PARAMS_SUFFIX with scope_guard(scope): save_inference_model( @@ -938,9 +692,8 @@ def train(layer, loader, loss_fn, opt): target_vars=output_vars, executor=Executor(_current_expected_place()), main_program=concrete_program.main_program.clone(), - model_filename=configs.model_filename, - params_filename=None - if configs.separate_params else configs.params_filename, + model_filename=model_filename, + params_filename=params_filename, export_for_deployment=configs._export_for_deployment, program_only=configs._program_only) @@ -958,23 +711,23 @@ def train(layer, loader, loss_fn, opt): # Due to compatibility issues, we cannot change the original storage structure, # but we can save these information in `jit.save` without changing the original # storage to improve user experience. So we save extra information into - # file `__variables.info__` - extra_var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + # file `***.pdiparams.info` + extra_var_info_path = path + INFER_PARAMS_INFO_SUFFIX with open(extra_var_info_path, 'wb') as f: pickle.dump(extra_var_info, f, protocol=2) -@deprecate_save_load_configs @dygraph_only -def load(model_path, config=None): +def load(path, **configs): """ :api_attr: imperative - Load model saved by :ref:`api_imperative_jit_save` or :ref:`api_fluid_io_save_inference_model` - as :ref:`api_imperative_TranslatedLayer`, then performing inference or fine-tune training. + Load model saved by ``paddle.jit.save`` or ``paddle.static.save_inference_model`` or + paddle 1.x API ``paddle.fluid.io.save_inference_model`` as ``paddle.jit.TranslatedLayer``, + then performing inference or fine-tune training. .. 
note:: - For some historical reasons, if you load model saved by :ref:`api_fluid_io_save_inference_model`, + If you load model saved by ``paddle.static.save_inference_model`` , there will be the following limitations when using it in fine-tuning: 1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable. 2. All saved model's feed targets need to be passed into TranslatedLayer's forward function. @@ -982,15 +735,23 @@ def load(model_path, config=None): 4. The parameter's ``trainable`` information is lost and can not be recovered. Args: - model_path (str): The directory path where the model is saved. - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies - additional configuration options. Default None. + path (str): The path prefix to load model. The format is ``dirname/file_prefix`` or ``file_prefix``. + **configs (dict, optional): other load configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) model_filename (string): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (string): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name, save variables separately + by default. + Returns: TranslatedLayer: A Layer object can run saved translated model. Examples: - 1. Load model saved by :ref:`api_imperative_jit_save` then performing inference and fine-tune training. + 1. Load model saved by ``paddle.jit.save`` then performing inference and fine-tune training. .. code-block:: python @@ -1039,10 +800,6 @@ def train(layer, loader, loss_fn, opt): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) - # enable dygraph mode - place = paddle.CPUPlace() - paddle.disable_static(place) - # 1. train & save model. # create network @@ -1053,7 +810,6 @@ def train(layer, loader, loss_fn, opt): # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) loader = paddle.io.DataLoader(dataset, - places=place, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, @@ -1063,13 +819,13 @@ def train(layer, loader, loss_fn, opt): train(layer, loader, loss_fn, adam) # save - model_path = "linear.example.model" - paddle.jit.save(layer, model_path) + path = "example_model/linear" + paddle.jit.save(layer, path) # 2. load model # load - loaded_layer = paddle.jit.load(model_path) + loaded_layer = paddle.jit.load(path) # inference loaded_layer.eval() @@ -1082,15 +838,17 @@ def train(layer, loader, loss_fn, opt): train(loaded_layer, loader, loss_fn, adam) - 2. Load model saved by :ref:`api_fluid_io_save_inference_model` then performing and fine-tune training. + 2. Load model saved by ``paddle.fluid.io.save_inference_model`` then performing and fine-tune training. .. 
code-block:: python import numpy as np import paddle import paddle.fluid as fluid + import paddle.static as static import paddle.nn as nn import paddle.optimizer as opt + import paddle.nn.functional as F BATCH_SIZE = 16 BATCH_NUM = 4 @@ -1112,18 +870,18 @@ def __getitem__(self, idx): def __len__(self): return self.num_samples - image = fluid.data(name='image', shape=[None, 784], dtype='float32') - label = fluid.data(name='label', shape=[None, 1], dtype='int64') - pred = fluid.layers.fc(input=image, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss) + image = static.data(name='image', shape=[None, 784], dtype='float32') + label = static.data(name='label', shape=[None, 1], dtype='int64') + pred = static.nn.fc(input=image, size=10, act='softmax') + loss = F.cross_entropy(input=pred, label=label) + avg_loss = paddle.mean(loss) - optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) @@ -1138,7 +896,7 @@ def __len__(self): # 1. train and save inference model for data in loader(): exe.run( - fluid.default_main_program(), + static.default_main_program(), feed=data, fetch_list=[avg_loss]) @@ -1179,6 +937,10 @@ def __len__(self): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) """ + # 1. construct correct config + config = _parse_load_config(configs) + model_path, config = _build_load_path_and_config(path, config) + return TranslatedLayer._construct(model_path, config) diff --git a/python/paddle/fluid/dygraph/static_runner.py b/python/paddle/fluid/dygraph/static_runner.py index d482077cd4f2a..e8738da07e993 100644 --- a/python/paddle/fluid/dygraph/static_runner.py +++ b/python/paddle/fluid/dygraph/static_runner.py @@ -14,7 +14,7 @@ from __future__ import print_function -from paddle.fluid.dygraph.jit import SaveLoadConfig +from paddle.fluid.dygraph.jit import _SaveLoadConfig from paddle.fluid.dygraph.io import TranslatedLayer @@ -31,7 +31,7 @@ class StaticModelRunner(object): """ def __new__(cls, model_dir, model_filename=None, params_filename=None): - configs = SaveLoadConfig() + configs = _SaveLoadConfig() if model_filename is not None: configs.model_filename = model_filename if params_filename is not None: diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py index ba0adaf32e15d..63edd35f59bd4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py @@ -28,11 +28,12 @@ class PredictorTools(object): Paddle-Inference predictor ''' - def __init__(self, model_path, params_file, feeds_var): + def __init__(self, model_path, model_file, params_file, feeds_var): ''' __init__ ''' self.model_path = model_path + self.model_file = model_file self.params_file = params_file self.feeds_var = feeds_var @@ -43,7 +44,7 @@ def _load_model_and_set_config(self): ''' if os.path.exists(os.path.join(self.model_path, self.params_file)): config = AnalysisConfig( - os.path.join(self.model_path, "__model__"), + os.path.join(self.model_path, 
self.model_file), os.path.join(self.model_path, self.params_file)) else: config = AnalysisConfig(os.path.join(self.model_path)) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py index f105dd5e94744..6c26189a4adb3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import time import unittest - import numpy as np + +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from bert_dygraph_model import PretrainModelLayer from bert_utils import get_bert_config, get_feed_data_reader @@ -31,7 +33,10 @@ SEED = 2020 STEP_NUM = 10 PRINT_STEP = 2 -MODEL_SAVE_PATH = "./bert.inference.model" +MODEL_SAVE_DIR = "./inference" +MODEL_SAVE_PREFIX = "./inference/bert" +MODEL_FILENAME = "bert" + INFER_MODEL_SUFFIX +PARAMS_FILENAME = "bert" + INFER_PARAMS_SUFFIX DY_STATE_DICT_SAVE_PATH = "./bert.dygraph" @@ -85,7 +90,7 @@ def train(bert_config, data_reader, to_static): step_idx += 1 if step_idx == STEP_NUM: if to_static: - fluid.dygraph.jit.save(bert, MODEL_SAVE_PATH) + fluid.dygraph.jit.save(bert, MODEL_SAVE_PREFIX) else: fluid.dygraph.save_dygraph(bert.state_dict(), DY_STATE_DICT_SAVE_PATH) @@ -104,11 +109,15 @@ def train_static(bert_config, data_reader): def predict_static(data): + paddle.enable_static() exe = fluid.Executor(place) # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME) + MODEL_SAVE_DIR, + executor=exe, + model_filename=MODEL_FILENAME, + params_filename=PARAMS_FILENAME) pred_res = exe.run(inference_program, feed=dict(zip(feed_target_names, data)), fetch_list=fetch_targets) @@ -143,7 +152,7 @@ def predict_dygraph(bert_config, data): def predict_dygraph_jit(data): with fluid.dygraph.guard(place): - bert = fluid.dygraph.jit.load(MODEL_SAVE_PATH) + bert = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) bert.eval() src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data @@ -155,7 +164,8 @@ def predict_dygraph_jit(data): def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, data) + output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, + data) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index af7e73c41464d..f54f70e4b854b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -21,7 +21,7 @@ from paddle.fluid import ParamAttr from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph import ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools @@ -422,7 +422,10 @@ class Args(object): prop_boundary_ratio = 0.5 num_sample = 2 num_sample_perbin = 2 - infer_dir = './bmn_infer_model' + model_save_dir = "./inference" + 
model_save_prefix = "./inference/bmn" + model_filename = "bmn" + INFER_MODEL_SUFFIX + params_filename = "bmn" + INFER_PARAMS_SUFFIX dy_param_path = './bmn_dy_param' @@ -620,7 +623,7 @@ def train_bmn(args, place, to_static): if batch_id == args.train_batch_num: if to_static: - fluid.dygraph.jit.save(bmn, args.infer_dir) + fluid.dygraph.jit.save(bmn, args.model_save_prefix) else: fluid.dygraph.save_dygraph(bmn.state_dict(), args.dy_param_path) @@ -735,13 +738,15 @@ def predict_dygraph(self, data): return pred_res def predict_static(self, data): + paddle.enable_static() exe = fluid.Executor(self.place) # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - self.args.infer_dir, + self.args.model_save_dir, executor=exe, - params_filename=VARIABLE_FILENAME) + model_filename=self.args.model_filename, + params_filename=self.args.params_filename) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, fetch_list=fetch_targets) @@ -750,7 +755,7 @@ def predict_static(self, data): def predict_dygraph_jit(self, data): with fluid.dygraph.guard(self.place): - bmn = fluid.dygraph.jit.load(self.args.infer_dir) + bmn = fluid.dygraph.jit.load(self.args.model_save_prefix) bmn.eval() x = to_variable(data) @@ -760,7 +765,9 @@ def predict_dygraph_jit(self, data): return pred_res def predict_analysis_inference(self, data): - output = PredictorTools(self.args.infer_dir, VARIABLE_FILENAME, [data]) + output = PredictorTools(self.args.model_save_dir, + self.args.model_filename, + self.args.params_filename, [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 4d735b565ddbc..d8cb3854d3e23 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -26,7 +26,7 @@ from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph import Embedding, Linear, GRUUnit from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools @@ -395,7 +395,10 @@ class Args(object): base_learning_rate = 0.01 bigru_num = 2 print_steps = 1 - model_save_dir = "./lac_model" + model_save_dir = "./inference" + model_save_prefix = "./inference/lac" + model_filename = "lac" + INFER_MODEL_SUFFIX + params_filename = "lac" + INFER_PARAMS_SUFFIX dy_param_path = "./lac_dy_param" @@ -498,13 +501,11 @@ def do_train(args, to_static): step += 1 # save inference model if to_static: - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [crf_decode] fluid.dygraph.jit.save( layer=model, - model_path=args.model_save_dir, + path=args.model_save_prefix, input_spec=[words, length], - configs=configs) + output_spec=[crf_decode]) else: fluid.dygraph.save_dygraph(model.state_dict(), args.dy_param_path) @@ -573,13 +574,15 @@ def predict_static(self, batch): LAC model contains h_0 created in `__init__` that is necessary for inferring. Load inference model to test it's ok for prediction. 
""" + paddle.enable_static() exe = fluid.Executor(self.place) # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( self.args.model_save_dir, executor=exe, - params_filename=VARIABLE_FILENAME) + model_filename=self.args.model_filename, + params_filename=self.args.params_filename) words, targets, length = batch pred_res = exe.run( @@ -592,7 +595,7 @@ def predict_static(self, batch): def predict_dygraph_jit(self, batch): words, targets, length = batch with fluid.dygraph.guard(self.place): - model = fluid.dygraph.jit.load(self.args.model_save_dir) + model = fluid.dygraph.jit.load(self.args.model_save_prefix) model.eval() pred_res = model(to_variable(words), to_variable(length)) @@ -602,8 +605,9 @@ def predict_dygraph_jit(self, batch): def predict_analysis_inference(self, batch): words, targets, length = batch - output = PredictorTools(self.args.model_save_dir, VARIABLE_FILENAME, - [words, length]) + output = PredictorTools(self.args.model_save_dir, + self.args.model_filename, + self.args.params_filename, [words, length]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index bd600d2f2dbd6..8a21c4cfd0eca 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -25,7 +25,7 @@ from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from predictor_utils import PredictorTools @@ -218,34 +218,39 @@ def train(self, to_static=False): def check_jit_save_load(self, model, inputs, input_spec, to_static, gt_out): if to_static: infer_model_path = "./test_mnist_inference_model_by_jit_save" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [gt_out] + model_save_dir = "./inference" + model_save_prefix = "./inference/mnist" + model_filename = "mnist" + INFER_MODEL_SUFFIX + params_filename = "mnist" + INFER_PARAMS_SUFFIX fluid.dygraph.jit.save( layer=model, - model_path=infer_model_path, + path=model_save_prefix, input_spec=input_spec, - configs=configs) + output_spec=[gt_out]) # load in static mode static_infer_out = self.jit_load_and_run_inference_static( - infer_model_path, inputs) + model_save_dir, model_filename, params_filename, inputs) self.assertTrue(np.allclose(gt_out.numpy(), static_infer_out)) # load in dygraph mode dygraph_infer_out = self.jit_load_and_run_inference_dygraph( - infer_model_path, inputs) + model_save_prefix, inputs) self.assertTrue(np.allclose(gt_out.numpy(), dygraph_infer_out)) # load in Paddle-Inference predictor_infer_out = self.predictor_load_and_run_inference_analysis( - infer_model_path, inputs) + model_save_dir, model_filename, params_filename, inputs) self.assertTrue(np.allclose(gt_out.numpy(), predictor_infer_out)) @switch_to_static_graph - def jit_load_and_run_inference_static(self, model_path, inputs): + def jit_load_and_run_inference_static(self, model_path, model_filename, + params_filename, inputs): + paddle.enable_static() exe = fluid.Executor(self.place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( dirname=model_path, executor=exe, - 
params_filename=VARIABLE_FILENAME) + model_filename=model_filename, + params_filename=params_filename) assert len(inputs) == len(feed_target_names) results = exe.run(inference_program, feed=dict(zip(feed_target_names, inputs)), @@ -258,8 +263,10 @@ def jit_load_and_run_inference_dygraph(self, model_path, inputs): pred = infer_net(inputs[0]) return pred.numpy() - def predictor_load_and_run_inference_analysis(self, model_path, inputs): - output = PredictorTools(model_path, VARIABLE_FILENAME, inputs) + def predictor_load_and_run_inference_analysis( + self, model_path, model_filename, params_filename, inputs): + output = PredictorTools(model_path, model_filename, params_filename, + inputs) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index a377075062b26..a086bf1455a81 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -20,7 +20,7 @@ from paddle.fluid.param_attr import ParamAttr from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX import unittest @@ -439,7 +439,10 @@ class Args(object): train_step = 10 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() - model_save_path = model + ".inference.model" + model_save_dir = "./inference" + model_save_prefix = "./inference/" + model + model_filename = model + INFER_MODEL_SUFFIX + params_filename = model + INFER_PARAMS_SUFFIX dy_state_dict_save_path = model + ".dygraph" @@ -504,7 +507,7 @@ def train_mobilenet(args, to_static): t_last = time.time() if batch_id > args.train_step: if to_static: - fluid.dygraph.jit.save(net, args.model_save_path) + fluid.dygraph.jit.save(net, args.model_save_prefix) else: fluid.dygraph.save_dygraph(net.state_dict(), args.dy_state_dict_save_path) @@ -514,11 +517,15 @@ def train_mobilenet(args, to_static): def predict_static(args, data): + paddle.enable_static() exe = fluid.Executor(args.place) # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - args.model_save_path, executor=exe, params_filename=VARIABLE_FILENAME) + args.model_save_dir, + executor=exe, + model_filename=args.model_filename, + params_filename=args.params_filename) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, @@ -545,7 +552,7 @@ def predict_dygraph(args, data): def predict_dygraph_jit(args, data): with fluid.dygraph.guard(args.place): - model = fluid.dygraph.jit.load(args.model_save_path) + model = fluid.dygraph.jit.load(args.model_save_prefix) model.eval() pred_res = model(data) @@ -554,7 +561,8 @@ def predict_dygraph_jit(args, data): def predict_analysis_inference(args, data): - output = PredictorTools(args.model_save_path, VARIABLE_FILENAME, [data]) + output = PredictorTools(args.model_save_dir, args.model_filename, + args.params_filename, [data]) out = output() return out @@ -565,7 +573,9 @@ def setUp(self): def train(self, model_name, to_static): self.args.model = model_name - self.args.model_save_path = model_name + ".inference.model" + self.args.model_save_prefix = "./inference/" + model_name + self.args.model_filename = model_name + INFER_MODEL_SUFFIX + 
self.args.params_filename = model_name + INFER_PARAMS_SUFFIX self.args.dy_state_dict_save_path = model_name + ".dygraph" out = train_mobilenet(self.args, to_static) return out @@ -579,7 +589,9 @@ def assert_same_loss(self, model_name): def assert_same_predict(self, model_name): self.args.model = model_name - self.args.model_save_path = model_name + ".inference.model" + self.args.model_save_prefix = "./inference/" + model_name + self.args.model_filename = model_name + INFER_MODEL_SUFFIX + self.args.params_filename = model_name + INFER_PARAMS_SUFFIX self.args.dy_state_dict_save_path = model_name + ".dygraph" local_random = np.random.RandomState(SEED) image = local_random.random_sample([1, 3, 224, 224]).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 203c8ddb3488c..095940d79eac6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -24,7 +24,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph import declarative, ProgramTranslator from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools @@ -38,7 +38,11 @@ epoch_num = 1 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ else fluid.CPUPlace() -MODEL_SAVE_PATH = "./resnet.inference.model" + +MODEL_SAVE_DIR = "./inference" +MODEL_SAVE_PREFIX = "./inference/resnet" +MODEL_FILENAME = "resnet" + INFER_MODEL_SUFFIX +PARAMS_FILENAME = "resnet" + INFER_PARAMS_SUFFIX DY_STATE_DICT_SAVE_PATH = "./resnet.dygraph" program_translator = ProgramTranslator() @@ -261,7 +265,7 @@ def train(to_static): total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) if batch_id == 10: if to_static: - fluid.dygraph.jit.save(resnet, MODEL_SAVE_PATH) + fluid.dygraph.jit.save(resnet, MODEL_SAVE_PREFIX) else: fluid.dygraph.save_dygraph(resnet.state_dict(), DY_STATE_DICT_SAVE_PATH) @@ -287,10 +291,14 @@ def predict_dygraph(data): def predict_static(data): + paddle.enable_static() exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME) + MODEL_SAVE_DIR, + executor=exe, + model_filename=MODEL_FILENAME, + params_filename=PARAMS_FILENAME) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, @@ -301,7 +309,7 @@ def predict_static(data): def predict_dygraph_jit(data): with fluid.dygraph.guard(place): - resnet = fluid.dygraph.jit.load(MODEL_SAVE_PATH) + resnet = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) resnet.eval() pred_res = resnet(data) @@ -310,7 +318,8 @@ def predict_dygraph_jit(data): def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data]) + output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, + [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index 75c251253c05a..a8cfeb90bd814 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -34,7 +34,11 
@@ epoch_num = 1 place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ else paddle.CPUPlace() -MODEL_SAVE_PATH = "./resnet_v2.inference.model" + +MODEL_SAVE_DIR = "./inference" +MODEL_SAVE_PREFIX = "./inference/resnet_v2" +MODEL_FILENAME = "resnet_v2" + paddle.fluid.dygraph.io.INFER_MODEL_SUFFIX +PARAMS_FILENAME = "resnet_v2" + paddle.fluid.dygraph.io.INFER_PARAMS_SUFFIX DY_STATE_DICT_SAVE_PATH = "./resnet_v2.dygraph" program_translator = paddle.jit.ProgramTranslator() @@ -255,7 +259,7 @@ def train(to_static): total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) if batch_id == 10: if to_static: - paddle.jit.save(resnet, MODEL_SAVE_PATH) + paddle.jit.save(resnet, MODEL_SAVE_PREFIX) else: paddle.fluid.dygraph.save_dygraph(resnet.state_dict(), DY_STATE_DICT_SAVE_PATH) @@ -289,9 +293,10 @@ def predict_static(data): exe = paddle.static.Executor(place) [inference_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model( - MODEL_SAVE_PATH, + MODEL_SAVE_DIR, executor=exe, - params_filename=paddle.fluid.dygraph.io.VARIABLE_FILENAME) + model_filename=MODEL_FILENAME, + params_filename=PARAMS_FILENAME) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, @@ -302,7 +307,7 @@ def predict_static(data): def predict_dygraph_jit(data): paddle.disable_static(place) - resnet = paddle.jit.load(MODEL_SAVE_PATH) + resnet = paddle.jit.load(MODEL_SAVE_PREFIX) resnet.eval() pred_res = resnet(data) @@ -313,8 +318,8 @@ def predict_dygraph_jit(data): def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_PATH, - paddle.fluid.dygraph.io.VARIABLE_FILENAME, [data]) + output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, + [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py index cf7708c675aa9..b431d5ae048a9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py @@ -16,14 +16,14 @@ import os import unittest - import numpy as np -import paddle.fluid as fluid +import paddle +import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX SEED = 2020 @@ -66,14 +66,13 @@ def test_save_inference_model(self): adam.minimize(loss) layer.clear_gradients() # test for saving model in dygraph.guard - infer_model_dir = "./test_dy2stat_save_inference_model_in_guard" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [pred] + infer_model_prefix = "./test_dy2stat_inference_in_guard/model" + infer_model_dir = "./test_dy2stat_inference_in_guard" fluid.dygraph.jit.save( layer=layer, - model_path=infer_model_dir, + path=infer_model_prefix, input_spec=[x], - configs=configs) + output_spec=[pred]) # Check the correctness of the inference dygraph_out, _ = layer(x) self.check_save_inference_model(layer, [x_data], dygraph_out.numpy()) @@ -91,30 +90,30 @@ def check_save_inference_model(self, expected_persistable_vars = set([p.name for p in model.parameters()]) - infer_model_dir = 
"./test_dy2stat_save_inference_model" - configs = fluid.dygraph.jit.SaveLoadConfig() - if fetch is not None: - configs.output_spec = fetch - configs.separate_params = True + infer_model_prefix = "./test_dy2stat_inference/model" + infer_model_dir = "./test_dy2stat_inference" + model_filename = "model" + INFER_MODEL_SUFFIX + params_filename = "model" + INFER_PARAMS_SUFFIX fluid.dygraph.jit.save( layer=model, - model_path=infer_model_dir, + path=infer_model_prefix, input_spec=feed if feed else None, - configs=configs) - saved_var_names = set([ - filename for filename in os.listdir(infer_model_dir) - if filename != '__model__' and filename != EXTRA_VAR_INFO_FILENAME - ]) - self.assertEqual(saved_var_names, expected_persistable_vars) + output_spec=fetch if fetch else None) # Check the correctness of the inference - infer_out = self.load_and_run_inference(infer_model_dir, inputs) + infer_out = self.load_and_run_inference(infer_model_dir, model_filename, + params_filename, inputs) self.assertTrue(np.allclose(gt_out, infer_out)) - def load_and_run_inference(self, model_path, inputs): + def load_and_run_inference(self, model_path, model_filename, + params_filename, inputs): + paddle.enable_static() exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - dirname=model_path, executor=exe) + dirname=model_path, + executor=exe, + model_filename=model_filename, + params_filename=params_filename) results = exe.run(inference_program, feed=dict(zip(feed_target_names, inputs)), fetch_list=fetch_targets) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index 8f11a58588463..15cff501838a1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -24,7 +24,7 @@ from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D from paddle.fluid.dygraph import declarative from paddle.fluid.dygraph import ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools @@ -35,7 +35,10 @@ EPOCH_NUM = 1 PRINT_STEP = 2 STEP_NUM = 10 -MODEL_SAVE_PATH = "./se_resnet.inference.model" +MODEL_SAVE_DIR = "./inference" +MODEL_SAVE_PREFIX = "./inference/se_resnet" +MODEL_FILENAME = "se_resnet" + INFER_MODEL_SUFFIX +PARAMS_FILENAME = "se_resnet" + INFER_PARAMS_SUFFIX DY_STATE_DICT_SAVE_PATH = "./se_resnet.dygraph" place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ @@ -383,10 +386,10 @@ def train(train_reader, to_static): step_idx += 1 if step_idx == STEP_NUM: if to_static: - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [pred] - fluid.dygraph.jit.save(se_resnext, MODEL_SAVE_PATH, - [img], configs) + fluid.dygraph.jit.save( + se_resnext, + MODEL_SAVE_PREFIX, [img], + output_spec=[pred]) else: fluid.dygraph.save_dygraph(se_resnext.state_dict(), DY_STATE_DICT_SAVE_PATH) @@ -414,10 +417,14 @@ def predict_dygraph(data): def predict_static(data): + paddle.enable_static() exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME) + MODEL_SAVE_DIR, + executor=exe, + model_filename=MODEL_FILENAME, + params_filename=PARAMS_FILENAME) pred_res = exe.run(inference_program, 
feed={feed_target_names[0]: data}, @@ -428,7 +435,7 @@ def predict_static(data): def predict_dygraph_jit(data): with fluid.dygraph.guard(place): - se_resnext = fluid.dygraph.jit.load(MODEL_SAVE_PATH) + se_resnext = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) se_resnext.eval() pred_res = se_resnext(data) @@ -437,7 +444,8 @@ def predict_dygraph_jit(data): def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data]) + output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, + [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py index 4fc8d27d30cb8..6721e7a51d2bc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py @@ -32,6 +32,7 @@ def train_static(args, batch_generator): + paddle.enable_static() paddle.manual_seed(SEED) paddle.framework.random._manual_program_seed(SEED) train_prog = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py index 8ebb99fda660e..e264a300d8c18 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py @@ -277,7 +277,8 @@ def load_dygraph(model_path, keep_name_table=False): To load python2 saved models in python3. """ try: - para_dict, opti_dict = fluid.load_dygraph(model_path, keep_name_table) + para_dict, opti_dict = fluid.load_dygraph( + model_path, keep_name_table=keep_name_table) return para_dict, opti_dict except UnicodeDecodeError: warnings.warn( @@ -287,7 +288,7 @@ def load_dygraph(model_path, keep_name_table=False): if six.PY3: load_bak = pickle.load pickle.load = partial(load_bak, encoding="latin1") - para_dict, opti_dict = fluid.load_dygraph(model_path, - keep_name_table) + para_dict, opti_dict = fluid.load_dygraph( + model_path, keep_name_table=keep_name_table) pickle.load = load_bak return para_dict, opti_dict diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 2f35b45aa670c..7d48f2c419085 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -43,15 +43,14 @@ def test_new_directory(self): 'paddle.distributed.prepare_context', 'paddle.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', - 'paddle.jit.save', 'paddle.jit.load', 'paddle.SaveLoadConfig', - 'paddle.NoamDecay', 'paddle.PiecewiseDecay', - 'paddle.NaturalExpDecay', 'paddle.ExponentialDecay', - 'paddle.InverseTimeDecay', 'paddle.PolynomialDecay', - 'paddle.CosineDecay', 'paddle.static.Executor', - 'paddle.static.global_scope', 'paddle.static.scope_guard', - 'paddle.static.append_backward', 'paddle.static.gradients', - 'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram', - 'paddle.static.ExecutionStrategy', + 'paddle.jit.save', 'paddle.jit.load', 'paddle.NoamDecay', + 'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay', + 'paddle.ExponentialDecay', 'paddle.InverseTimeDecay', + 'paddle.PolynomialDecay', 'paddle.CosineDecay', + 'paddle.static.Executor', 
'paddle.static.global_scope', + 'paddle.static.scope_guard', 'paddle.static.append_backward', + 'paddle.static.gradients', 'paddle.static.BuildStrategy', + 'paddle.static.CompiledProgram', 'paddle.static.ExecutionStrategy', 'paddle.static.default_main_program', 'paddle.static.default_startup_program', 'paddle.static.Program', 'paddle.static.name_scope', 'paddle.static.program_guard', @@ -104,9 +103,7 @@ def test_old_directory(self): 'paddle.imperative.TracedLayer', 'paddle.imperative.declarative', 'paddle.imperative.ProgramTranslator', 'paddle.imperative.TranslatedLayer', 'paddle.imperative.jit.save', - 'paddle.imperative.jit.load', - 'paddle.imperative.jit.SaveLoadConfig', - 'paddle.imperative.NoamDecay' + 'paddle.imperative.jit.load', 'paddle.imperative.NoamDecay' 'paddle.imperative.PiecewiseDecay', 'paddle.imperative.NaturalExpDecay', 'paddle.imperative.ExponentialDecay', diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index bee53fd10f5fe..45709a358635c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -917,11 +917,6 @@ def test_load_compatible_with_keep_name_table(self): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy')) - para_state_dict, opti_state_dict = fluid.load_dygraph( - os.path.join('saved_dy', 'emb_dy'), True) - self.assertTrue(para_state_dict != None) - self.assertTrue(opti_state_dict == None) - para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy'), keep_name_table=True) self.assertTrue(para_state_dict != None) diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 9940424618504..71ec1271a041e 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -23,7 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX BATCH_SIZE = 32 BATCH_NUM = 10 @@ -127,8 +127,8 @@ class MultiLoadingLinearNet(fluid.dygraph.Layer): def __init__(self, size, model_path): super(MultiLoadingLinearNet, self).__init__() self._linear = Linear(size, size) - self._load_linear1 = fluid.dygraph.jit.load(model_path) - self._load_linear2 = fluid.dygraph.jit.load(model_path) + self._load_linear1 = paddle.jit.load(model_path) + self._load_linear2 = paddle.jit.load(model_path) @declarative def forward(self, x): @@ -218,23 +218,20 @@ def train_with_label(layer, input_size=784, label_size=1): class TestJitSaveLoad(unittest.TestCase): def setUp(self): - self.model_path = "model.test_jit_save_load" + self.model_path = "test_jit_save_load/model" # enable dygraph mode fluid.enable_dygraph() # config seed paddle.manual_seed(SEED) paddle.framework.random._manual_program_seed(SEED) - def train_and_save_model(self, model_path=None, configs=None): + def train_and_save_model(self, model_path=None): layer = LinearNet(784, 1) example_inputs, layer, _ = train(layer) final_model_path = model_path if model_path else self.model_path orig_input_types = [type(x) for x in example_inputs] - fluid.dygraph.jit.save( - layer=layer, - 
model_path=final_model_path, - input_spec=example_inputs, - configs=configs) + paddle.jit.save( + layer=layer, path=final_model_path, input_spec=example_inputs) new_input_types = [type(x) for x in example_inputs] self.assertEqual(orig_input_types, new_input_types) return layer @@ -243,13 +240,10 @@ def test_save_load(self): # train and save model train_layer = self.train_and_save_model() # load model - program_translator = ProgramTranslator() - program_translator.enable(False) - loaded_layer = fluid.dygraph.jit.load(self.model_path) + loaded_layer = paddle.jit.load(self.model_path) self.load_and_inference(train_layer, loaded_layer) self.load_dygraph_state_dict(train_layer) self.load_and_finetune(train_layer, loaded_layer) - program_translator.enable(True) def load_and_inference(self, train_layer, infer_layer): train_layer.eval() @@ -274,7 +268,7 @@ def load_dygraph_state_dict(self, train_layer): # construct new model new_layer = LinearNet(784, 1) orig_state_dict = new_layer.state_dict() - load_state_dict, _ = fluid.dygraph.load_dygraph(self.model_path) + load_state_dict = paddle.load(self.model_path) for structured_name in orig_state_dict: self.assertTrue(structured_name in load_state_dict) new_layer.set_state_dict(load_state_dict) @@ -286,20 +280,24 @@ def load_dygraph_state_dict(self, train_layer): np.array_equal(train_layer(x).numpy(), new_layer(x).numpy())) def test_load_dygraph_no_path(self): - model_path = "model.test_jit_save_load.no_path" - new_layer = LinearNet(784, 1) + model_path = "test_jit_save_load.no_path/model_path" with self.assertRaises(ValueError): model_dict, _ = fluid.dygraph.load_dygraph(model_path) def test_jit_load_model_incomplete(self): - model_path = "model.test_jit_save_load.remove_variables" - self.train_and_save_model(model_path=model_path) - # remove `__variables__` - var_path = os.path.join(model_path, VARIABLE_FILENAME) + model_path = "test_jit_save_load.remove_variables/model" + self.train_and_save_model(model_path) + # remove `.pdiparams` + var_path = model_path + INFER_PARAMS_SUFFIX os.remove(var_path) with self.assertRaises(ValueError): paddle.jit.load(model_path) + def test_jit_load_no_path(self): + path = "test_jit_save_load.no_path/model_path" + with self.assertRaises(ValueError): + loaded_layer = paddle.jit.load(path) + class TestSaveLoadWithInputSpec(unittest.TestCase): def setUp(self): @@ -313,8 +311,7 @@ def test_with_input_spec(self): net.forward, input_spec=[InputSpec( [None, 8], name='x')]) - model_path = "model.input_spec.output_spec" - configs = fluid.dygraph.jit.SaveLoadConfig() + model_path = "input_spec.output_spec/model" # check inputs and outputs self.assertTrue(len(net.forward.inputs) == 1) input_x = net.forward.inputs[0] @@ -322,11 +319,11 @@ def test_with_input_spec(self): self.assertTrue(input_x.name == 'x') # 1. prune loss - configs.output_spec = net.forward.outputs[:1] - fluid.dygraph.jit.save(net, model_path, configs=configs) + output_spec = net.forward.outputs[:1] + paddle.jit.save(net, model_path, output_spec=output_spec) # 2. load to infer - infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) + infer_layer = paddle.jit.load(model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) pred = infer_layer(x) @@ -334,8 +331,7 @@ def test_with_input_spec(self): def test_multi_in_out(self): net = LinearNetMultiInput(8, 8) - model_path = "model.multi_inout.output_spec1" - configs = fluid.dygraph.jit.SaveLoadConfig() + model_path = "multi_inout.output_spec1/model" # 1. 
check inputs and outputs self.assertTrue(len(net.forward.inputs) == 2) input_x = net.forward.inputs[0] @@ -344,11 +340,11 @@ def test_multi_in_out(self): self.assertTrue(input_y.shape == (-1, 8)) # 2. prune loss - configs.output_spec = net.forward.outputs[:2] - fluid.dygraph.jit.save(net, model_path, configs=configs) + output_spec = net.forward.outputs[:2] + paddle.jit.save(net, model_path, output_spec=output_spec) # 3. load to infer - infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) + infer_layer = paddle.jit.load(model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) y = fluid.dygraph.to_variable( @@ -357,11 +353,11 @@ def test_multi_in_out(self): pred_x, pred_y = infer_layer(x, y) # 1. prune y and loss - model_path = "model.multi_inout.output_spec2" - configs.output_spec = net.forward.outputs[:1] - fluid.dygraph.jit.save(net, model_path, [input_x], configs) + model_path = "multi_inout.output_spec2/model" + output_spec = net.forward.outputs[:1] + paddle.jit.save(net, model_path, [input_x], output_spec=output_spec) # 2. load again - infer_layer2 = fluid.dygraph.jit.load(model_path, configs=configs) + infer_layer2 = paddle.jit.load(model_path) # 3. predict pred_xx = infer_layer2(x) @@ -377,44 +373,6 @@ def setUp(self): paddle.manual_seed(SEED) paddle.framework.random._manual_program_seed(SEED) - def basic_save_load(self, layer, model_path, configs): - # 1. train & save - example_inputs, train_layer, _ = train(layer) - fluid.dygraph.jit.save( - layer=train_layer, - model_path=model_path, - input_spec=example_inputs, - configs=configs) - # 2. load - infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) - train_layer.eval() - # 3. inference & compare - x = fluid.dygraph.to_variable( - np.random.random((1, 784)).astype('float32')) - self.assertTrue( - np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy())) - - def test_model_filename(self): - layer = LinearNet(784, 1) - model_path = "model.save_load_config.output_spec" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.model_filename = "__simplenet__" - self.basic_save_load(layer, model_path, configs) - - def test_params_filename(self): - layer = LinearNet(784, 1) - model_path = "model.save_load_config.params_filename" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.params_filename = "__params__" - self.basic_save_load(layer, model_path, configs) - - def test_separate_params(self): - layer = LinearNet(784, 1) - model_path = "model.save_load_config.separate_params" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.separate_params = True - self.basic_save_load(layer, model_path, configs) - def test_output_spec(self): train_layer = LinearNetReturnLoss(8, 8) adam = fluid.optimizer.AdamOptimizer( @@ -427,27 +385,47 @@ def test_output_spec(self): adam.minimize(loss) train_layer.clear_gradients() - model_path = "model.save_load_config.output_spec" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [out] - fluid.dygraph.jit.save( + model_path = "save_load_config.output_spec" + output_spec = [out] + paddle.jit.save( layer=train_layer, - model_path=model_path, + path=model_path, input_spec=[x], - configs=configs) + output_spec=output_spec) train_layer.eval() - infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) + infer_layer = paddle.jit.load(model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) self.assertTrue( np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy())) + def 
test_save_no_support_config_error(self): + layer = LinearNet(784, 1) + path = "no_support_config_test" + with self.assertRaises(ValueError): + paddle.jit.save(layer=layer, path=path, model_filename="") + + def test_load_empty_model_filename_error(self): + path = "error_model_filename_test" + with self.assertRaises(ValueError): + paddle.jit.load(path, model_filename="") + + def test_load_empty_params_filename_error(self): + path = "error_params_filename_test" + with self.assertRaises(ValueError): + paddle.jit.load(path, params_filename="") + + def test_load_with_no_support_config(self): + path = "no_support_config_test" + with self.assertRaises(ValueError): + paddle.jit.load(path, separate_params=True) + class TestJitMultipleLoading(unittest.TestCase): def setUp(self): self.linear_size = 4 - self.model_path = "model.jit_multi_load" + self.model_path = "jit_multi_load/model" # enable dygraph mode fluid.enable_dygraph() # config seed @@ -459,8 +437,8 @@ def setUp(self): def train_and_save_orig_model(self): layer = LinearNet(self.linear_size, self.linear_size) example_inputs, layer, _ = train(layer, self.linear_size, 1) - fluid.dygraph.jit.save( - layer=layer, model_path=self.model_path, input_spec=example_inputs) + paddle.jit.save( + layer=layer, path=self.model_path, input_spec=example_inputs) def test_load_model_retransform_inference(self): multi_loaded_layer = MultiLoadingLinearNet(self.linear_size, @@ -475,7 +453,7 @@ def test_load_model_retransform_inference(self): class TestJitPruneModelAndLoad(unittest.TestCase): def setUp(self): self.linear_size = 4 - self.model_path = "model.jit_prune_model_and_load" + self.model_path = "jit_prune_model_and_load/model" # enable dygraph mode fluid.enable_dygraph() # config seed @@ -494,13 +472,12 @@ def train_and_save(self): adam.minimize(loss) train_layer.clear_gradients() - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [hidden] - fluid.dygraph.jit.save( + output_spec = [hidden] + paddle.jit.save( layer=train_layer, - model_path=self.model_path, + path=self.model_path, input_spec=[x], - configs=configs) + output_spec=output_spec) return train_layer @@ -508,7 +485,7 @@ def test_load_pruned_model(self): train_layer = self.train_and_save() train_layer.eval() - infer_layer = fluid.dygraph.jit.load(self.model_path) + infer_layer = paddle.jit.load(self.model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) @@ -519,7 +496,7 @@ def test_load_var_not_in_extra_var_info(self): self.train_and_save() # chage extra var info - var_info_path = os.path.join(self.model_path, EXTRA_VAR_INFO_FILENAME) + var_info_path = self.model_path + INFER_PARAMS_INFO_SUFFIX with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) extra_var_info.clear() @@ -527,7 +504,7 @@ def test_load_var_not_in_extra_var_info(self): pickle.dump(extra_var_info, f, protocol=2) with self.assertRaises(RuntimeError): - fluid.dygraph.jit.load(self.model_path) + paddle.jit.load(self.model_path) class TestJitSaveMultiCases(unittest.TestCase): @@ -561,7 +538,7 @@ def test_no_prune_to_static_after_train(self): train(layer) - model_path = "test_no_prune_to_static_after_train" + model_path = "test_no_prune_to_static_after_train/model" paddle.jit.save(layer, model_path) self.verify_inference_correctness(layer, model_path) @@ -569,7 +546,7 @@ def test_no_prune_to_static_after_train(self): def test_no_prune_to_static_no_train(self): layer = LinearNetWithInputSpec(784, 1) - model_path = "test_no_prune_to_static_no_train" + model_path = 
"test_no_prune_to_static_no_train/model" paddle.jit.save(layer, model_path) self.verify_inference_correctness(layer, model_path) @@ -579,7 +556,7 @@ def test_no_prune_no_to_static_after_train(self): train(layer) - model_path = "test_no_prune_no_to_static_after_train" + model_path = "test_no_prune_no_to_static_after_train/model" paddle.jit.save( layer, model_path, @@ -593,16 +570,15 @@ def test_no_prune_no_to_static_after_train_with_examples(self): example_inputs, _, _ = train(layer) - model_path = "test_no_prune_no_to_static_after_train_with_examples" - fluid.dygraph.jit.save( - layer=layer, model_path=model_path, input_spec=example_inputs) + model_path = "test_no_prune_no_to_static_after_train_with_examples/model" + paddle.jit.save(layer=layer, path=model_path, input_spec=example_inputs) self.verify_inference_correctness(layer, model_path) def test_no_prune_no_to_static_no_train(self): layer = LinearNetNotDeclarative(784, 1) - model_path = "test_no_prune_no_to_static_no_train" + model_path = "test_no_prune_no_to_static_no_train/model" paddle.jit.save( layer, model_path, @@ -616,9 +592,7 @@ def test_prune_to_static_after_train(self): out = train_with_label(layer) - model_path = "test_prune_to_static_after_train" - configs = paddle.SaveLoadConfig() - configs.output_spec = [out] + model_path = "test_prune_to_static_after_train/model" paddle.jit.save( layer, model_path, @@ -626,18 +600,17 @@ def test_prune_to_static_after_train(self): InputSpec( shape=[None, 784], dtype='float32', name="image") ], - configs=configs) + output_spec=[out]) self.verify_inference_correctness(layer, model_path, True) def test_prune_to_static_no_train(self): layer = LinerNetWithLabel(784, 1) - model_path = "test_prune_to_static_no_train" - configs = paddle.SaveLoadConfig() + model_path = "test_prune_to_static_no_train/model" # TODO: no train, cannot get output_spec var here # now only can use index - configs.output_spec = layer.forward.outputs[:1] + output_spec = layer.forward.outputs[:1] paddle.jit.save( layer, model_path, @@ -645,7 +618,7 @@ def test_prune_to_static_no_train(self): InputSpec( shape=[None, 784], dtype='float32', name="image") ], - configs=configs) + output_spec=output_spec) self.verify_inference_correctness(layer, model_path, True) @@ -654,7 +627,7 @@ def test_no_prune_input_spec_name_warning(self): train(layer) - model_path = "test_no_prune_input_spec_name_warning" + model_path = "test_no_prune_input_spec_name_warning/model" paddle.jit.save( layer, model_path, @@ -675,18 +648,16 @@ def test_not_prune_output_spec_name_warning(self): train(layer) - model_path = "test_not_prune_output_spec_name_warning" - configs = paddle.SaveLoadConfig() + model_path = "test_not_prune_output_spec_name_warning/model" out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) - configs.output_spec = [out] - paddle.jit.save(layer, model_path, configs=configs) + paddle.jit.save(layer, model_path, output_spec=[out]) self.verify_inference_correctness(layer, model_path) def test_prune_input_spec_name_error(self): layer = LinerNetWithLabel(784, 1) - model_path = "test_prune_input_spec_name_error" + model_path = "test_prune_input_spec_name_error/model" with self.assertRaises(ValueError): paddle.jit.save( layer, @@ -707,10 +678,8 @@ def test_prune_output_spec_name_error(self): train_with_label(layer) - model_path = "test_prune_to_static_after_train" - configs = paddle.SaveLoadConfig() + model_path = "test_prune_to_static_after_train/model" out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) - 
configs.output_spec = [out] with self.assertRaises(ValueError): paddle.jit.save( layer, @@ -719,12 +688,12 @@ def test_prune_output_spec_name_error(self): InputSpec( shape=[None, 784], dtype='float32', name="image") ], - configs=configs) + output_spec=[out]) class TestJitSaveLoadEmptyLayer(unittest.TestCase): def setUp(self): - self.model_path = "model.jit_save_load_empty_layer" + self.model_path = "jit_save_load_empty_layer/model" # enable dygraph mode paddle.disable_static() @@ -740,7 +709,7 @@ def test_save_load_empty_layer(self): class TestJitSaveLoadNoParamLayer(unittest.TestCase): def setUp(self): - self.model_path = "model.jit_save_load_no_param_layer" + self.model_path = "jit_save_load_no_param_layer/model" # enable dygraph mode paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index fdc1e6b52aba1..35ad6fdb30e7b 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -63,6 +63,8 @@ def setUp(self): self.epoch_num = 1 self.batch_size = 128 self.batch_num = 10 + # enable static mode + paddle.enable_static() def train_and_save_model(self, only_params=False): with new_program_scope(): @@ -136,13 +138,12 @@ def test_load_with_model_filename(self): self.params_filename = None orig_param_dict = self.train_and_save_model() - config = paddle.SaveLoadConfig() - config.separate_params = True - config.model_filename = self.model_filename - load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) + load_param_dict, _ = fluid.load_dygraph( + self.save_dirname, model_filename=self.model_filename) self.check_load_state_dict(orig_param_dict, load_param_dict) - new_load_param_dict = paddle.load(self.save_dirname, config) + new_load_param_dict = paddle.load( + self.save_dirname, model_filename=self.model_filename) self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_with_param_filename(self): @@ -151,12 +152,12 @@ def test_load_with_param_filename(self): self.params_filename = "static_mnist.params" orig_param_dict = self.train_and_save_model() - config = paddle.SaveLoadConfig() - config.params_filename = self.params_filename - load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) + load_param_dict, _ = fluid.load_dygraph( + self.save_dirname, params_filename=self.params_filename) self.check_load_state_dict(orig_param_dict, load_param_dict) - new_load_param_dict = paddle.load(self.save_dirname, config) + new_load_param_dict = paddle.load( + self.save_dirname, params_filename=self.params_filename) self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_with_model_and_param_filename(self): @@ -165,13 +166,16 @@ def test_load_with_model_and_param_filename(self): self.params_filename = "static_mnist.params" orig_param_dict = self.train_and_save_model() - config = paddle.SaveLoadConfig() - config.params_filename = self.params_filename - config.model_filename = self.model_filename - load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) + load_param_dict, _ = fluid.load_dygraph( + self.save_dirname, + params_filename=self.params_filename, + model_filename=self.model_filename) self.check_load_state_dict(orig_param_dict, load_param_dict) - new_load_param_dict = paddle.load(self.save_dirname, config) + new_load_param_dict = paddle.load( + self.save_dirname, + 
params_filename=self.params_filename, + model_filename=self.model_filename) self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_state_dict_from_save_params(self): diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 2ce442add2e02..7e2f0eb2fb8bb 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -20,8 +20,8 @@ ] __all__ += [ - 'grad', 'LayerList', 'load', 'save', 'SaveLoadConfig', 'to_variable', - 'no_grad', 'DataParallel' + 'grad', 'LayerList', 'load', 'save', 'to_variable', 'no_grad', + 'DataParallel' ] __all__ += [ @@ -50,7 +50,6 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS from .io import save from .io import load -from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 7175f3101448f..c196c1d689bfe 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -26,7 +26,9 @@ from paddle import fluid from paddle.fluid import core from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer -from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.jit import _SaveLoadConfig +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX __all__ = [ 'save', @@ -55,19 +57,16 @@ def _load_state_dict_from_save_inference_model(model_path, config): # 2. load layer parameters & buffers with fluid.dygraph.guard(): persistable_var_dict = _construct_params_and_buffers( - model_path, - programs, - config.separate_params, - config.params_filename, - append_suffix=False) + model_path, programs, config.params_filename, append_suffix=False) # 3. construct state_dict load_param_dict = dict() for var_name in persistable_var_dict: load_param_dict[var_name] = persistable_var_dict[var_name].numpy() - # if __variables.info__ exists, we can recover structured_name - var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + # if *.info exists, we can recover structured_name + var_info_filename = str(config.params_filename) + ".info" + var_info_path = os.path.join(model_path, var_info_filename) if os.path.exists(var_info_path): with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) @@ -116,12 +115,99 @@ def _load_state_dict_from_save_params(model_path): return load_param_dict +# NOTE(chenweihang): [ Handling of use cases of API paddle.load ] +# `paddle.load` may be used to load saved results of: +# 1. Expected cases: +# - need [full filename] when loading +# - paddle.save +# - paddle.static.save +# - paddle.fluid.save_dygraph +# - need [prefix] when loading [compatible for paddle 2.x] +# - paddle.jit.save +# - paddle.static.save_inference_model +# - need [directory] when loading [compatible for paddle 1.x] +# - paddle.fluid.io.save_inference_model +# - paddle.fluid.io.save_params/save_persistable +# 2. 
Error cases: +# - no error case +def _build_load_path_and_config(path, config): + # NOTE(chenweihang): If both [prefix save format] and [directory save format] exist, + # raise error, avoid confusing behavior + prefix_format_path = path + INFER_MODEL_SUFFIX + prefix_format_exist = os.path.exists(prefix_format_path) + directory_format_exist = os.path.isdir(path) + if prefix_format_exist and directory_format_exist: + raise ValueError( + "The %s.pdmodel and %s directory exist at the same time, " + "don't know which one to load, please make sure that the specified target " + "of ``path`` is unique." % (path, path)) + elif not prefix_format_exist and not directory_format_exist: + error_msg = "The ``path`` (%s) to load model not exists." + # if current path is a prefix, and the path.pdparams or path.pdopt + # is exist, users may want use `paddle.load` load the result of + # `fluid.save_dygraph`, we raise error here for users + params_file_path = path + ".pdparams" + opti_file_path = path + ".pdopt" + if os.path.exists(params_file_path) or os.path.exists(opti_file_path): + error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \ + "please specify the full file name, not just the file name prefix. For " \ + "example, it should be written as `paddle.load('model.pdparams')` instead of " \ + "`paddle.load('model')`." + raise ValueError(error_msg % path) + else: + if prefix_format_exist: + file_prefix = os.path.basename(path) + model_path = os.path.dirname(path) + if config.model_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``model_filename`` config does " + "not take effect.") + config.model_filename = file_prefix + INFER_MODEL_SUFFIX + if config.params_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``params_filename`` config does " + "not take effect.") + config.params_filename = file_prefix + INFER_PARAMS_SUFFIX + else: + # Compatible with the old save_inference_model format + model_path = path + + return model_path, config + + +def _parse_load_config(configs): + supported_configs = ['model_filename', 'params_filename', 'keep_name_table'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.load` is not supported." + % key) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.model_filename = configs.get('model_filename', None) + inner_config.params_filename = configs.get('params_filename', None) + inner_config.keep_name_table = configs.get('keep_name_table', None) + + return inner_config + + def save(obj, path): ''' Save an object to the specified path. .. note:: Now only supports save ``state_dict`` of Layer or Optimizer. + + .. note:: + ``paddle.save`` will not add a suffix to the saved results, + but we recommend that you use the following paddle standard suffixes: + 1. for ``Layer.state_dict`` -> ``.pdparams`` + 2. for ``Optimizer.state_dict`` -> ``.pdopt`` Args: obj(Object) : The object to be saved. @@ -178,7 +264,7 @@ def save(obj, path): pickle.dump(saved_obj, f, protocol=2) -def load(path, config=None): +def load(path, **configs): ''' Load an object can be used in paddle from specified path. @@ -186,21 +272,39 @@ def load(path, config=None): Now only supports load ``state_dict`` of Layer or Optimizer. .. 
note:: - ``paddle.load`` supports loading ``state_dict`` from the result of several - paddle1.x save APIs in static mode, but due to some historical reasons, - if you load ``state_dict`` from the saved result of - ``paddle.static.save_inference_model/paddle.fluid.io.save_params/paddle.fluid.io.save_persistables`` , + ``paddle.load`` supports loading ``state_dict`` of Layer or Optimizer from + the result of save APIs other than ``paddle.save`` , but the argument + ``path`` format is different: + 1. loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` , + ``path`` needs to be a complete file name, such as ``model.pdparams`` or + ``model.pdopt`` ; + 2. loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model`` + or ``paddle.Model().save(training=False)`` , ``path`` needs to be a file prefix, + such as ``model/mnist``, and ``paddle.load`` will get information from + ``mnist.pdmodel`` and ``mnist.pdiparams`` ; + 3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or + ``paddle.fluid.io.save_params/save_persistables`` , ``path`` needs to be a + directory, such as ``model``, where ``model`` is a directory. + + .. note:: + If you load ``state_dict`` from the saved result of + ``paddle.static.save`` or ``paddle.static.save_inference_model`` , the structured variable name cannot be restored. You need to set the argument ``use_structured_name=False`` when using ``Layer.set_state_dict`` later. Args: path(str) : The path to load the target object. Generally, the path is the target - file path, when compatible with loading the saved results of - ``paddle.jit.save/paddle.static.save_inference_model`` , the path is a directory. - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` - object that specifies additional configuration options, these options - are for compatibility with ``paddle.jit.save/paddle.static.save_inference_model`` - formats. Default None. + file path. When loading results saved by other APIs for compatibility, the path + can be a file prefix or a directory. + **configs (dict, optional): other load configuration options for compatibility. We do not + recommend using these configurations; they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) model_filename (string): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (string): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name; variables are saved separately + by default. Returns: Object(Object): a target object that can be used in paddle @@ -227,26 +331,9 @@ def load(path, config=None): load_layer_state_dict = paddle.load("emb.pdparams") load_opt_state_dict = paddle.load("adam.pdopt") ''' - # 1. input check - if not os.path.exists(path): - error_msg = "The path `%s` does not exist." - # if current path is a prefix, and the path.pdparams or path.pdopt - # is exist, users may want use `paddle.load` load the result of - # `fluid.save_dygraph`, we raise error here for users - params_file_path = path + ".pdparams" - opti_file_path = path + ".pdopt" - if os.path.exists(params_file_path) or os.path.exists(opti_file_path): - error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \ - "please specify the full file name, not just the file name prefix. 
For " \ - "example, it should be written as `paddle.load('model.pdparams')` instead of " \ - "`paddle.load('model')`." - raise ValueError(error_msg % path) - - if config is None: - config = paddle.SaveLoadConfig() - - # 2. load target load_result = None + config = _parse_load_config(configs) + if os.path.isfile(path): # we think path is file means this file is created by paddle.save with open(path, 'rb') as f: @@ -255,16 +342,15 @@ def load(path, config=None): if not config.keep_name_table and "StructuredToParameterName@@" in load_result: del load_result["StructuredToParameterName@@"] - elif os.path.isdir(path): - # we think path is directory means compatible with loading - # store results of static mode related save APIs - + else: + # file prefix and directory are compatible cases + model_path, config = _build_load_path_and_config(path, config) # check whether model file exists if config.model_filename is None: model_filename = '__model__' else: model_filename = config.model_filename - model_file_path = os.path.join(path, model_filename) + model_file_path = os.path.join(model_path, model_filename) if os.path.exists(model_file_path): # Load state dict by `jit.save/io.save_inference_model` save format @@ -274,7 +360,7 @@ def load(path, config=None): # `save_inference_model` not save structured name, we need to remind # the user to configure the `use_structured_name` argument when `set_state_dict` # NOTE(chenweihang): `jit.save` doesn't save optimizer state - load_result = _load_state_dict_from_save_inference_model(path, + load_result = _load_state_dict_from_save_inference_model(model_path, config) else: # load state dict by `io.save_params/persistables` save format @@ -283,9 +369,6 @@ def load(path, config=None): # mapping info will lost, so users need to give variable list, but users build # variable list in dygraph mode is difficult, we recommend users to use # paddle.static.load_program_state in this case - load_result = _load_state_dict_from_save_params(path) - else: - raise ValueError( - "Unsupported path format, now only supports file or directory.") + load_result = _load_state_dict_from_save_params(model_path) return load_result From 81d3992c4778fa3e54f79749254ba4c45d9ee23f Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 12 Oct 2020 10:05:25 +0800 Subject: [PATCH 64/91] Fix docker build error (#27735) --- python/unittest_py/requirements.txt | 1 + tools/dockerfile/Dockerfile.ubuntu | 7 +------ tools/dockerfile/ubuntu16_dev.sh | 5 +++++ tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 | 6 +----- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 389d45fc6b95e..b61ba138441c9 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -2,4 +2,5 @@ PyGithub coverage pycrypto ; platform_system != "Windows" mock +opencv-python<=4.2.0.32 visualdl ; python_version>="3.5" diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index 9fe58885fa553..9b5602d4943ad 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -156,19 +156,14 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ 
pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \ pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip --no-cache-dir install opencv-python==4.2.0.32 + pip --no-cache-dir install 'ipykernel==4.6.0' #For docstring checker RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ diff --git a/tools/dockerfile/ubuntu16_dev.sh b/tools/dockerfile/ubuntu16_dev.sh index e7827b6598eeb..212e9acfea541 100755 --- a/tools/dockerfile/ubuntu16_dev.sh +++ b/tools/dockerfile/ubuntu16_dev.sh @@ -28,11 +28,13 @@ function ref_whl(){ ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl else ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl fi if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then @@ -40,11 +42,13 @@ function ref_whl(){ ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl else ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl fi } @@ -55,6 +59,7 @@ function install_whl(){ sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle3_whl} && pip3.5 install ${ref_paddle3_whl} && rm -f ${ref_paddle3_whl}" Dockerfile.tmp sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle36_whl} && pip3.6 install ${ref_paddle36_whl} && rm -f ${ref_paddle36_whl}" Dockerfile.tmp sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle37_whl} && pip3.7 install ${ref_paddle37_whl} && rm -f ${ref_paddle37_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle38_whl} && pip3.8 
install ${ref_paddle38_whl} && rm -f ${ref_paddle38_whl}" Dockerfile.tmp } function install_gcc(){ diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 index c27fdcea2401c..55c30579fb91e 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -174,16 +174,12 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0 && \ - pip3 --no-cache-dir install opencv-python && \ pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.6 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0 && \ - pip3.6 --no-cache-dir install opencv-python && \ pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.7 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0 && \ - pip3.7 --no-cache-dir install opencv-python && \ pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ - pip --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0 && \ - pip --no-cache-dir install opencv-python + pip --no-cache-dir install ipykernel==4.6.0 #For docstring checker RUN pip3 --no-cache-dir install pylint pytest astroid isort From 8fabb1c32fec435f08a18151422ac9566f563cde Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Mon, 12 Oct 2020 10:10:25 +0800 Subject: [PATCH 65/91] Add test attribute in channelwise_quant op, test=develop (#27742) * Add test attribute in channelwise_quant op, test=develop --- paddle/fluid/operators/fake_quantize_op.cc | 4 ++++ paddle/fluid/operators/fake_quantize_op.h | 9 +++++--- .../slim/quantization/quantization_pass.py | 22 ++++++++++++++----- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index e9b4c7dacf8b4..04fa8db9a5a6f 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -404,6 +404,10 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker "the received is %d", bit_length)); }); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddComment(R"DOC( The scale of FakeChannelWiseQuantize operator is a vector. In detail, each channel of the input X has a scale value. 
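As background for this quantization change (the new ``is_test`` attribute added above, the kernel change in fake_quantize_op.h just below, and the clipping added to ``_quant`` in quantization_pass.py further down), here is a small NumPy sketch of channel-wise abs-max quantization. It is illustrative only and not part of the patch; the function name and the epsilon guard are my own.

```python
import numpy as np

def channelwise_quant_abs_max(weight, num_bits=8):
    """Quantize a weight tensor along axis 0: one abs-max scale per channel."""
    bnt = (1 << (num_bits - 1)) - 1                    # e.g. 127 for 8 bits
    scales = np.abs(weight).reshape(weight.shape[0], -1).max(axis=1)
    quantized = np.empty_like(weight)
    for i, s in enumerate(scales):
        s = max(float(s), 1e-8)                        # guard against an all-zero channel
        clipped = np.clip(weight[i], -s, s)            # the clip step this patch adds
        quantized[i] = np.round(clipped / s * bnt)
    return quantized, scales
```

With ``is_test`` left false the operator recomputes these per-channel scales on every call; with ``is_test=True`` the kernel below skips the scale search and only clips and rounds, which is the inference-only behavior the new attribute enables.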
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 2f5afbe0eedf9..94a75f930beba 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -146,16 +146,19 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { auto* out = context.Output("Out"); auto* out_scale = context.Output("OutScale"); - T* out_scale_data = out_scale->mutable_data(context.GetPlace()); out->mutable_data(context.GetPlace()); int bit_length = context.Attr("bit_length"); int bin_cnt = std::pow(2, bit_length - 1) - 1; int quant_axis = context.Attr("quant_axis"); + bool is_test = context.Attr("is_test"); auto& dev_ctx = context.template device_context(); - FindChannelAbsMaxFunctor()(dev_ctx, *in, quant_axis, - out_scale_data); + if (!is_test) { + T* out_scale_data = out_scale->mutable_data(context.GetPlace()); + FindChannelAbsMaxFunctor()(dev_ctx, *in, quant_axis, + out_scale_data); + } ChannelClipAndFakeQuantFunctor()( dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out); } diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index b5a8d90194331..eba881a2637ae 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -758,6 +758,7 @@ def _insert_channel_quant_op(self, graph, var_node, name, quant_bits, attrs={ 'bit_length': quant_bits, 'quant_axis': quant_axis, + 'is_test': self._is_test, 'op_role': core.op_proto_and_checker_maker.OpRole.Forward }, inputs={'X': var_node}, @@ -1125,7 +1126,7 @@ def apply(self, graph): self._restore_var(input_arg_name, quantized_param_v) self._remove_fake_quant_and_dequant_op(graph, op_node) -# Remove all fake dequant op + # Remove all fake dequant op ops = graph.all_op_nodes() for op_node in ops: op_name = op_node.name() @@ -1331,16 +1332,25 @@ def _is_float(self, v): def _quant(self, x, scale, num_bits, quant_axis): assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.' 
+ bnt = (1 << (num_bits - 1)) - 1 + + def _clip(x, scale): + x[x > scale] = scale + x[x < -scale] = -scale + return x + if isinstance(scale, list): for i, s in enumerate(scale): if quant_axis == 0: - x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1)) + x[i] = _clip(x[i], s) + x[i] = np.round(x[i] / s * bnt) else: - x[:, i] = np.round(x[:, i] / s * ( - (1 << (num_bits - 1)) - 1)) - return x + x[:, i] = _clip(x[:, i], s) + x[:, i] = np.round(x[:, i] / s * bnt) else: - return np.round(x / scale * ((1 << (num_bits - 1)) - 1)) + x = _clip(x, scale) + x = np.round(x / scale * bnt) + return x class ConvertToInt8Pass(object): From 0a1862d1d25297dea8cf16fcaa7e70d9eaa3c57f Mon Sep 17 00:00:00 2001 From: WangXi Date: Mon, 12 Oct 2020 10:15:58 +0800 Subject: [PATCH 66/91] fleet combine amp dgc recompute meta optimizer (#27643) --- .../fleet/base/distributed_strategy.py | 4 +- .../fleet/meta_optimizers/amp_optimizer.py | 57 ++++-- .../fleet/meta_optimizers/dgc_optimizer.py | 7 + .../fleet/meta_optimizers/lamb_optimizer.py | 4 + .../fleet/meta_optimizers/lars_optimizer.py | 4 + .../meta_optimizers/localsgd_optimizer.py | 4 +- .../meta_optimizers/recompute_optimizer.py | 26 ++- .../contrib/mixed_precision/decorator.py | 100 ++++++----- python/paddle/fluid/optimizer.py | 15 +- .../unittests/fleet_meta_optimizer_base.py | 122 +++++++++++++ .../tests/unittests/test_dgc_optimizer.py | 17 +- .../test_fleet_amp_meta_optimizer.py | 110 ++++++++---- .../test_fleet_dgc_meta_optimizer.py | 135 +++++++++------ .../test_fleet_localsgd_meta_optimizer.py | 132 +++++++------- .../test_fleet_recompute_meta_optimizer.py | 162 ++++++++++++++---- 15 files changed, 644 insertions(+), 255 deletions(-) create mode 100755 python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 1fc29ad042883..c7798b15c67fe 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -744,13 +744,13 @@ def adaptive_localsgd(self): strategy.adaptive_localsgd = True # by default this is false """ - return self.strategy.localsgd + return self.strategy.adaptive_localsgd @adaptive_localsgd.setter @is_strict_auto def adaptive_localsgd(self, flag): if isinstance(flag, bool): - self.strategy.localsgd = flag + self.strategy.adaptive_localsgd = flag else: print("WARNING: adaptive_localsgd should have value of bool type") diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index ad96e1426694f..283589c5f3320 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -19,16 +19,14 @@ class AMPOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(AMPOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.amp_opt = None + self.wrapped_opt = None # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer", - "LocalSGDOptimizer", "GradientMergeOptimizer", "GraphExecutionOptimizer", - "AdaptiveLocalSGDOptimizer", ] self.meta_optimizers_black_list = ["DGCOptimizer"] @@ -37,6 +35,24 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, super(AMPOptimizer, self)._set_basic_info( loss, role_maker, 
user_defined_optimizer, user_defined_strategy) + def _init_wrapped_opt(self): + if self.wrapped_opt is not None: + return + + config = self.user_defined_strategy.amp_configs + + custom_white_list = set(config['custom_white_list']) + custom_black_list = set(config['custom_black_list']) + custom_black_varnames = set(config['custom_black_varnames']) + amp_lists = mixed_precision.AutoMixedPrecisionLists( + custom_white_list, custom_black_list, custom_black_varnames) + + self.wrapped_opt = mixed_precision.decorate( + self.inner_opt, amp_lists, config['init_loss_scaling'], + config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'], + config['incr_ratio'], config['decr_ratio'], + config['use_dynamic_loss_scaling']) + def _can_apply(self): if not self.role_maker._is_collective: return False @@ -60,26 +76,31 @@ def _enable_strategy(self, dist_strategy, context): "use_dynamic_loss_scaling": True } + def backward(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + # maybe inner_opt of other meta optimizer + self._init_wrapped_opt() + return self.wrapped_opt.backward(loss, startup_program, parameter_list, + no_grad_set, callbacks) + + def apply_gradients(self, params_grads): + return self.wrapped_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.wrapped_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - if self.amp_opt is None: - config = self.user_defined_strategy.amp_configs - custom_white_list = set(config['custom_white_list']) - custom_black_list = set(config['custom_black_list']) - custom_black_varnames = set(config['custom_black_varnames']) - amp_lists = mixed_precision.AutoMixedPrecisionLists( - custom_white_list, custom_black_list, custom_black_varnames) - - self.amp_opt = mixed_precision.decorate( - self.inner_opt, amp_lists, config['init_loss_scaling'], - config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'], - config['incr_ratio'], config['decr_ratio'], - config['use_dynamic_loss_scaling']) - + self._init_wrapped_opt() optimize_ops, params_grads = \ - self.amp_opt.minimize(loss, startup_program, + self.wrapped_opt.minimize(loss, startup_program, parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 6806a479d30f4..9990021c8506a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -85,6 +85,13 @@ def backward(self, return self.dgc_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + def apply_gradients(self, params_grads): + return self.dgc_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.dgc_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index df9887759e16f..64d54ae3bab03 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ 
-98,6 +98,10 @@ def backward(self, def apply_gradients(self, params_grads): return self.lamb_opt.apply_gradients(params_grads=params_grads) + def apply_optimize(self, loss, startup_program, params_grads): + return self.lamb_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index 609d8b85e714c..32c6be505a546 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -85,6 +85,10 @@ def backward(self, def apply_gradients(self, params_grads): return self.lars_opt.apply_gradients(params_grads=params_grads) + def apply_optimize(self, loss, startup_program, params_grads): + return self.lars_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 9f094978d842a..91030f0762934 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -24,7 +24,7 @@ class LocalSGDOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(LocalSGDOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.meta_optimizers_white_list = [] + self.meta_optimizers_white_list = ['AMPOptimizer'] self.meta_optimizers_black_list = [ "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer", @@ -195,7 +195,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(AdaptiveLocalSGDOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.meta_optimizers_white_list = [] + self.meta_optimizers_white_list = ['AMPOptimizer'] self.meta_optimizers_black_list = [ "GraphExecutionOptimizer", "LocalSGDOptimizer" ] diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 59ca7e633099e..ea2b67ac4bd1f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -18,15 +18,14 @@ class RecomputeOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(RecomputeOptimizer, self).__init__(optimizer) - #self.inner_opt = RO(optimizer) self.inner_opt = optimizer - self.wrapped_opt = RO(optimizer) + self.wrapped_opt = None # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "LarsOptimizer", "LambOptimizer", - "GradientMergeOptimizer", "GraphExecutionOptimizer", + "DGCOptimizer", ] self.meta_optimizers_black_list = [] @@ -34,8 +33,15 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, user_defined_strategy): super(RecomputeOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) - self.wrapped_opt._set_checkpoints( - list(user_defined_strategy.recompute_configs["checkpoints"])) + + def _init_wrapped_opt(self): + if self.wrapped_opt is not None: + return + + configs = self.user_defined_strategy.recompute_configs + + self.wrapped_opt = RO(self.inner_opt) + 
self.wrapped_opt._set_checkpoints(list(configs["checkpoints"])) def _can_apply(self): if not self.role_maker._is_collective: @@ -62,14 +68,24 @@ def backward(self, parameter_list=None, no_grad_set=None, callbacks=None): + # maybe inner_opt of other meta optimizer + self._init_wrapped_opt() return self.wrapped_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + def apply_gradients(self, params_grads): + return self.wrapped_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.wrapped_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): + self._init_wrapped_opt() optimize_ops, params_grads = \ self.wrapped_opt.minimize(loss, startup_program, parameter_list, no_grad_set) diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index c9112ac849ce0..529c664e7083c 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -16,6 +16,7 @@ from ... import default_startup_program from ... import layers from ... import unique_name +from ... import program_guard from . import fp16_utils from .fp16_utils import rewrite_program from .fp16_utils import update_role_var_grad @@ -58,21 +59,40 @@ def __init__(self, optimizer, amp_lists, init_loss_scaling, self._optimizer = optimizer self._amp_lists = amp_lists self._param_grads = None - self._train_program = default_main_program() - self._startup_prog = default_startup_program() + self._train_program = None + self._scaled_loss = None - self._loss_scaling = layers.create_global_var( - name=unique_name.generate("loss_scaling"), - shape=[1], - value=init_loss_scaling, - dtype='float32', - persistable=True) + self._loss_scaling = None + self._init_loss_scaling = init_loss_scaling self._use_dynamic_loss_scaling = use_dynamic_loss_scaling if self._use_dynamic_loss_scaling: self._incr_every_n_steps = incr_every_n_steps self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf self._incr_ratio = incr_ratio self._decr_ratio = decr_ratio + self._num_good_steps = None + self._num_bad_steps = None + + def get_loss_scaling(self): + """Return the real-time loss scaling factor. + """ + return self._loss_scaling + + def get_scaled_loss(self): + """Return the scaled loss. + It's useful when you feed customed loss into executor. 
+ """ + return self._scaled_loss + + def _init_amp_var(self): + self._loss_scaling = layers.create_global_var( + name=unique_name.generate("loss_scaling"), + shape=[1], + value=self._init_loss_scaling, + dtype='float32', + persistable=True) + + if self._use_dynamic_loss_scaling: self._num_good_steps = layers.create_global_var( name=unique_name.generate("num_good_steps"), shape=[1], @@ -86,28 +106,16 @@ def __init__(self, optimizer, amp_lists, init_loss_scaling, dtype='int32', persistable=True) - # Ensure the data type of learning rate vars is float32 (same as the + # Ensure the data type of learning rate vars is float32 (same as the # master parameter dtype) - if isinstance(optimizer._learning_rate, float): - optimizer._learning_rate_map[default_main_program()] = \ - layers.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(optimizer._learning_rate), - dtype='float32', - persistable=True) - - def get_loss_scaling(self): - """Return the real-time loss scaling factor. - """ - return self._loss_scaling - - def get_scaled_loss(self): - """Return the scaled loss. - It's useful when you feed customed loss into executor. - """ - - return self._scaled_loss + if isinstance(self._optimizer._learning_rate, float): + self._optimizer._learning_rate_map[default_main_program()] = \ + layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._optimizer._learning_rate), + dtype='float32', + persistable=True) def backward(self, loss, @@ -131,16 +139,21 @@ def backward(self, A list of (param, grad), which is a tuple of a parameter and its gradient respectively, and the scaled loss. """ - rewrite_program(self._train_program, self._amp_lists) - self._scaled_loss = loss * self._loss_scaling - self._params_grads = self._optimizer.backward( - self._scaled_loss, startup_program, parameter_list, no_grad_set, - callbacks) - # Change the op_role_var attr for some ops, so that gradients - # transferred across GPUs can be FP16. - update_role_var_grad(self._train_program, self._params_grads) - - return self._params_grads + train_program = loss.block.program + self._train_program = train_program + + with program_guard(train_program, startup_program): + self._init_amp_var() + + rewrite_program(train_program, self._amp_lists) + self._scaled_loss = loss * self._loss_scaling + params_grads = self._optimizer.backward( + self._scaled_loss, startup_program, parameter_list, no_grad_set, + callbacks) + # Change the op_role_var attr for some ops, so that gradients + # transferred across GPUs can be FP16. 
+ update_role_var_grad(train_program, params_grads) + return params_grads def apply_gradients(self, params_grads): """ @@ -182,6 +195,12 @@ def apply_gradients(self, params_grads): return optimize_ops + def apply_optimize(self, loss, startup_program, params_grads): + program = loss.block.program + with program_guard(program, startup_program): + optimize_ops = self.apply_gradients(params_grads) + return optimize_ops + def minimize(self, loss, startup_program=None, @@ -207,7 +226,8 @@ def minimize(self, parameter_list=parameter_list, no_grad_set=no_grad_set) - optimize_ops = self.apply_gradients(scaled_params_grads) + optimize_ops = self.apply_optimize(loss, startup_program, + scaled_params_grads) return optimize_ops, scaled_params_grads diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 4a9ce4454af0b..367be181f4725 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -731,9 +731,6 @@ def _process_distribute_lookuptable(self, param_grads): outputs={"ParamOut": param_and_grad[0]}) return new_param_grads, (table_param, table_grad), sgd_op - def _append_dgc_ops(self, param_and_grad): - pass - def backward(self, loss, startup_program=None, @@ -801,9 +798,6 @@ def backward(self, with program_guard(program, startup_program): params_grads = append_backward(loss, parameter_list, act_no_grad_set, callbacks) - # Note: since we can't use all_reduce_op now, - # dgc_op should be the last op of one grad. - self._append_dgc_ops(params_grads) return params_grads def apply_gradients(self, params_grads): @@ -1569,6 +1563,11 @@ def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var, @imperative_base.no_grad def apply_gradients(self, params_grads): + # Note: since we can't use all_reduce_op now, + # dgc_op should be the last op of one grad. + # Maybe need a grad allreduce pass. + self._append_dgc_ops(params_grads) + params_grads = sorted(params_grads, key=lambda x: x[0].name) params_grads, table_param_and_grad, table_optimize_op = \ self._process_distribute_lookuptable(params_grads) @@ -4784,10 +4783,6 @@ def mlp(input_x, input_y, hid_dim=128, label_dim=2): params_grads = append_backward( loss, parameter_list, no_grad_set, checkpoints=checkpoint_vars) - # Note: since we can't use all_reduce_op now, - # dgc_op should be the last op of one grad. - if hasattr(self._optimizer, "_append_dgc_ops"): - self._optimizer._append_dgc_ops(params_grads) return params_grads def apply_optimize(self, loss, startup_program, params_grads): diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py new file mode 100755 index 0000000000000..e7cdd49a32c26 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -0,0 +1,122 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
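The decorator change above defers creation of the loss-scaling variables into ``backward`` (under ``program_guard`` of the program that owns the loss) and adds ``apply_optimize``, so the decorated optimizer no longer assumes ``default_main_program()``. A minimal sketch of driving it against an explicit program pair follows; it is not part of the patch, and the toy network, names, and shapes are arbitrary.

```python
import paddle
import paddle.fluid as fluid
import paddle.fluid.contrib.mixed_precision as mixed_precision

paddle.enable_static()
train_prog, startup_prog = fluid.Program(), fluid.Program()

with fluid.program_guard(train_prog, startup_prog):
    x = fluid.data(name='x', shape=[None, 32], dtype='float32')
    y = fluid.data(name='y', shape=[None, 1], dtype='int64')
    fc = fluid.layers.fc(input=x, size=2, act='softmax')
    loss = fluid.layers.mean(fluid.layers.cross_entropy(input=fc, label=y))

    opt = mixed_precision.decorate(
        fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9))
    # The loss-scaling variable and good/bad step counters are now created here,
    # inside backward(), in train_prog/startup_prog rather than the global default.
    params_grads = opt.backward(loss, startup_prog)
    opt.apply_gradients(params_grads)
```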
+ +import unittest +import paddle +from paddle import fluid +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker + + +class TestFleetMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" + + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, + size=64, + act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + return avg_cost, strategy + + def optimizer(self, + loss, + strategy, + train_prog, + startup_prog, + name='momentum'): + with fluid.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + if name == 'momentum': + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + elif name == 'adam': + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(loss) + + def set_strategy(self, strategy, name): + if name == 'amp': + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + elif name == 'dgc': + strategy.dgc = True + strategy.dgc_configs = { + "rampup_begin_step": 128, + "rampup_step": 100, + "sparsity": [0.996, 0.999] + } + elif name == 'recompute': + strategy.recompute = True + strategy.recompute_configs = { + "checkpoints": ["fc_0.tmp_2", "fc_1.tmp_2"] + } + elif name == 'lars': + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.001, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], + } + elif name == 'lamb': + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } + elif name == 'localsgd': + strategy.localsgd = True + strategy.localsgd_configs = { + 'k_steps': 1, + 'begin_step': 1, + } + elif name == 'adaptive_localsgd': + strategy.adaptive_localsgd = True + strategy.adaptive_localsgd_configs = { + 'init_k_steps': 1, + 'begin_step': 1, + } + else: + raise NotImplementedError() diff --git a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py index 49b93e0dfaaac..d615f7cb7044e 100644 --- a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py @@ -16,12 +16,14 @@ import unittest +import paddle import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as regularizer import paddle.fluid.clip as clip import paddle.compat as cpt from paddle.fluid.backward import append_backward 
+paddle.enable_static() class TestDGCMomentumOptimizer(unittest.TestCase): @@ -86,13 +88,17 @@ def check_dgc_momentum_optimizer(self, block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) # params_grads = append_backward(mean_out) - params_grads = dgc_momentum_optimizer.backward(mean_out) + params_grads = dgc_momentum_optimizer.backward( + mean_out, startup_program=init_program) + + with framework.program_guard(program, init_program): + opts = dgc_momentum_optimizer.apply_gradients(params_grads) + accumulator_count = 1 if name == "momentum" else 2 self.assertEqual(len(params_grads), 1) self.assertEqual( len(dgc_momentum_optimizer.get_accumulators()), accumulator_count) - with framework.program_guard(program, init_program): - opts = dgc_momentum_optimizer.apply_gradients(params_grads) + self.assertEqual(len(opts), 2) sgd_op = opts[-1] self.assertEqual([op.type for op in opts], ["scale", name]) @@ -108,8 +114,11 @@ def check_dgc_momentum_optimizer(self, self.assertTrue(mul_x.name in velocity_acc) # Check init_program + # dgc not apply include: lr, dgc(count, nranks, begin step), (u,) + # dgc apply include: lr, dgc(count, nranks, begin_step), (u,v,k,encode,gather) + init_ops_count = 5 if name == "momentum" else 9 init_ops = init_program.global_block().ops - self.assertEqual(len(init_ops), 1) + self.assertEqual(len(init_ops), init_ops_count) self.assertEqual(init_ops[0].type, "fill_constant") self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 362428631e68c..6bc1a310d0aea 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -12,57 +12,97 @@ # See the License for the specific language governing permissions and # limitations under the License. 
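For context before the reworked AMP tests: a hedged usage sketch, not part of the patch, of how the now-composable strategies are switched on from user code. It mirrors the configs used by ``set_strategy`` in the new test base class; the checkpoint names are placeholders, and the usual ``PADDLE_TRAINER_*`` environment is assumed for ``fleet.init``.

```python
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)          # assumes trainer env vars are set

strategy = fleet.DistributedStrategy()
strategy.amp = True                     # AMPOptimizer
strategy.recompute = True               # RecomputeOptimizer, now allowed together with amp
strategy.recompute_configs = {"checkpoints": ["fc_0.tmp_2", "fc_1.tmp_2"]}

optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
# optimizer.minimize(avg_cost) now emits both the cast/check_finite ops (AMP)
# and the recompute subprograms, which is what the tests below assert.
```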
-import paddle.distributed.fleet as fleet -import paddle.distributed.fleet.base.role_maker as role_maker import unittest import paddle +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle.distributed.fleet.meta_optimizers import AMPOptimizer import os +from fleet_meta_optimizer_base import TestFleetMetaOptimizer paddle.enable_static() -class TestFleetAMPOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "0" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" +class TestFleetAMPOptimizer(TestFleetMetaOptimizer): + def test_amp_optimizer_backward(self): + """ test amp optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = AMPOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertNotIn('check_finite_and_unscale', ops) + + def test_amp_optimizer_backward_gradients(self): + """ test amp optimizer backward + gradients""" + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = AMPOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + with fluid.program_guard(train_prog, startup_prog): + opt.apply_gradients(params_grads) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + def test_amp_optimizer_backward_optimize(self): + """ test amp optimizer backward + optimizer """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = AMPOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + opt.apply_optimize(avg_cost, startup_prog, params_grads) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) def test_amp_optimizer(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.amp = True - strategy.amp_configs = { - "init_loss_scaling": 32768, - "decr_every_n_nan_or_inf": 2, - "incr_every_n_steps": 1000, - "incr_ratio": 2.0, - "use_dynamic_loss_scaling": True, - "decr_ratio": 0.5, - "custom_white_list": ['softmax'], - "custom_black_list": ['tanh'], - } - - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + """ test amp """ + train_prog, startup_prog = fluid.Program(), 
fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + def test_amp_recompute_optimizer(self): + """ test amp + recompute """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) strategy = fleet._final_strategy() ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] self.assertIn('cast', ops) self.assertIn('check_finite_and_unscale', ops) + # recompute + self.assertIn('subprog', ''.join(outs)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py index 55d4ff7726aac..0faafd76a799d 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py @@ -17,65 +17,82 @@ from paddle import fluid import os import paddle.distributed.fleet as fleet +from fleet_meta_optimizer_base import TestFleetMetaOptimizer +from paddle.distributed.fleet.meta_optimizers import DGCOptimizer import paddle.distributed.fleet.base.role_maker as role_maker +paddle.enable_static() -class TestFleetDGCOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "1" - os.environ[ - "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" - - def net(self, main_prog, startup_prog): - with fluid.program_guard(main_prog, startup_prog): - with fluid.unique_name.guard(): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data( - name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, - size=64, - act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], - size=2, - act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.dgc = True - strategy.dgc_configs = { - "rampup_begin_step": 128, - "rampup_step": 100, - "sparsity": [0.996, 0.999] - } - return avg_cost, strategy + +class TestFleetDGCOptimizer(TestFleetMetaOptimizer): + def test_dgc_optimizer_backward(self): + """ test dgc optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'dgc') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + dgc_opt = DGCOptimizer(opt) + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + dgc_opt._set_basic_info(avg_cost, role, opt, strategy) + params_grads = dgc_opt.backward(avg_cost, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + self.assertNotIn('dgc', ops) + + def test_dgc_optimizer_gradients(self): + """ test dgc optimizer backward + gradients """ + train_prog, startup_prog = 
fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'dgc') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + dgc_opt = DGCOptimizer(opt) + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + dgc_opt._set_basic_info(avg_cost, role, opt, strategy) + params_grads = dgc_opt.backward(avg_cost, startup_prog) + with fluid.program_guard(train_prog, startup_prog): + dgc_opt.apply_gradients(params_grads) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('dgc', ops) + self.assertIn('dgc_momentum', ops) + + def test_dgc_optimizer_optimize(self): + """ test dgc optimizer backward + optimize """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'dgc') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + dgc_opt = DGCOptimizer(opt) + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + dgc_opt._set_basic_info(avg_cost, role, opt, strategy) + params_grads = dgc_opt.backward(avg_cost, startup_prog) + dgc_opt.apply_optimize(avg_cost, startup_prog, params_grads) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('dgc', ops) + self.assertIn('dgc_momentum', ops) def test_dgc_optimizer(self): - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) ops = [op.type for op in avg_cost.block.ops] self.assertIn('dgc', ops) self.assertIn('dgc_momentum', ops) def test_dgc_not_apply_with_adam(self): - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam') ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('dgc', ops) @@ -85,18 +102,32 @@ def test_dgc_not_apply_with_one_worker(self): os.environ["PADDLE_TRAINER_ID"] = "0" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('dgc', ops) self.assertNotIn('dgc_momentum', ops) + def test_dgc_recompute_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'dgc') + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, 
startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('dgc', ops) + self.assertIn('dgc_momentum', ops) + + # recompute + self.assertIn('subprog', ''.join(outs)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index f5347b0c665e2..bafb2419123b0 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -16,71 +16,87 @@ import paddle import os +import paddle +import paddle.fluid as fluid import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker +from fleet_meta_optimizer_base import TestFleetMetaOptimizer +paddle.enable_static() -class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "1" - os.environ[ - "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" +class TestFleetLocalSGDMetaOptimizer(TestFleetMetaOptimizer): def test_localsgd_optimizer(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.localsgd = True - strategy.auto = True - config = strategy.localsgd_configs - config['k_steps'] = 1 - config['begin_step'] = 1 - strategy.localsgd_configs = config - - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - -class TestFleetAdaptiveLocalSGDMetaOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "1" - os.environ[ - "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" - + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'localsgd') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + ''.join(op.output('Out')) for op in avg_cost.block.ops + if op.type == 'conditional_block' + ] + + self.assertIn('conditional_block', ops) + self.assertIn('@SNAPSHOT', ''.join(outs)) + + def test_localsgd_amp_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'localsgd') + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + ''.join(op.output('Out')) for op in avg_cost.block.ops + if op.type == 'conditional_block' + ] + + self.assertIn('conditional_block', ops) + self.assertIn('@SNAPSHOT', ''.join(outs)) + + # amp + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + +class TestFleetAdaptiveLocalSGDMetaOptimizer(TestFleetMetaOptimizer): def 
test_adaptive_localsgd_optimizer(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.adaptive_localsgd = True - config = strategy.adaptive_localsgd_configs - config['init_k_steps'] = 1 - config['begin_step'] = 1 - strategy.adaptive_localsgd_configs = config - - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'adaptive_localsgd') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + ''.join(op.output('Out')) for op in avg_cost.block.ops + if op.type == 'conditional_block' + ] + + self.assertIn('conditional_block', ops) + self.assertIn('@SNAPSHOT', ''.join(outs)) + + def test_localsgd_amp_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'adaptive_localsgd') + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + ''.join(op.output('Out')) for op in avg_cost.block.ops + if op.type == 'conditional_block' + ] + + self.assertIn('conditional_block', ops) + self.assertIn('@SNAPSHOT', ''.join(outs)) + + # amp + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py index a42010a4eaa50..42b60cd3fad5a 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py @@ -14,40 +14,144 @@ import unittest import paddle +import paddle.fluid as fluid import os +from fleet_meta_optimizer_base import TestFleetMetaOptimizer +from paddle.distributed.fleet.meta_optimizers import RecomputeOptimizer +paddle.enable_static() -class TestFleetRecomputeMetaOptimizer(unittest.TestCase): - def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + +class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer): + def test_recompute_optimizer_backward(self): + """ test recompute optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + 
+ outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_optimizer_backward_gradients(self): + """ test recompute optimizer backward + gradients """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + with fluid.program_guard(train_prog, startup_prog): + opt.apply_gradients(params_grads) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_optimizer_backward_optimize(self): + """ test recompute optimizer backward + optimize """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + opt.apply_optimize(avg_cost, startup_prog, params_grads) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_optimizer_backward(self): + """ test recompute optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_optimizer_backward(self): + """ test recompute optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) def test_recompute_optimizer(self): - import paddle.distributed.fleet as fleet - import paddle.distributed.fleet.base.role_maker as role_maker - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.recompute = True - strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]} 
- - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_lars_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lars') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('subprog', ''.join(outs)) + self.assertIn('lars_momentum', ops) + + def test_recompute_lamb_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lamb') + self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam') + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('subprog', ''.join(outs)) + self.assertIn('lamb', ops) if __name__ == "__main__": From 840d54de9b1eac48fa5a871eecb703f0b598e558 Mon Sep 17 00:00:00 2001 From: mapingshuo Date: Mon, 12 Oct 2020 10:29:30 +0800 Subject: [PATCH 67/91] add XPU support for shape op and reshape op (#27804) --- paddle/fluid/operators/reshape_op.cc | 37 +++- paddle/fluid/operators/shape_op_xpu.cc | 21 ++ .../unittests/xpu/test_reshape2_op_xpu.py | 207 ++++++++++++++++++ .../tests/unittests/xpu/test_shape_op_xpu.py | 94 ++++++++ 4 files changed, 354 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/operators/shape_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index e03824ca8c3f4..05bb37ee421ff 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -49,7 +49,8 @@ inline std::vector get_new_shape( "the element's shape must be [1]. 
But received the element's shape " "is [%s]", tensor->dims())); - if (platform::is_gpu_place(tensor->place())) { + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place())) { framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); @@ -362,7 +363,8 @@ class ReshapeKernel { if (shape_tensor) { auto *shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; - if (platform::is_gpu_place(shape_tensor->place())) { + if (platform::is_gpu_place(shape_tensor->place()) || + platform::is_xpu_place(shape_tensor->place())) { TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); @@ -375,9 +377,22 @@ class ReshapeKernel { out->Resize(out_dims); out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopy( - *in, ctx.GetPlace(), - ctx.template device_context(), out); + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(ctx.GetPlace())) { + auto &dev_ctx = + ctx.template device_context(); + xpu::memcpy_device( + dev_ctx.x_context(), out->data(), in->data(), + in->numel() * paddle::framework::SizeOfType(in->type())); + } else { +#endif + framework::TensorCopy( + *in, ctx.GetPlace(), + ctx.template device_context(), out); +#ifdef PADDLE_WITH_XPU + } +#endif out->Resize(out_dims); } }; @@ -644,3 +659,15 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, plat::float16, ops::ReshapeDoubleGradKernel); #endif + +#ifdef PADDLE_WITH_XPU +REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel, plat::float16, + ops::ReshapeKernel); +REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel, plat::float16, + ops::ReshapeGradKernel); +#endif diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc new file mode 100644 index 0000000000000..2e9092a643253 --- /dev/null +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/shape_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, + ops::ShapeKernel, ops::ShapeKernel, + ops::ShapeKernel); + +#endif diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py new file mode 100644 index 0000000000000..1a21b0f1972b7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py @@ -0,0 +1,207 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + + +# situation 1: have shape( list, no tensor), no actual shape(Tensor) +class TestReshapeOp(OpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.attrs = {"shape": self.new_shape, "use_xpu": True} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def init_data(self): + self.ori_shape = (2, 60) + self.new_shape = (12, 10) + self.infered_shape = (12, 10) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + +class TestReshapeOpDimInfer1(TestReshapeOp): + def init_data(self): + self.ori_shape = (5, 25) + self.new_shape = (5, -1, 5) + self.infered_shape = (5, -1, 5) + + +class TestReshapeOpDimInfer2(TestReshapeOp): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + + +# situation 2: have shape(list, no tensor), have actual shape(Tensor) +class TestReshapeOpWithInputShape(OpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + + self.inputs = { + "X": np.random.random(self.ori_shape).astype("float32"), + "Shape": np.array( + self.actual_shape, dtype="int32") + } + self.attrs = {"shape": self.new_shape, "use_xpu": True} + self.outputs = { + "Out": self.inputs["X"].reshape(self.actual_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def init_data(self): + self.ori_shape = (6, 20) + self.new_shape = (0, -1, 20) + self.actual_shape = (2, 3, 20) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + +# Situation 3: have shape(list, have tensor), no actual shape(Tensor) +class TestReshapeOp_attr_ShapeTensor(OpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + + shape_tensor = [] + for index, ele in enumerate(self.new_shape): + shape_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + "X": np.random.random(self.ori_shape).astype("float32"), + 'ShapeTensor': shape_tensor + } + self.attrs = {'shape': self.shape, "use_xpu": True} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def init_data(self): + self.ori_shape = (4, 25) + self.new_shape = (10, 10) + 
self.infered_shape = (10, 10) + self.shape = (-1, -1) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + +class TestReshapeOpDimInfer1_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor): + def init_data(self): + self.ori_shape = (5, 20) + self.new_shape = (5, -1, 20) + self.infered_shape = (5, -1, 20) + self.shape = (5, -1, -1) + + +class TestReshapeOpDimInfer2_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + self.shape = (10, 0, 3, -1) + + +# Situation 4: have shape(Tensor), no actual shape(Tensor) +class TestReshapeOp_attr_OnlyShape(OpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + + self.inputs = { + "X": np.random.random(self.ori_shape).astype("float32"), + "Shape": np.array( + self.new_shape, dtype="int32") + } + self.attrs = {"use_xpu": True} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def init_data(self): + self.ori_shape = (4, 25) + self.new_shape = (10, 10) + self.infered_shape = (10, 10) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + +class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape): + def init_data(self): + self.ori_shape = (5, 20) + self.new_shape = (5, -1, 10) + self.infered_shape = (5, -1, 10) + self.shape = (5, -1, -1) + + +class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + self.shape = (10, 0, 3, -1) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py new file mode 100644 index 0000000000000..f194f3ca80cf0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +from paddle.fluid import core +from paddle.fluid.op import Operator + + +class TestShapeOp(OpTest): + def setUp(self): + self.op_type = "shape" + self.config() + self.shape = [2, 3] + input = np.zeros(self.shape) + self.inputs = {'Input': input} + self.outputs = {'Out': np.array(self.shape)} + + def config(self): + self.shape = [2, 3] + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +class case1(TestShapeOp): + def config(self): + self.shape = [2] + + +class case2(TestShapeOp): + def config(self): + self.shape = [1, 2, 3] + + +class TestShapeWithSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + if core.is_compiled_with_xpu(): + places.append(core.XPUPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 1, 5, 4, 19] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out_shape = scope.var("Out").get_tensor() + op = Operator("shape", Input="X", Out="Out") + + op.run(scope, place) + + out_shape = np.array(out_shape).tolist() + self.assertListEqual([5, 2], out_shape) + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +if __name__ == '__main__': + unittest.main() From 4676f03cbb77563dcdbd09fa8caf25d4b22d4464 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Mon, 12 Oct 2020 10:57:53 +0800 Subject: [PATCH 68/91] fix summary (#27820) --- python/paddle/hapi/model_summary.py | 2 +- python/paddle/tests/test_model.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index c46a53e910df0..30b22a2f32c34 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -254,7 +254,7 @@ def build_input(input_size, dtypes): dtype = dtypes[0] else: dtype = dtypes - return paddle.rand(list(input_size), dtype) + return paddle.cast(paddle.rand(list(input_size)), dtype) else: return [ build_input(i, dtype) for i, dtype in zip(input_size, dtypes) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 96c4483a35ba8..8cd5e172aa06a 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -501,6 +501,11 @@ def test_summary_nlp(self): rnn = paddle.nn.LSTM(16, 32, 2) paddle.summary(rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))]) + def test_summary_dtype(self): + input_shape = (3, 1) + net = paddle.nn.Embedding(10, 3, sparse=True) + paddle.summary(net, input_shape, dtypes='int64') + def test_summary_error(self): with self.assertRaises(TypeError): nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) From d4359b0f39af18fa86ac660a47125cbe63517408 Mon Sep 17 00:00:00 2001 From: Jack Zhou <136876878@qq.com> Date: Mon, 12 Oct 2020 11:04:15 +0800 Subject: [PATCH 69/91] add the kunlun kernel for the paddle 2.0 Add xpu kernel for KUNLUN core: * accuracy op * sign op * 
scale op * sum op Add default atol in xpu unittest. --- .../operators/metrics/accuracy_op_xpu.cc | 120 ++++++++++++++++++ paddle/fluid/operators/scale_op_xpu.cc | 63 +++++++++ paddle/fluid/operators/sign_op_xpu.cc | 44 +++++++ paddle/fluid/operators/sum_op_xpu.cc | 65 ++++++++++ .../unittests/xpu/test_accuracy_op_xpu.py | 63 +++++++++ .../tests/unittests/xpu/test_scale_op_xpu.py | 54 ++++++++ .../tests/unittests/xpu/test_sign_op_xpu.py | 54 ++++++++ .../tests/unittests/xpu/test_sum_op_xpu.py | 61 +++++++++ 8 files changed, 524 insertions(+) create mode 100644 paddle/fluid/operators/metrics/accuracy_op_xpu.cc create mode 100644 paddle/fluid/operators/scale_op_xpu.cc create mode 100644 paddle/fluid/operators/sign_op_xpu.cc create mode 100644 paddle/fluid/operators/sum_op_xpu.cc create mode 100755 python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc new file mode 100644 index 0000000000000..c0aa00e79341e --- /dev/null +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +template +class AccuracyXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + size_t num_samples = inference->dims()[0]; + size_t class_dim = inference->dims()[1]; + if (num_samples == 0) { + return; + } + size_t indices_int32_size = num_samples * class_dim * sizeof(int); + size_t indices_int64_size = num_samples * class_dim * sizeof(int64_t); + size_t label_int32_size = num_samples * sizeof(int); + size_t label_int64_size = num_samples * sizeof(int64_t); + auto& dev_ctx = ctx.template device_context(); + int* indices_int32_device = NULL; + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&indices_int32_device), + indices_int32_size), + XPU_SUCCESS, + platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU, Cannot allocate %s memory" + " on XPU. 
\n\nPlease check whether there is any other process " + "using XPU.\n", + string::HumanReadableSize(indices_int32_size))); + int* label_int32_device = NULL; + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&label_int32_device), + label_int32_size), + XPU_SUCCESS, + platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU, Cannot allocate %s memory" + " on XPU. \n\nPlease check whether there is any other process " + "using XPU.\n", + string::HumanReadableSize(label_int32_size))); + + int* indices_int32_host = + reinterpret_cast(std::malloc(indices_int32_size)); + int64_t* indices_int64_host = + reinterpret_cast(std::malloc(indices_int64_size)); + int* label_int32_host = + reinterpret_cast(std::malloc(label_int32_size)); + int64_t* label_int64_host = + reinterpret_cast(std::malloc(label_int64_size)); + dev_ctx.Wait(); + memory::Copy(platform::CPUPlace(), indices_int64_host, + BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + indices_data, indices_int64_size); + memory::Copy(platform::CPUPlace(), label_int64_host, + BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + label_data, label_int64_size); + for (int i = 0; i < num_samples; ++i) { + label_int32_host[i] = label_int64_host[i]; + for (int j = 0; j < class_dim; ++j) { + indices_int32_host[i * class_dim + j] = + indices_int64_host[i * class_dim + j]; + } + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + indices_int32_device, platform::CPUPlace(), indices_int32_host, + indices_int32_size); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + label_int32_device, platform::CPUPlace(), label_int32_host, + label_int32_size); + int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, + label_int32_device, num_samples, class_dim, + correct_data, total_data, accuracy_data); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + dev_ctx.Wait(); + xpu_free(indices_int32_device); + xpu_free(label_int32_device); + std::free(indices_int32_host); + std::free(indices_int64_host); + std::free(label_int32_host); + std::free(label_int64_host); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + accuracy, + ops::AccuracyXPUKernel); + +#endif diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc new file mode 100644 index 0000000000000..4002be8100152 --- /dev/null +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/scale_op.h" +#include +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { +template +class ScaleXPUKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in_var = ctx.InputVar("X"); + auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); + auto scale = static_cast(ctx.Attr("scale")); + auto bias = static_cast(ctx.Attr("bias")); + auto bias_after_scale = ctx.Attr("bias_after_scale"); + auto* out_var = ctx.OutputVar("Out"); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->set_rows(in_slr.rows()); + out_slr->set_height(in_slr.height()); + } + auto* out = + framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); + out->mutable_data(in->place()); + PADDLE_ENFORCE_EQ( + in->dims(), out->dims(), + platform::errors::InvalidArgument("In and out should have the same dim," + " expected %s, but got %s.", + in->dims().to_str().c_str(), + out->dims().to_str().c_str())); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::scale(dev_ctx.x_context(), in->numel(), scale, bias, + bias_after_scale, in->data(), out->data()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + scale, ops::ScaleXPUKernel); + +#endif diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc new file mode 100644 index 0000000000000..44fd555544e7f --- /dev/null +++ b/paddle/fluid/operators/sign_op_xpu.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/sign_op.h" +#include "paddle/fluid/platform/xpu_header.h" +namespace paddle { +namespace operators { + +template +class SignXPUKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* out = context.Output("Out"); + auto* in = context.Input("X"); + out->mutable_data(in->place()); + auto xpu_context = context.device_context().x_context(); + int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN, + in->numel(), in->data(), out->data()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + sign, ops::SignXPUKernel); + +#endif diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc new file mode 100644 index 0000000000000..14928061d23dd --- /dev/null +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/sum_op.h" +#include +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +template +class SumXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto in_vars = context.MultiInputVar("X"); + auto out_var = context.OutputVar("Out"); + auto *out = context.Output("Out"); + bool in_place = out_var == in_vars[0]; + int N = in_vars.size(); + PADDLE_ENFORCE_EQ( + out_var->IsType(), true, + platform::errors::InvalidArgument("XPU only surpport LodTensor")); + if (!in_place) { + out->mutable_data(context.GetPlace()); + } + auto &dev_ctx = context.template device_context(); + std::vector ptrs(N, nullptr); + int valid_count = 0; + for (int i = 0; i < N; ++i) { + PADDLE_ENFORCE_EQ( + in_vars[i]->IsType(), true, + platform::errors::InvalidArgument("XPU only surpport LodTensor")); + auto &in_t = in_vars[i]->Get(); + if (in_t.numel() == 0) { + continue; + } + ptrs[valid_count] = reinterpret_cast(in_t.data()); + valid_count++; + } + int r = xpu::sum_batch(dev_ctx.x_context(), ptrs.data(), out->data(), + valid_count, out->numel()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + sum, ops::SumXPUKernel); +#endif diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py new file mode 100755 index 0000000000000..7aaa78856811f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUAccuracyOp(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") + } + self.attrs = {'use_xpu': True} + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py new file mode 100644 index 0000000000000..1f74fa5e2d685 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py @@ -0,0 +1,54 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUScaleOp(OpTest): + def setUp(self): + self.op_type = "scale" + self.dtype = np.float32 + self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)} + self.attrs = {'scale': -2.3, 'use_xpu': True} + self.outputs = { + 'Out': self.inputs['X'] * self.dtype(self.attrs['scale']) + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py new file mode 100644 index 0000000000000..ab07221a07071 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py @@ -0,0 +1,54 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSignOp(OpTest): + def setUp(self): + self.op_type = "sign" + self.dtype = np.float32 + self.inputs = { + 'X': np.random.uniform(-10, 10, (10, 10)).astype(self.dtype) + } + self.outputs = {'Out': np.sign(self.inputs['X'])} + self.attrs = {'use_xpu': True} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py new file mode 100644 index 0000000000000..3bafbf649e6ce --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSumOp(OpTest): + def setUp(self): + self.op_type = "sum" + self.use_mkldnn = False + self.init_kernel_type() + x0 = np.random.random((3, 40)).astype(self.dtype) + x1 = np.random.random((3, 40)).astype(self.dtype) + x2 = np.random.random((3, 40)).astype(self.dtype) + self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + y = x0 + x1 + x2 + self.outputs = {'Out': y} + self.attrs = {'use_mkldnn': self.use_mkldnn, 'use_xpu': True} + + def init_kernel_type(self): + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['x0'], 'Out') + + +if __name__ == "__main__": + unittest.main() From 070ac9590cead7b5a32a7a11f8d6d1f3e665c120 Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Mon, 12 Oct 2020 11:13:30 +0800 Subject: [PATCH 70/91] Add double grad in Squeeze and Unsqueeze (#27810) * Add double grad in Squeeze and Unsqueeze * Add double grad in Squeeze and Unsqueeze --- paddle/fluid/operators/squeeze_op.cc | 39 +++++++++++++-- paddle/fluid/operators/unsqueeze_op.cc | 31 ++++++++++++ .../fluid/tests/unittests/test_nn_grad.py | 50 ++++++++++++++++++- .../fluid/tests/unittests/test_squeeze2_op.py | 2 + .../fluid/tests/unittests/test_squeeze_op.py | 1 + .../tests/unittests/test_unsqueeze2_op.py | 2 + .../tests/unittests/test_unsqueeze_op.py | 1 + 7 files changed, 121 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 93d8f42ce2175..479973a5daa5f 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -249,6 +249,19 @@ class Squeeze2GradOp : public framework::OperatorWithKernel { } }; +template +class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("squeeze"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetAttrMap(this->Attrs()); + } +}; + // FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze, // the XShape is used to carry the shape and lod of X which will be used in // squeeze_grad, in this way, the framework can reuse the memory of X @@ -279,8 +292,22 @@ class Squeeze2GradOpMaker : public framework::SingleGradOpMaker { } }; -DECLARE_INPLACE_OP_INFERER(SequeezeInplaceInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(SequeezeGradInplaceInferer, +template +class Squeeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("squeeze2"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetOutput("XShape", 
this->Input("XShape")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer, {framework::GradVarName("Out"), framework::GradVarName("X")}); DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X"); @@ -292,14 +319,18 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, ops::SqueezeGradOpMaker, ops::SqueezeGradOpMaker); REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, + ops::SqueezeDoubleGradOpMaker, + ops::SqueezeDoubleGradOpMaker, ops::SqueezeGradNoNeedBufferVarsInferer); REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker, ops::Squeeze2GradOpMaker, ops::Squeeze2GradOpMaker, - ops::SequeezeInplaceInferer); + ops::SqueezeInplaceInferer); REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp, - ops::SequeezeGradInplaceInferer); + ops::Squeeze2DoubleGradOpMaker, + ops::Squeeze2DoubleGradOpMaker, + ops::SqueezeGradInplaceInferer); REGISTER_OP_CPU_KERNEL( squeeze, ops::SqueezeKernel, diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index ee1361e361830..0e58e1391cfab 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -228,6 +228,19 @@ class UnsqueezeGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("unsqueeze"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetAttrMap(this->Attrs()); + } +}; + // FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on // unsqueeze, the XShape is used to carry the shape and lod of X which // will be used in unsqueeze_grad, in this way, the framework can reuse @@ -304,6 +317,20 @@ class Unsqueeze2GradOp : public framework::OperatorWithKernel { } }; +template +class Unsqueeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("unsqueeze2"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetOutput("XShape", this->Input("XShape")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer, {framework::GradVarName("Out"), @@ -317,6 +344,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, ops::UnsqueezeGradOpMaker, ops::UnsqueezeGradOpMaker); REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, + ops::UnsqueezeDoubleGradOpMaker, + ops::UnsqueezeDoubleGradOpMaker, ops::UnsqueezeGradOpNoNeedBufferVarInferer); REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, @@ -324,6 +353,8 @@ REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, ops::Unsqueeze2GradOpMaker, ops::UnsqueezeInplaceInferer); REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, + ops::Unsqueeze2DoubleGradOpMaker, + ops::Unsqueeze2DoubleGradOpMaker, ops::UnsqueezeGradInplaceInferer); REGISTER_OP_CPU_KERNEL( diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py 
b/python/paddle/fluid/tests/unittests/test_nn_grad.py index bf1955c5711f5..1675f935f7d6a 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -22,8 +22,8 @@ import paddle.fluid.layers as layers import paddle.fluid.core as core import gradient_checker - from decorator_helper import prog_scope +paddle.enable_static() class TestMulGradCheck(unittest.TestCase): @@ -281,5 +281,53 @@ def test_grad(self): self.func(p) +class TestSqueezeDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [1, 3, 1, 40] + axes = [0, 2] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = paddle.squeeze(x, axes) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestUnsqueezeDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [3, 40] + axes = [1, 2] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = paddle.unsqueeze(x, axes) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py index a1879c724597e..377f8597cca3b 100644 --- a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py @@ -18,6 +18,8 @@ import numpy as np from op_test import OpTest +import paddle +paddle.enable_static() # Correct: General. diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py index 5ab13cec540aa..830678fe8f6af 100644 --- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py @@ -20,6 +20,7 @@ from paddle.fluid import compiler, Program, program_guard import paddle from op_test import OpTest +paddle.enable_static() # Correct: General. diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py index 340d22acbfb51..eaecf91215cc6 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py @@ -18,6 +18,8 @@ import numpy as np import paddle.fluid as fluid from op_test import OpTest +import paddle +paddle.enable_static() # Correct: General. diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index 1975e4306026e..f8d27dd42f43b 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -19,6 +19,7 @@ import paddle import paddle.fluid as fluid from op_test import OpTest +paddle.enable_static() # Correct: General. 
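The double-grad makers registered above for squeeze/unsqueeze can also be exercised through second-order autograd in dygraph mode. A minimal sketch, assuming dygraph double backward picks up the newly registered SqueezeDoubleGradOpMaker (the shapes and variable names below are illustrative and are not taken from the tests in this patch):

import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(
    np.random.rand(5, 1, 10).astype('float32'), stop_gradient=False)
y = paddle.squeeze(x, axis=[1])          # forward squeeze
loss = paddle.sum(y * y)
# first-order gradient; keep the graph so it can be differentiated again
dx = paddle.grad([loss], [x], create_graph=True)[0]
# the second-order gradient differentiates through squeeze_grad, which is
# what the double-grad maker added in this patch enables
ddx = paddle.grad([paddle.sum(dx)], [x])[0]
print(ddx.shape)   # same shape as x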
From f3e2580cf026a767e078c32ab9941ff8906bb1bd Mon Sep 17 00:00:00 2001 From: hong19860320 <9973393+hong19860320@users.noreply.github.com> Date: Mon, 12 Oct 2020 14:10:46 +0800 Subject: [PATCH 71/91] Fix the param of swish (#27824) --- python/paddle/fluid/tests/unittests/test_activation_op.py | 2 +- python/paddle/nn/functional/activation.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index ac3d0a3a78562..4fed0c8552b44 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2175,7 +2175,7 @@ def setUp(self): x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) out = ref_swish(x) self.inputs = {'X': x} - self.attrs = {'slope': 1.0} + self.attrs = {'beta': 1.0} self.outputs = {'Out': out} def test_check_grad(self): diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 53fa9814e6ef0..6e09e25b1ab05 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -1183,7 +1183,7 @@ def swish(x, name=None): """ if in_dygraph_mode(): - return core.ops.swish(x, 'slop', 1.0) + return core.ops.swish(x, 'beta', 1.0) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') helper = LayerHelper('swish', **locals()) @@ -1192,7 +1192,7 @@ def swish(x, name=None): type='swish', inputs={'X': x}, outputs={'Out': out}, - attrs={'slope': 1.0}) + attrs={'beta': 1.0}) return out From b84d4ae31c16d6e3d4f9bfd9253339c9fb50a3ca Mon Sep 17 00:00:00 2001 From: LiuChiachi <709153940@qq.com> Date: Mon, 12 Oct 2020 14:13:44 +0800 Subject: [PATCH 72/91] Fix bug of Model.save (#27815) * fix model bugs, inputs can be InputSpec instance * correct error message --- python/paddle/hapi/model.py | 25 ++++++++++++++----------- python/paddle/tests/test_model.py | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 459d6cd3284e9..21e3054dde7d7 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -201,8 +201,11 @@ def _init_context(): def _update_input_shapes(inputs): + "Get input shape list by given inputs in Model initialization." 
shapes = None - if isinstance(inputs, list): + if isinstance(inputs, Input): + shapes = [list(inputs.shape)] + elif isinstance(inputs, list): shapes = [list(input.shape) for input in inputs] elif isinstance(inputs, dict): shapes = [list(inputs[name].shape) for name in inputs] @@ -917,9 +920,7 @@ def train_batch(self, inputs, labels=None): """ loss = self._adapter.train_batch(inputs, labels) if fluid.in_dygraph_mode() and self._input_shapes is None: - self._input_shapes = self._adapter._input_shapes - self._is_shape_inferred = True - self._inputs = self._verify_spec(None, self._input_shapes, True) + self._update_inputs() return loss def eval_batch(self, inputs, labels=None): @@ -967,9 +968,7 @@ def eval_batch(self, inputs, labels=None): """ loss = self._adapter.eval_batch(inputs, labels) if fluid.in_dygraph_mode() and self._input_shapes is None: - self._input_shapes = self._adapter._input_shapes - self._is_shape_inferred = True - self._inputs = self._verify_spec(None, self._input_shapes, True) + self._update_inputs() return loss def test_batch(self, inputs): @@ -1012,9 +1011,7 @@ def test_batch(self, inputs): """ loss = self._adapter.test_batch(inputs) if fluid.in_dygraph_mode() and self._input_shapes is None: - self._input_shapes = self._adapter._input_shapes - self._is_shape_inferred = True - self._inputs = self._verify_spec(None, self._input_shapes, True) + self._update_inputs() return loss def save(self, path, training=True): @@ -1707,7 +1704,7 @@ def get_inout_spec(all_vars, return_name=False): layer = self.network if self._input_shapes is None: # No provided or inferred raise RuntimeError( - "Saving inference model needs 'inputs' or running before saving. Please specify 'inputs' in Model initialization or input training zqqdata and perform a training for shape derivation." + "Saving inference model needs 'inputs' or running before saving. Please specify 'inputs' in Model initialization or input training data and perform a training for shape derivation." ) if self._is_shape_inferred: warnings.warn( @@ -1953,3 +1950,9 @@ def _len_data_loader(self, data_loader): except Exception: steps = None return steps + + def _update_inputs(self): + "Update self._inputs according to given inputs." 
+ self._input_shapes = self._adapter._input_shapes + self._is_shape_inferred = True + self._inputs = self._verify_spec(None, self._input_shapes, True) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 8cd5e172aa06a..56105b6d7f15a 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -556,9 +556,10 @@ def test_export_deploy_model(self): shutil.rmtree(save_dir) paddle.enable_static() - def test_dygraph_export_deploy_model_without_inputs(self): + def test_dygraph_export_deploy_model_about_inputs(self): mnist_data = MnistDataset(mode='train') paddle.disable_static() + # without inputs for initial in ["fit", "train_batch", "eval_batch", "test_batch"]: save_dir = tempfile.mkdtemp() if not os.path.exists(save_dir): @@ -584,6 +585,18 @@ def test_dygraph_export_deploy_model_without_inputs(self): model.save(save_dir, training=False) shutil.rmtree(save_dir) + # with inputs, and the type of inputs is InputSpec + save_dir = tempfile.mkdtemp() + if not os.path.exists(save_dir): + os.makedirs(save_dir) + net = LeNet() + inputs = InputSpec([None, 1, 28, 28], 'float32', 'x') + model = Model(net, inputs) + optim = fluid.optimizer.Adam( + learning_rate=0.001, parameter_list=model.parameters()) + model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) + model.save(save_dir, training=False) + shutil.rmtree(save_dir) class TestRaiseError(unittest.TestCase): From 659d04df2cf544f050ba4dac61e7c84fefe8087d Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Mon, 12 Oct 2020 15:09:24 +0800 Subject: [PATCH 73/91] hsigmoid -> hsigmoid_loss/HSigmoidLoss; refine docs (#27745) --- paddle/fluid/pybind/op_function_generator.cc | 2 + .../unittests/test_directory_migration.py | 9 +- .../fluid/tests/unittests/test_hsigmoid.py | 219 ------------------ .../fluid/tests/unittests/test_hsigmoid_op.py | 202 +++++++++++++++- python/paddle/nn/__init__.py | 2 +- python/paddle/nn/functional/__init__.py | 2 +- python/paddle/nn/functional/activation.py | 123 ---------- python/paddle/nn/functional/loss.py | 133 +++++++++++ python/paddle/nn/layer/__init__.py | 1 - python/paddle/nn/layer/activation.py | 137 ----------- python/paddle/nn/layer/loss.py | 123 ++++++++++ python/paddle/static/nn/__init__.py | 2 - 12 files changed, 464 insertions(+), 491 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_hsigmoid.py diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 9bc603c0ecc2c..ee6e541c9e6c6 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -49,6 +49,8 @@ std::map> op_ins_map = { {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, + {"hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, }; // NOTE(zhiqiu): Like op_ins_map. 
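
The op_ins_map entry added just above exposes the dispensable PathTable, PathCode and Bias inputs, which lets the new dygraph path in python/paddle/nn/functional/loss.py pass them positionally to core.ops.hierarchical_sigmoid. A minimal before/after sketch of the renamed Python API, assuming only the signatures introduced later in this patch (F.hsigmoid_loss(input, label, num_classes, weight, bias=None, ...) and paddle.nn.HSigmoidLoss(feature_size, num_classes, ...)); the sizes, the 1-D bias and the random data are illustrative:

    import paddle
    import paddle.nn.functional as F

    paddle.set_device('cpu')  # the examples and tests in this patch pin hsigmoid to CPU

    batch_size, feature_size, num_classes = 4, 6, 8   # illustrative sizes
    x = paddle.uniform([batch_size, feature_size])
    label = paddle.randint(0, num_classes, [batch_size, 1], dtype='int64')
    weight = paddle.uniform([num_classes - 1, feature_size])
    bias = paddle.uniform([num_classes - 1])

    # old, removed form:  out = F.hsigmoid(x, label, weight, bias, num_classes)
    # new form: num_classes now comes before weight, and bias is optional
    out = F.hsigmoid_loss(x, label, num_classes, weight, bias=bias)  # shape [batch_size, 1]

    # layer form: paddle.nn.HSigmoid(feature_size, num_classes) becomes HSigmoidLoss
    m = paddle.nn.HSigmoidLoss(feature_size, num_classes)
    out = m(x, label)
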
diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 7d48f2c419085..fd014f3b4ecaf 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -64,11 +64,10 @@ def test_new_directory(self): 'paddle.static.nn.create_parameter', 'paddle.static.nn.crf_decoding', 'paddle.static.nn.data_norm', 'paddle.static.nn.deformable_conv', 'paddle.static.nn.group_norm', - 'paddle.static.nn.hsigmoid', 'paddle.static.nn.instance_norm', - 'paddle.static.nn.layer_norm', 'paddle.static.nn.multi_box_head', - 'paddle.static.nn.nce', 'paddle.static.nn.prelu', - 'paddle.static.nn.row_conv', 'paddle.static.nn.spectral_norm', - 'paddle.static.nn.embedding' + 'paddle.static.nn.instance_norm', 'paddle.static.nn.layer_norm', + 'paddle.static.nn.multi_box_head', 'paddle.static.nn.nce', + 'paddle.static.nn.prelu', 'paddle.static.nn.row_conv', + 'paddle.static.nn.spectral_norm', 'paddle.static.nn.embedding' ] import_file = 'run_import_modules.py' diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid.py b/python/paddle/fluid/tests/unittests/test_hsigmoid.py deleted file mode 100644 index 80937640c2d2f..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle import fluid, nn -import paddle.fluid.dygraph as dg -import paddle.nn.functional as F -import paddle.fluid.initializer as I -import numpy as np -import unittest - - -class HSigmoidTestCase(unittest.TestCase): - def __init__(self, - methodName="runTest", - batch_size=4, - feature_size=6, - num_classes=8, - labels=None, - path_code=None, - path_table=None, - is_sparse=False, - dtype="float32"): - super(HSigmoidTestCase, self).__init__() - self.batch_size = batch_size - self.feature_size = feature_size - self.num_classes = num_classes - self.dtype = dtype - self.is_sparse = is_sparse - - self.labels = labels - self.path_code = path_code - self.path_table = path_table - self.is_custom = path_code is not None and path_table is not None - - def setUp(self): - input_shape = (self.batch_size, self.feature_size) - self.input = np.random.uniform( - -1, 1, size=input_shape).astype(self.dtype) - if self.labels is None: - self.labels = np.random.randint( - 0, self.num_classes, size=(self.batch_size, 1)).astype(np.int64) - C = self.num_classes if self.is_custom else self.num_classes - 1 - self.weight_shape = (C, self.feature_size) - self.weight = np.random.randn(*self.weight_shape).astype(self.dtype) - self.bias_shape = (C, 1) - self.bias = np.random.randn(*self.bias_shape).astype(self.dtype) - - def fluid_layer(self, place): - main = fluid.Program() - start = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, start): - x = fluid.data( - "input", [-1, self.feature_size], dtype=self.dtype) - label = fluid.data("labels", [-1, 1], dtype="int64") - if self.is_custom: - path_table = fluid.data( - "path_table", [-1, -1], dtype="int64") - path_code = fluid.data("path_code", [-1, -1], dtype="int64") - else: - path_table = path_code = None - y = fluid.layers.hsigmoid( - x, - label, - self.num_classes, - param_attr=I.NumpyArrayInitializer(self.weight), - bias_attr=I.NumpyArrayInitializer(self.bias), - path_table=path_table, - path_code=path_code, - is_custom=self.is_custom, - is_sparse=self.is_sparse, ) - exe = fluid.Executor(place) - exe.run(start) - feed_dict = {"input": self.input, "labels": self.labels} - if self.is_custom: - feed_dict["path_code"] = self.path_code - feed_dict["path_table"] = self.path_table - y_np, = exe.run(main, feed=feed_dict, fetch_list=[y]) - return y_np - - def functional(self, place): - main = fluid.Program() - start = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, start): - x = fluid.data( - "input", [-1, self.feature_size], dtype=self.dtype) - label = fluid.data("labels", [-1, 1], dtype="int64") - if self.is_custom: - path_table = fluid.data( - "path_table", [-1, -1], dtype="int64") - path_code = fluid.data("path_code", [-1, -1], dtype="int64") - else: - path_table = path_code = None - w = fluid.data("weight", self.weight_shape, dtype=self.dtype) - b = fluid.data("bias", self.bias_shape, dtype=self.dtype) - y = F.hsigmoid( - x, - label, - w, - b, - self.num_classes, - is_sparse=self.is_sparse, - path_table=path_table, - path_code=path_code) - - exe = fluid.Executor(place) - exe.run(start) - feed_dict = { - "input": self.input, - "labels": self.labels, - "weight": self.weight, - "bias": self.bias - } - if self.is_custom: - feed_dict["path_code"] = self.path_code - feed_dict["path_table"] = self.path_table - y_np, = exe.run(main, feed=feed_dict, fetch_list=[y]) - return y_np - - def nn_layer(self, place): - with dg.guard(place): - x_var = dg.to_variable(self.input) - label_var = 
dg.to_variable(self.labels) - if self.is_custom: - path_code_var = dg.to_variable(self.path_code) - path_table_var = dg.to_variable(self.path_table) - else: - path_code_var = path_table_var = None - hierarchical_softmax = nn.HSigmoid( - self.feature_size, - self.num_classes, - is_custom=self.is_custom, - is_sparse=self.is_sparse, - param_attr=I.NumpyArrayInitializer(self.weight), - bias_attr=I.NumpyArrayInitializer(self.bias), - dtype=self.dtype) - y_var = hierarchical_softmax( - x_var, - label_var, - path_table=path_table_var, - path_code=path_code_var) - y_np = y_var.numpy() - return y_np - - def _test_equivalence(self, place): - result1 = self.fluid_layer(place) - result2 = self.functional(place) - result3 = self.nn_layer(place) - np.testing.assert_array_almost_equal(result1, result2) - np.testing.assert_array_almost_equal(result2, result3) - - def runTest(self): - place = fluid.CPUPlace() - self._test_equivalence(place) - - -class HSigmoidTestErrorCase(HSigmoidTestCase): - def runTest(self): - place = fluid.CPUPlace() - with dg.guard(place): - with self.assertRaises(ValueError): - self.nn_layer() - - def nn_layer(self): - x_var = dg.to_variable(self.input) - label_var = dg.to_variable(self.labels) - if self.is_custom: - path_code_var = dg.to_variable(self.path_code) - path_table_var = dg.to_variable(self.path_table) - else: - path_code_var = path_table_var = None - hierarchical_softmax = nn.HSigmoid( - self.feature_size, - self.num_classes, - is_custom=self.is_custom, - param_attr=I.NumpyArrayInitializer(self.weight), - bias_attr=I.NumpyArrayInitializer(self.bias), - dtype=self.dtype) - y_var = hierarchical_softmax( - x_var, - label_var, - path_table=path_table_var, - path_code=path_code_var) - y_np = y_var.numpy() - return y_np - - -def load_tests(loader, standard_tests, pattern): - suite = unittest.TestSuite() - suite.addTest(HSigmoidTestCase(methodName="runTest")) - suite.addTest( - HSigmoidTestCase( - methodName="runTest", - batch_size=4, - feature_size=6, - num_classes=8, - labels=np.array([0, 1, 4, 5]).astype(np.int64), - path_table=np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), ( - 0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64), - path_code=np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( - 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64))) - suite.addTest(HSigmoidTestErrorCase(methodName="runTest", num_classes=1)) - return suite - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 5c9867e681524..3f8eed08adf68 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -19,10 +19,13 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid +import paddle.nn.functional as F from paddle.fluid import Program, program_guard +import paddle.fluid.initializer as I import math from op_test import OpTest, skip_check_grad_ci +paddle.enable_static() np.random.seed(100) @@ -56,7 +59,6 @@ def cal_index(self, bit): def get_length(self): length = 0 for ele in self.ptable_[self.index_]: # find the first -1 to stop trace - if ele >= 0: length = length + 1 else: @@ -388,8 +390,192 @@ def test_check_grad(self): self.check_grad(['X', 'W'], ['Out'], no_grad_set=set('Label')) -class TestHSigmoidOpError(unittest.TestCase): +class TestHSigmoidLossAPI(unittest.TestCase): + # test paddle.nn.functional.hsigmoid_loss, paddle.nn.HSigmoidLoss + def setUp(self): + 
self.dtype = 'float32' + self.batch_size = 4 + self.feature_size = 6 + self.num_classes = 8 + self.is_custom = False + self.place = paddle.CPUPlace() + + paddle.set_default_dtype(self.dtype) + + self.x_np = np.random.uniform( + -1, 1, [self.batch_size, self.feature_size]).astype(self.dtype) + self.labels_np = np.random.randint( + self.num_classes, size=(self.batch_size, 1), dtype='int64') + self.weight_np = np.random.uniform( + -1, 1, [self.num_classes - 1, self.feature_size]).astype(self.dtype) + self.bias_np = np.random.uniform(-1, 1, ( + self.num_classes - 1, )).astype(self.dtype) + self.path_table_np = None + self.path_code_np = None + _, self.out_np = hsigmoid(self.x_np, self.weight_np, self.labels_np, + self.bias_np, self.num_classes) + self.set_attrs() + + if self.is_custom: + _, self.out_np = hsigmoidWithCustomTree( + self.x_np, self.weight_np, self.path_table_np, + self.path_code_np, self.labels_np, + self.bias_np.reshape(-1, 1), self.num_classes) + + def set_attrs(self): + pass + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + labels = paddle.to_tensor(self.labels_np) + weight = paddle.to_tensor(self.weight_np) + bias = paddle.to_tensor(self.bias_np) + path_table = None + path_code = None + if self.is_custom: + path_table = paddle.to_tensor(self.path_table_np) + path_code = paddle.to_tensor(self.path_code_np) + out1 = F.hsigmoid_loss(x, labels, self.num_classes, weight, bias, + path_table, path_code) + + weight_attr = I.NumpyArrayInitializer(self.weight_np) + bias_attr = I.NumpyArrayInitializer(self.bias_np) + m = paddle.nn.HSigmoidLoss(self.feature_size, self.num_classes, + weight_attr, bias_attr, self.is_custom) + out2 = m(x, labels, path_table, path_code) + + for out in [out1, out2]: + self.assertTrue(np.allclose(self.out_np, out.numpy())) + paddle.enable_static() + + def test_static_api(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + x = paddle.static.data('x', [-1, self.feature_size]) + labels = paddle.static.data('labels', [-1, 1], 'int64') + weight = paddle.static.data('weight', [-1, self.feature_size]) + bias = paddle.static.data('bias', [-1, ]) + path_table = None + path_code = None + if self.is_custom: + path_table = paddle.static.data('path_table', [-1, -1], 'int64') + path_code = paddle.static.data('path_code', [-1, -1], 'int64') + out1 = F.hsigmoid_loss(x, labels, self.num_classes, weight, bias, + path_table, path_code) + + weight_attr = paddle.framework.ParamAttr( + initializer=I.NumpyArrayInitializer(self.weight_np)) + bias_attr = paddle.framework.ParamAttr( + initializer=I.NumpyArrayInitializer(self.bias_np)) + m = paddle.nn.HSigmoidLoss(self.feature_size, self.num_classes, + weight_attr, bias_attr, self.is_custom) + out2 = m(x, labels, path_table, path_code) + + exe = paddle.static.Executor(self.place) + exe.run(startup_program) + feed_dict = { + 'x': self.x_np, + 'labels': self.labels_np, + 'weight': self.weight_np, + 'bias': self.bias_np + } + if self.is_custom: + feed_dict["path_code"] = self.path_code_np + feed_dict["path_table"] = self.path_table_np + ret1, ret2 = exe.run(train_program, + feed=feed_dict, + fetch_list=[out1, out2]) + + for ret in [ret1, ret2]: + self.assertTrue(np.allclose(self.out_np, ret)) + + def test_fluid_api(self): + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + x = fluid.data('x', [-1, 
self.feature_size]) + labels = fluid.data('labels', [-1, 1], 'int64') + path_table = None + path_code = None + if self.is_custom: + path_table = fluid.data('path_table', [-1, -1], 'int64') + path_code = fluid.data('path_code', [-1, -1], 'int64') + weight_attr = I.NumpyArrayInitializer(self.weight_np) + bias_attr = I.NumpyArrayInitializer(self.bias_np) + out = fluid.layers.hsigmoid(x, labels, self.num_classes, + weight_attr, bias_attr, 'out', + path_table, path_code, self.is_custom) + + exe = fluid.Executor(self.place) + exe.run(startup_program) + feed_dict = {'x': self.x_np, 'labels': self.labels_np} + if self.is_custom: + feed_dict["path_code"] = self.path_code_np + feed_dict["path_table"] = self.path_table_np + ret, = exe.run(train_program, feed=feed_dict, fetch_list=[out]) + + self.assertTrue(np.allclose(ret, self.out_np)) + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + # test paddle.nn.HSigmoidLoss + self.assertRaises(ValueError, paddle.nn.HSigmoidLoss, 6, 1) + + # test paddle.nn.functional.hsigmoid_loss + x = paddle.static.data('x', [4, 6]) + label = paddle.static.data('label', [4, 1], 'int64') + weight = paddle.static.data('weight', [7, 6]) + bias = paddle.static.data('bias', [7]) + + x_int32 = paddle.static.data('x_int32', [4, 6], 'int32') + self.assertRaises(TypeError, F.hsigmoid_loss, x_int32, label, 8, + weight) + + label_float32 = paddle.static.data('label_float32', [4, 1], + 'float32') + self.assertRaises(TypeError, F.hsigmoid_loss, x, label_float32, 8, + weight) + + weight_int32 = paddle.static.data('weight_int32', [7, 6], 'int32') + self.assertRaises(TypeError, F.hsigmoid_loss, x, label, 8, + weight_int32) + + bias_int32 = paddle.static.data('bias_int32', [7], 'int32') + self.assertRaises( + TypeError, + F.hsigmoid_loss, + x, + label, + 8, + weight, + bias=bias_int32) + + path_table_int32 = paddle.static.data('path_table_int32', [7], + 'int32') + self.assertRaises( + TypeError, + F.hsigmoid_loss, + x, + label, + 8, + weight, + path_table=path_table_int32) + + path_code_int32 = paddle.static.data('path_code_int32', [7], + 'int32') + self.assertRaises( + TypeError, + F.hsigmoid_loss, + x, + label, + 8, + weight, + path_code=path_code_int32) + + # test paddle.fluid.layers.hsigmoid with program_guard(Program()): label = fluid.data('label', [4, 1], 'int64') # The input type must be Variable. 
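
The hunk that follows adds TestHSigmoidLossAPICustom, which exercises the custom-tree branch. A short sketch of that branch through the functional API, reusing the path_table/path_code arrays from that test; the input, label and weight values are illustrative, with shapes mirroring the test (batch 4, feature size 6, 8 classes):

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.set_device('cpu')

    feature_size, num_classes = 6, 8
    x = paddle.uniform([4, feature_size])
    label = paddle.to_tensor([[0], [1], [4], [5]], dtype='int64')
    weight = paddle.uniform([num_classes - 1, feature_size])
    # each row: indices of the classifier (non-leaf) nodes on one sample's path, padded with -1
    path_table = paddle.to_tensor(np.array(
        [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1),
         (0, 1, 4, -1, -1), (0, 2, -1, -1, -1)], dtype='int64'))
    # each row: the 0/1 label of the binary classifier at the matching node, padded with -1
    path_code = paddle.to_tensor(np.array(
        [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1),
         (1, 0, 0, -1, -1), (0, 1, -1, -1, -1)], dtype='int64'))

    out = F.hsigmoid_loss(x, label, num_classes, weight,
                          path_table=path_table, path_code=path_code)  # shape [4, 1]
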
@@ -410,5 +596,17 @@ def test_errors(self): label_int32, 2) +class TestHSigmoidLossAPICustom(TestHSigmoidLossAPI): + def set_attrs(self): + self.is_custom = True + self.path_table_np = np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), ( + 0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64) + self.path_code_np = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( + 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64) + + def test_errors(self): + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index b16e95b7130f9..1dddef0cace1d 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -73,7 +73,6 @@ from .layer.activation import Tanhshrink #DEFINE_ALIAS from .layer.activation import ThresholdedReLU #DEFINE_ALIAS from .layer.activation import LogSoftmax #DEFINE_ALIAS -from .layer.activation import HSigmoid #DEFINE_ALIAS from .layer.activation import Maxout #DEFINE_ALIAS from .layer.common import BilinearTensorProduct #DEFINE_ALIAS from .layer.common import Pool2D #DEFINE_ALIAS @@ -133,6 +132,7 @@ # from .layer.loss import NCELoss #DEFINE_ALIAS from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS from .layer.loss import CrossEntropyLoss #DEFINE_ALIAS +from .layer.loss import HSigmoidLoss #DEFINE_ALIAS from .layer.loss import MSELoss #DEFINE_ALIAS from .layer.loss import L1Loss #DEFINE_ALIAS from .layer.loss import NLLLoss #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index d2e1832c6b637..30eefb2c3912b 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -36,7 +36,6 @@ from .activation import hardtanh #DEFINE_ALIAS from .activation import hardsigmoid #DEFINE_ALIAS from .activation import hardswish #DEFINE_ALIAS -from .activation import hsigmoid #DEFINE_ALIAS from .activation import leaky_relu #DEFINE_ALIAS from .activation import log_sigmoid #DEFINE_ALIAS from .activation import maxout #DEFINE_ALIAS @@ -140,6 +139,7 @@ from .loss import cross_entropy #DEFINE_ALIAS from .loss import dice_loss #DEFINE_ALIAS from .loss import edit_distance #DEFINE_ALIAS +from .loss import hsigmoid_loss #DEFINE_ALIAS from .loss import iou_similarity #DEFINE_ALIAS from .loss import kl_div #DEFINE_ALIAS from .loss import l1_loss #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 6e09e25b1ab05..33ecd29162c12 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -26,7 +26,6 @@ 'hardtanh', 'hardsigmoid', 'hardswish', - 'hsigmoid', 'leaky_relu', 'log_sigmoid', 'maxout', @@ -361,128 +360,6 @@ def hardswish(x, name=None): return out -def hsigmoid(input, - label, - weight, - bias, - num_classes, - path_table=None, - path_code=None, - is_sparse=False): - """ - :alias_main: paddle.nn.functional.hsigmoid - :alias: paddle.nn.functional.hsigmoid,paddle.nn.functional.activation.hsigmoid - - The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity - and speed up the model training, especially the training of language model. - Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. - For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on - the path, and sum them to get a total cost. 
- Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` - represents the number of classes or the size of word dict. - - The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural - Network Language Model `_. For the custom - tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): - - 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. - 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. - 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. - Code means the label of each binary classifier, 1 indicate true, 0 indicate false. - 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related - to the same batch of inputs. - - Parameters: - input (Variable): A tensor with the shape [N, D], where N is the size of mini-batch, - and D is the feature size. Its data type supports float32 and float64. - label (Variable): A tensor contains the labels of training data. Its shape is [N, 1] - and data type is int64. - weight (Variable): A tensor with shape (num_classes - 1, D) if not using custom tree(path_code and path_table is None), or (num_classes, D) if using custom tree. - bias (Variable): A tensor with shape (num_classes - 1, 1) if not using custom tree(path_code and path_table is None), or (num_classes, 1) if using custom tree. - num_classes (int): The number of classes or the size of word dict, must be greater than 2. - If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` - should not be None. If the custom tree is used (:attr:`is_custom` is set to True), - :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of - classes using by the binary classifier. - path_table (Variable, optional): A tensor that stores each batch of samples' path from leaf to root - node, its shape is [N, L] and data type is int64, where L is the length of path. For each sample i, - path_table[i] is a np.array like structure and each element in this array is the indexes in parent - nodes' weight matrix. Default: None. - path_code (Variable, optional): A tensor that stores each batch of samples' code of path from leaf - to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`. - Each code of path is consisted with the code of nodes from leaf to root node. Default: None. - is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the - gradient of W and input will be sparse. Default: False. - - Returns: - Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`. - - Examples: - .. 
code-block:: python - - from paddle import fluid, nn - import paddle.fluid.dygraph as dg - import paddle.nn.functional as F - import numpy as np - - main = fluid.Program() - start = fluid.Program() - feature_size = 6 - num_classes = 8 - with fluid.unique_name.guard(): - with fluid.program_guard(main, start): - x = fluid.data("input", [-1, feature_size], - dtype="float32") - label = fluid.data("labels", [-1, 1], dtype="int64") - w = fluid.data("weight", (num_classes -1, feature_size), dtype="float32") - b = fluid.data("bias", (num_classes -1, ), dtype="float32") - y = F.hsigmoid(x, label, w, b, num_classes) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(start) - feed_dict = { - "input": np.random.randn(4, feature_size).astype(np.float32), - "labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64), - "weight": np.random.randn(num_classes - 1, feature_size).astype(np.float32), - "bias": np.random.randn(num_classes - 1, ).astype(np.float32), - } - y_np, = exe.run(main, feed=feed_dict, fetch_list=[y]) - print(y_np.shape) - - # (4, 1) - """ - - attrs = { - "num_classes": num_classes, - "is_sparse": is_sparse, - "remote_prefetch": is_sparse - } - - inputs = { - "X": input, - "W": weight, - "Bias": bias, - "PathTable": path_table, - "PathCode": path_code, - "Label": label - } - - helper = LayerHelper('hierarchical_sigmoid', **locals()) - dtype = helper.input_dtype() - - out = helper.create_variable_for_type_inference(dtype) - pre_out = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight} - - helper.append_op( - type="hierarchical_sigmoid", - inputs=inputs, - outputs=outputs, - attrs=attrs) - return out - - def leaky_relu(x, negative_slope=0.01, name=None): """ leaky_relu activation diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c4b5606dddcf1..d085213dffc23 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -54,6 +54,7 @@ 'cross_entropy', 'dice_loss', 'edit_distance', + 'hsigmoid_loss', 'iou_similarity', 'kl_div', 'l1_loss', @@ -343,6 +344,138 @@ def binary_cross_entropy_with_logits(logit, return out +def hsigmoid_loss(input, + label, + num_classes, + weight, + bias=None, + path_table=None, + path_code=None, + is_sparse=False, + name=None): + """ + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity + and speed up the model training, especially the training of language model. + Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. + For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on + the path, and sum them to get a total cost. + Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + represents the number of classes or the size of word dict. + + The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural + Network Language Model `_. For the custom + tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): + + 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. + 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. + 3. 
Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. + Code means the label of each binary classifier, 1 indicate true, 0 indicate false. + 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related + to the same batch of inputs. + + Parameters: + input (Tensor): A tensor with the shape [N, D], where N is the size of mini-batch, + and D is the feature size. Its data type supports float32 or float64. + label (Tensor): A tensor contains the labels of training data. Its shape is [N, 1] + and data type is int64. + num_classes (int): The number of classes or the size of word dict, must be greater than 2. + If the default tree is used (path_code and path_table is None are None), `num_classes` + should not be None. If the custom tree is used (path_code and path_table is None are not None), + `num_classes` should be the number of non-leaf nodes, which indicates the num of + classes using by the binary classifier. + weight (Tensor): A tensor with shape (num_classes - 1, D), with the same data type as `input`. + bias (Tensor, optional): A tensor with shape (num_classes - 1, 1), with the same data type as `input`. + If `bias` is None, no bias will be add. Default is None. + path_table (Tensor, optional): A tensor that stores each batch of samples' path from leaf to root + node, its shape is [N, L] and data type is int64, where L is the length of path. For each sample i, + path_table[i] is a np.array like structure and each element in this array is the indexes in parent + nodes' weight matrix. If `path_table` and `path_code` are None, the default tree will be used. + Default is None. + path_code (Tensor, optional): A tensor that stores each batch of samples' code of path from leaf + to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`. + Each code of path is consisted with the code of nodes from leaf to root node. If `path_table` and + `path_code` are None, the default tree will be used. Default is None. + is_sparse (bool, optional): Whether use sparse updating instead of dense updating. If `is_sparse` is True, + the gradient of `weight` and `input` will be sparse. Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as `input`. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + paddle.set_device('cpu') + + input = paddle.uniform([2, 3]) + # [[-0.8018668 0.8736385 -0.9064771 ] # random + # [-0.10228515 -0.87188244 -0.8783718 ]] # random + label = paddle.to_tensor([0, 1, 4, 5]) + num_classes = 5 + weight=paddle.uniform([num_classes-1, 3]) + # [[-0.24148715 0.8449961 -0.7399121 ] # random + # [-0.9800559 0.43509364 0.9091208 ] # random + # [ 0.60194826 0.10430074 -0.4521166 ] # random + # [-0.4469818 -0.01536179 -0.604454 ]] # random + + out=F.hsigmoid_loss(input, label, num_classes, weight) + # [[3.0159328] + # [2.2407534]] + """ + + if in_dygraph_mode(): + out, _, _ = core.ops.hierarchical_sigmoid( + input, weight, label, path_table, path_code, bias, 'num_classes', + num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse) + return out + + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'hsigmoid_loss') + check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid_loss') + check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], + 'hsigmoid_loss') + if bias is not None: + check_variable_and_dtype(bias, 'bias', ['float32', 'float64'], + 'hsigmoid_loss') + if path_table is not None: + check_variable_and_dtype(path_table, 'path_table', ['int64'], + 'hsigmoid_loss') + if path_code is not None: + check_variable_and_dtype(path_code, 'path_code', ['int64'], + 'hsigmoid_loss') + + attrs = { + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": is_sparse + } + + inputs = { + "X": input, + "W": weight, + "Bias": bias, + "PathTable": path_table, + "PathCode": path_code, + "Label": label + } + + helper = LayerHelper('hsigmoid_loss', **locals()) + out = helper.create_variable_for_type_inference(input.dtype) + pre_out = helper.create_variable_for_type_inference(input.dtype) + outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight} + + helper.append_op( + type="hierarchical_sigmoid", + inputs=inputs, + outputs=outputs, + attrs=attrs) + return out + + def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): """ This operator calculates smooth_l1_loss. Creates a criterion that uses a squared diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 760af09f1f2f5..3a5bcaa21fe5b 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -41,7 +41,6 @@ from .activation import Sigmoid #DEFINE_ALIAS # from .activation import Softmax #DEFINE_ALIAS from .activation import LogSoftmax #DEFINE_ALIAS -from .activation import HSigmoid #DEFINE_ALIAS from .common import BilinearTensorProduct #DEFINE_ALIAS from .common import Bilinear #DEFINE_ALIAS from .common import Pool2D #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index cd17f26e09e37..dbb9d00f365cf 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -38,7 +38,6 @@ 'LogSigmoid', 'LogSoftmax', 'Maxout', - 'HSigmoid', ] from ...fluid.dygraph import layers @@ -319,142 +318,6 @@ def forward(self, x): return F.hardtanh(x, self._min, self._max, self._name) -class HSigmoid(layers.Layer): - """ - :alias_main: paddle.nn.HSigmoid - :alias: paddle.nn.HSigmoid,paddle.nn.layer.HSigmoid,paddle.nn.layer.activation.HSigmoid - - Hierarchical Sigmoid Layer. 
- - The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity - and speed up the model training, especially the training of language model. - Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. - For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on - the path, and sum them to get a total cost. - Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` - represents the number of classes or the size of word dict. - - The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural - Network Language Model _`. For the custom - tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): - - 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. - 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. - 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. - Code means the label of each binary classifier, 1 indicate true, 0 indicate false. - 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related - to the same batch of inputs. - - Parameters: - feature_size (int): The feature size. - num_classes (int): The number of classes or the size of word dict, must be greater than 2. - If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` - should not be None. If the custom tree is used (:attr:`is_custom` is set to True), - :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of - classes using by the binary classifier. - param_attr (ParamAttr, optional): The parameter attribute for the learnable parameters/weights - of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a - ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is - initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it - is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, - hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not - set, the bias is initialized zero. Default: None. - is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and - `path_code` should be passed to its forward method, otherwise `path_table` and `path_code` - should not be passed to its forward method. Default: False. - is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the - gradient of W and input will be sparse. Default: False. - - Returns: - None - - Examples: - .. 
code-block:: python - - from paddle import fluid, nn - import paddle.fluid.dygraph as dg - import paddle.nn.functional as F - import numpy as np - - main = fluid.Program() - start = fluid.Program() - feature_size = 6 - num_classes = 8 - with fluid.unique_name.guard(): - with fluid.program_guard(main, start): - x = fluid.data("input", [-1, feature_size], - dtype="float32") - label = fluid.data("labels", [-1, 1], dtype="int64") - hsm = nn.HSigmoid(feature_size, num_classes) - y = hsm(x, label) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(start) - feed_dict = { - "input": np.random.randn(4, feature_size).astype(np.float32), - "labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64), - } - y_np, = exe.run(main, feed=feed_dict, fetch_list=[y]) - print(y_np.shape) - - # (4, 1) - """ - - def __init__(self, - feature_size, - num_classes, - param_attr=None, - bias_attr=None, - is_custom=False, - is_sparse=False, - dtype="float32"): - super(HSigmoid, self).__init__() - if (num_classes < 2) and (not is_custom): - raise ValueError( - "num_classes must not be less than 2 with default tree") - - if (not is_custom) and (is_sparse): - print("Sparse mode should not be used without custom tree") - is_sparse = False - - self._feature_size = feature_size - self._num_classes = num_classes - self._is_custom = is_custom - self._is_sparse = is_sparse - - self._param_attr = param_attr - self._bias_attr = bias_attr - - self._dtype = dtype - - remote_prefetch = is_sparse - print("With sparse mode, if your models has only" - " small parameter prefetch may cause speed down") - - C = self._num_classes if is_custom else self._num_classes - 1 - self.weight = self.create_parameter( - [C, self._feature_size], - attr=self._param_attr, - is_bias=False, - dtype=self._dtype) - self.bias = self.create_parameter( - [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype) - - def forward(self, input, label, path_table=None, path_code=None): - out = F.hsigmoid( - input, - label, - self.weight, - self.bias, - self._num_classes, - path_table=path_table, - path_code=path_code, - is_sparse=self._is_sparse) - return out - - class PReLU(layers.Layer): """ PReLU Activation. diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 98048bb7e64cf..5ce4baca55749 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -23,6 +23,7 @@ __all__ = [ 'BCEWithLogitsLoss', 'CrossEntropyLoss', + 'HSigmoidLoss', 'MSELoss', 'L1Loss', 'NLLLoss', @@ -251,6 +252,128 @@ def forward(self, input, label): reduction=self.reduction) +class HSigmoidLoss(fluid.dygraph.Layer): + """ + Hierarchical Sigmoid Layer. + + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity + and speed up the model training, especially the training of language model. + Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. + For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on + the path, and sum them to get a total cost. + Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + represents the number of classes or the size of word dict. + + The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural + Network Language Model _`. 
For the custom + tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): + + 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. + 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. + 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. + Code means the label of each binary classifier, 1 indicate true, 0 indicate false. + 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related + to the same batch of inputs. + + Parameters: + feature_size (int): The number of features. + num_classes (int): The number of classes or the size of word dict, must be greater than 2. + If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` + should not be None. If the custom tree is used (:attr:`is_custom` is set to True), + :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of + classes using by the binary classifier. + weight_attr (ParamAttr, optional): The parameter attribute for the learnable weights + of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a + ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is + initialized with Xavier. Default is None. + bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it + is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, + hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not + set, the bias is initialized zero. Default is None. + is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and + `path_code` should be passed to its forward method, otherwise `path_table` and `path_code` + should not be passed to its forward method. Default is False. + is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, + the gradient of weight and input will be sparse. Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + input (Tensor): The input tensor. The shapes is [N, D], where N is batch size and D is feature size. It's data type should be float32, float64. + label (Tensor): It's shapes is [N, 1]. It's data type should be int64. + output (Tensor): The HSigmoid Loss of ``input`` and ``label``. Shape is [N, 1] + + Examples: + .. 
code-block:: python + + import paddle + paddle.set_device('cpu') + + input = paddle.uniform([2, 3]) + # [[-0.2820413 0.9528898 -0.81638825] # random + # [-0.6733154 -0.33866507 0.25770962]] # random + label = paddle.to_tensor([0, 1, 4, 5]) + m = paddle.nn.HSigmoidLoss(3, 5) + out = m(input, label) + # [[2.4543471] + # [1.9359267]] + """ + + def __init__(self, + feature_size, + num_classes, + weight_attr=None, + bias_attr=None, + is_custom=False, + is_sparse=False, + name=None): + super(HSigmoidLoss, self).__init__() + if (num_classes < 2) and (not is_custom): + raise ValueError( + "num_classes must not be less than 2 with default tree") + + if (not is_custom) and (is_sparse): + print("Sparse mode should not be used without custom tree") + is_sparse = False + + self._feature_size = feature_size + self._num_classes = num_classes + self._is_custom = is_custom + self._is_sparse = is_sparse + + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + self._name = name + self._dtype = paddle.get_default_dtype() + + remote_prefetch = is_sparse + print("With sparse mode, if your models has only" + " small parameter prefetch may cause speed down") + + C = self._num_classes if is_custom else self._num_classes - 1 + self.weight = self.create_parameter( + [C, self._feature_size], + attr=self._weight_attr, + is_bias=False, + dtype=self._dtype) + self.bias = self.create_parameter( + [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype) + + def forward(self, input, label, path_table=None, path_code=None): + out = F.hsigmoid_loss( + input, + label, + self._num_classes, + self.weight, + self.bias, + path_table=path_table, + path_code=path_code, + is_sparse=self._is_sparse, + name=self._name) + return out + + class MSELoss(fluid.dygraph.layers.Layer): """ **Mean Square Error Loss** diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index d50bb33f24001..cd089432b1ca3 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -27,7 +27,6 @@ 'data_norm', 'deformable_conv', 'group_norm', - 'hsigmoid', 'instance_norm', 'layer_norm', 'multi_box_head', @@ -53,7 +52,6 @@ from ...fluid.layers import data_norm #DEFINE_ALIAS from ...fluid.layers import deformable_conv #DEFINE_ALIAS from ...fluid.layers import group_norm #DEFINE_ALIAS -from ...fluid.layers import hsigmoid #DEFINE_ALIAS from ...fluid.layers import instance_norm #DEFINE_ALIAS from ...fluid.layers import layer_norm #DEFINE_ALIAS from ...fluid.layers import multi_box_head #DEFINE_ALIAS From 7779790c61a5a2412eba7235bbd839905f008530 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Mon, 12 Oct 2020 15:16:12 +0800 Subject: [PATCH 74/91] error message optimization in softmax_with_cross_entropy_op (#27772) * error message optimization in softmax_with_cross_entropy_op * fix some unsuited comment --- .../operators/softmax_with_cross_entropy_op.cu | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index ba56e5e36f985..3ac7a5a127b37 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -357,7 +357,8 @@ static void HardLabelSoftmaxWithCrossEntropy( CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); default: - PADDLE_THROW("BlockDim must be 2^n in 
softmax_with_cross_entropy_op"); + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); break; } #undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL @@ -397,7 +398,8 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data, CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); default: - PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op"); + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); break; } @@ -408,8 +410,10 @@ template class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), - "This kernel only runs on GPU device."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::Unavailable("softmax_with_cross_entropy operator's " + "CUDA kernel only runs on GPU device.")); const Tensor* logits = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax = context.Output("Softmax"); @@ -469,8 +473,10 @@ template class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), - "This kernel only runs on GPU device."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::Unavailable("softmax_with_cross_entropy operator's " + "CUDA kernel only runs on GPU device.")); const Tensor* labels = context.Input("Label"); const T* loss_grad_data = context.Input(framework::GradVarName("Loss"))->data(); From 36bb056ed67323d87e85897b2c360601cbe8f317 Mon Sep 17 00:00:00 2001 From: GaoWei8 <53294385+GaoWei8@users.noreply.github.com> Date: Mon, 12 Oct 2020 15:55:24 +0800 Subject: [PATCH 75/91] Add flattern weight of lstm (#27192) * add flattern weight of lstm --- paddle/fluid/operators/cudnn_lstm_op.cc | 43 ++++- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 162 ++++++++++++++++-- python/paddle/fluid/layers/rnn.py | 20 +-- .../tests/unittests/test_lstm_cudnn_op.py | 127 +++++++++++--- 4 files changed, 290 insertions(+), 62 deletions(-) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index 50486ad041aa4..31f0c26a3f3a1 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -25,7 +26,6 @@ class CudnnLSTMOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTM"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTM"); OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTM"); OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTM"); @@ -122,7 +122,13 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor) the learnable hidden-hidden weights." " The shape is (N), where N is total weight size of the LSTM. 
" - " cudnn concatenate all the weight to one Tensor"); + " cudnn concatenate all the weight to one Tensor") + .AsDispensable(); + AddInput("WeightList", + "(vector), stores weight and bias data when the weight " + "use the list format. ") + .AsDispensable() + .AsDuplicable(); AddInput("SequenceLength", "(Tensor) When the input data is padding, " "set this parameter. This parameter represents " @@ -216,7 +222,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTMGrad"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTMGrad"); OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTMGrad"); OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTMGrad"); @@ -228,7 +233,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { }; SetOutGradDim("Input"); - SetOutGradDim("W"); + if (ctx->HasInputs("WeightList")) { + ctx->SetOutputsDim(framework::GradVarName("WeightList"), + ctx->GetInputsDim("WeightList")); + } SetOutGradDim("InitH"); SetOutGradDim("InitC"); } @@ -251,7 +259,9 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Input", this->Input("Input")); op->SetInput("InitH", this->Input("InitH")); op->SetInput("InitC", this->Input("InitC")); - op->SetInput("W", this->Input("W")); + if (this->HasInput("WeightList")) { + op->SetInput("WeightList", this->Input("WeightList")); + } if (this->HasInput("SequenceLength")) { op->SetInput("SequenceLength", this->Input("SequenceLength")); } @@ -262,8 +272,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker { op->SetInput(framework::GradVarName("LastC"), this->OutputGrad("LastC")); op->SetInput(framework::GradVarName("LastH"), this->OutputGrad("LastH")); + if (this->HasInput("WeightList")) { + op->SetOutput(framework::GradVarName("WeightList"), + this->InputGrad("WeightList", false)); + } + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - op->SetOutput(framework::GradVarName("W"), this->InputGrad("W")); op->SetOutput(framework::GradVarName("InitH"), this->InputGrad("InitH")); op->SetOutput(framework::GradVarName("InitC"), this->InputGrad("InitC")); op->SetAttrMap(this->Attrs()); @@ -290,3 +304,20 @@ REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel); + +// TODO(Shixiaowei02) Add ModifyInput support +REGISTER_OP_VERSION(cudnn_lstm) + .AddCheckpoint( + R"ROC( + Upgrade cudnn_lstm add a new input [WeightList] and modify input [W] to dispensable.)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput( + "WeightList", + "The WeightList stores weight and bias data. WeightList is " + "dispensable.") + .NewInput("SequenceLength", + "When the input data is padding, set this parameter. 
" + "SequenceLength is dispensable.") + .NewOutput("StateOut", "Store the global drop state when training") + .NewOutput("Reserve", + "A temporary output Tensor to store the reserve_data")); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 6ac75b78d7058..bea7d9c02ca7d 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -30,6 +30,66 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; +template +bool is_continuous(const Type &weight_list) { + bool continuous = true; + for (size_t i = 0; i < weight_list.size() - 1; ++i) { + auto *in_data = weight_list[i]->template data(); + auto *in_after_data = weight_list[i + 1]->template data(); + auto in_size = weight_list[i]->numel(); + bool temp = in_data + in_size == in_after_data; + continuous = continuous && temp; + } + return continuous; +} + +int size_sum(const std::vector &weight_list) { + int size = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + auto in_size = weight_list[i]->numel(); + size += in_size; + } + return size; +} + +template +void weight_to_tensor(const platform::Place &place, cudaStream_t stream, + const std::vector &weight_list, + Tensor *weight) { + auto weight_data = weight->data(); + int weight_offset = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + const T *in_data = weight_list[i]->data(); + auto in_size = weight_list[i]->numel(); + + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()), + weight_data + weight_offset, + BOOST_GET_CONST(platform::CUDAPlace, weight_list[i]->place()), + in_data, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + +template +void weight_to_tensor_list(const platform::Place &place, cudaStream_t stream, + std::vector *weight_grad, + const std::vector &weight_input, + const Tensor *weight) { + int weight_offset = 0; + auto *weight_data = weight->data(); + for (size_t i = 0; i < weight_input.size(); ++i) { + auto in_size = weight_input[i]->numel(); + T *weight_grad_data = (*weight_grad)[i]->mutable_data(place); + const T *src = weight_data + weight_offset; + + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, (*weight_grad)[i]->place()), + weight_grad_data, BOOST_GET_CONST(platform::CUDAPlace, weight->place()), + src, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + template void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, const int &seq_length, ScopedRNNBase *rnn, const T *x_data, @@ -75,8 +135,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { const Tensor *init_h = ctx.Input("InitH"); const Tensor *init_c = ctx.Input("InitC"); - auto w = ctx.Input("W"); - Tensor *out = ctx.Output("Out"); Tensor *last_h = ctx.Output("LastH"); Tensor *last_c = ctx.Output("LastC"); @@ -87,8 +145,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { const T *init_h_data = init_h->data(); const T *init_c_data = init_c->data(); - const T *w_data = w->data(); - T *out_data = out->mutable_data(ctx.GetPlace()); T *last_h_data = last_h->mutable_data(ctx.GetPlace()); T *last_c_data = last_c->mutable_data(ctx.GetPlace()); @@ -113,11 +169,45 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int seq_length = x->dims()[0]; int batch_size = x->dims()[1]; int input_size = x->dims()[2]; - int weight_numel = w->numel(); bool state_initialized = state_out->IsInitialized() ? 
true : false; size_t workspace_size; size_t reserve_size; + Tensor weight_whole; + T *w_data = nullptr; + int weight_numel; + bool w_initialized = false; + auto place = ctx.GetPlace(); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + if (is_test && ctx.HasInput("W")) { + auto *W = ctx.Input("W"); + w_initialized = W->IsInitialized() ? true : false; + weight_numel = W->numel(); + } + if (!w_initialized) { + auto weight_list = ctx.MultiInput("WeightList"); + bool continuous = + is_continuous>(weight_list); + weight_numel = size_sum(weight_list); + + if (!continuous) { + LOG_FIRST_N(WARNING, 2) + << "If the memory space of the Input WeightList is not " + "continuous, less efficient calculation will be " + "called. Please call coalesce_tensor op to make the " + "input memory continuous."; + weight_whole.mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, &weight_whole); + w_data = weight_whole.data(); + } else { + w_data = const_cast(weight_list[0]->data()); + } + } else { + auto *W = ctx.Input("W"); + w_data = const_cast(W->data()); + } ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, num_layers, dropout_prob, seed, weight_numel, @@ -136,6 +226,12 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { LSTMInferece(has_seq_length, handle, seq_length, &rnn, x_data, init_h_data, init_c_data, w_data, out_data, last_h_data, last_c_data, &workspace_data_, workspace_size); + if (!w_initialized && ctx.HasInput("W") && ctx.HasInput("WeightList")) { + auto *W = const_cast(ctx.Input("W")); + auto weight_list = ctx.MultiInput("WeightList"); + W->mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, W); + } } else { if (!has_seq_length) { // for train @@ -176,11 +272,11 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto *input = ctx.Input("Input"); - auto *weight = ctx.Input("W"); auto *init_h = ctx.Input("InitH"); auto *init_c = ctx.Input("InitC"); auto *reserve = ctx.Input("Reserve"); auto *state_out = ctx.Input("StateOut"); + auto weight_list = ctx.MultiInput("WeightList"); auto *out = ctx.Input("Out"); auto *out_grad = ctx.Input(framework::GradVarName("Out")); @@ -188,9 +284,10 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto *last_c_grad = ctx.Input(framework::GradVarName("LastC")); auto *in_grad = ctx.Output(framework::GradVarName("Input")); - auto *weight_grad = ctx.Output(framework::GradVarName("W")); auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); + auto weight_grad_list = ctx.MultiOutput( + framework::GradVarName("WeightList")); auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -199,7 +296,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto init_h_dims = init_h->dims(); auto init_c_dims = init_c->dims(); - auto *weight_data = weight->data(); auto *init_h_data = init_h->data(); auto *init_c_data = init_c->data(); auto *out_data = out->data(); @@ -207,18 +303,50 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto *last_h_grad_data = last_h_grad->data(); auto *last_c_grad_data = last_c_grad->data(); + auto place = ctx.GetPlace(); + int weight_numel = size_sum(weight_list); + bool continuous = + is_continuous>(weight_list); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + Tensor weight_whole; + T 
*weight_data = nullptr; + + if (!continuous) { + weight_whole.mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, &weight_whole); + weight_data = weight_whole.data(); + } else { + weight_data = const_cast(weight_list[0]->data()); + } + + Tensor weight_grad; math::SetConstant zero; - weight_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, weight_grad, static_cast(0.0)); + weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); + zero(dev_ctx, &weight_grad, static_cast(0.0)); + T *weight_grad_data = weight_grad.data(); + + int offset = 0; + for (size_t i = 0; i < weight_grad_list.size(); ++i) { + size_t len = weight_grad_list[i]->numel(); + auto dim = weight_grad_list[i]->dims(); + weight_grad_list[i] + ->ShareDataWith(weight_grad.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } in_grad->mutable_data(input_dims, ctx.GetPlace()); auto *in_grad_data = in_grad->data(); - init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); - auto *init_h_grad_data = init_h_grad->data(); + if (init_h_grad) init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); + auto *init_h_grad_data = init_h_grad ? init_h_grad->data() : nullptr; - init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); - auto *init_c_grad_data = init_c_grad->data(); + if (init_c_grad) init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); + auto *init_c_grad_data = init_c_grad ? init_c_grad->data() : nullptr; float dropout_prob = ctx.Attr("dropout_prob"); bool is_bidirec = ctx.Attr("is_bidirec"); @@ -236,7 +364,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { int seq_length = input_dims[0]; int batch_size = input->dims()[1]; int input_size = input->dims()[2]; - int weight_numel = weight->numel(); size_t workspace_size; size_t reserve_size; @@ -268,8 +395,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), - weight_grad->data(), const_cast(reserve_data), - reserve_size)); + weight_grad_data, const_cast(reserve_data), reserve_size)); } else { #if CUDNN_VERSION >= 7201 // for train @@ -288,7 +414,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_seq_desc(), out->data(), workspace_data_.data(), workspace_size, - rnn.weight_desc(), weight_grad->data(), + rnn.weight_desc(), weight_grad_data, const_cast(reserve_data), reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 8ac46ad2648fd..57c2489194337 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -2443,23 +2443,17 @@ def lstm(input, input_shape = list(input.shape) input_size = input_shape[-1] weight_size = 0 + num_dirrection = 2 if is_bidirec == True else 1 + for i in range(num_layers): if i == 0: - input_weight_size = (input_size * hidden_size) * 4 + input_weight_size = (input_size * hidden_size) * 4 * num_dirrection else: - if is_bidirec: - input_weight_size = (hidden_size * 2 * hidden_size) * 4 - else: - input_weight_size = (hidden_size * hidden_size) * 4 + input_weight_size = (hidden_size * hidden_size) * 4 * num_dirrection + hidden_weight_size = (hidden_size * hidden_size) * 4 * num_dirrection - hidden_weight_size = (hidden_size * hidden_size) * 
4 - - if is_bidirec: - weight_size += (input_weight_size + hidden_weight_size) * 2 - weight_size += hidden_size * 8 * 2 - else: - weight_size += input_weight_size + hidden_weight_size - weight_size += hidden_size * 8 + weight_size += input_weight_size + hidden_weight_size + weight_size += hidden_size * 8 * num_dirrection weight = helper.create_parameter( attr=helper.param_attr, diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 29a0fa55f7729..82443f8c5493b 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -20,14 +20,44 @@ import paddle.fluid.core as core from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers +import random +random.seed(2) +np.set_printoptions(threshold=np.inf) +paddle.enable_static() SIGMOID_THRESHOLD_MIN = -40.0 SIGMOID_THRESHOLD_MAX = 13.0 EXP_MAX_INPUT = 40.0 +class RandomWeight: + def __init__(self): + pass + + def updata_weight(self, hidden_size, input_size, dtype): + std = 1.0 / math.sqrt(hidden_size) + self.hidden_size = hidden_size + self.input_size = input_size + self.dtype = dtype + + self.weight_ih = np.random.uniform( + low=-std, high=std, size=(4 * self.hidden_size, + self.input_size)).astype(dtype) + self.weight_hh = np.random.uniform( + low=-std, high=std, size=(4 * self.hidden_size, + self.hidden_size)).astype(dtype) + self.bias_ih = np.random.uniform( + low=-std, high=std, size=(4 * self.hidden_size)).astype(dtype) + self.bias_hh = np.random.uniform( + low=-std, high=std, size=(4 * self.hidden_size)).astype(dtype) + + +weight = RandomWeight() + + class LayerMixin(object): def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) @@ -51,16 +81,13 @@ def __init__(self, input_size, hidden_size, bias=True): self.bias = bias self.dtype = np.float64 self.parameters = dict() - std = 1.0 / math.sqrt(hidden_size) - self.weight_ih = np.ones( - (4 * hidden_size, input_size), dtype=self.dtype) - self.weight_hh = np.ones((4 * hidden_size, - hidden_size)).astype(self.dtype) + self.weight_ih = weight.weight_ih + self.weight_hh = weight.weight_hh self.parameters['weight_ih'] = self.weight_ih self.parameters['weight_hh'] = self.weight_hh if bias: - self.bias_ih = np.ones((4 * hidden_size)).astype(self.dtype) - self.bias_hh = np.ones((4 * hidden_size)).astype(self.dtype) + self.bias_ih = weight.bias_ih + self.bias_hh = weight.bias_hh self.parameters['bias_ih'] = self.bias_ih self.parameters['bias_hh'] = self.bias_hh else: @@ -353,24 +380,26 @@ def __init__(self, @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestCUDNNLstmOp(OpTest): - #TODO(GaoWei8): Need to satisfy the result through the new interface + def get_weight_names(self): + weight_names = [] + for i in range(2 * self.num_layers): + weight_names.append('weight{}'.format(i)) + for i in range(2 * self.num_layers): + weight_names.append('bias{}'.format(i)) + return weight_names + def setUp(self): self.op_type = "cudnn_lstm" self.dtype = np.float64 self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32) self.num_layers = 1 + self.set_attrs() seq_length = 12 batch_size = 5 input_size = 21 hidden_size = 21 - input_weight_size = (hidden_size * hidden_size) * 4 - hidden_weight_size = (hidden_size * hidden_size) * 4 - weight_size = input_weight_size + hidden_weight_size - weight_size += hidden_size * 8 - weight_size *= 
self.num_layers - input = np.random.uniform( low=-0.1, high=0.1, size=(seq_length, batch_size, input_size)).astype(self.dtype) @@ -379,17 +408,39 @@ def setUp(self): input[9][3:][:] = 0 input[8][4:][:] = 0 + weight.updata_weight(hidden_size, input_size, self.dtype) rnn1 = LSTM( input_size, hidden_size, - self.num_layers, + num_layers=self.num_layers, time_major=True, direction="forward") output, (last_hidden, last_cell) = rnn1( input, sequence_length=self.sequence_length) - flat_w = np.ones((weight_size)).astype(self.dtype) + flat_w = [] + num = 0 + for i in range(self.num_layers): + if i == 0: + weight_ih = weight.weight_ih + else: + weight_ih = weight.weight_hh + flat_w.append(("weight" + str(num), weight_ih)) + num += 1 + for i in range(self.num_layers): + weight_hh = weight.weight_hh + flat_w.append(("weight" + str(num), weight_hh)) + num += 1 + num = 0 + for i in range(self.num_layers): + bias_ih = weight.bias_ih + flat_w.append(("bias" + str(num), bias_ih)) + num += 1 + for i in range(self.num_layers): + bias_hh = weight.bias_hh + flat_w.append(("bias" + str(num), bias_hh)) + num += 1 init_h = np.zeros((self.num_layers, batch_size, hidden_size)).astype(self.dtype) init_c = np.zeros((self.num_layers, batch_size, @@ -398,7 +449,7 @@ def setUp(self): self.inputs = { 'Input': input, - 'W': flat_w, + 'WeightList': flat_w, 'InitH': init_h, 'InitC': init_c, 'SequenceLength': self.sequence_length @@ -408,7 +459,7 @@ def setUp(self): 'is_bidirec': False, 'input_size': input_size, 'hidden_size': hidden_size, - 'num_layers': 1, + 'num_layers': self.num_layers, } self.outputs = { 'Out': output, @@ -428,16 +479,42 @@ def test_output_with_place(self): def test_grad_with_place(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, - set(['Input', 'W', 'InitH', 'InitC']), - ['Out', 'LastH', 'LastC']) + var_name_list = self.get_weight_names() + for var_name in var_name_list: + self.check_grad_with_place( + place, + set(['Input', var_name, 'InitH', 'InitC']), + ['Out', 'LastH', 'LastC']) @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestCUDNNLstmOp2(TestCUDNNLstmOp): - def set_attrs(self): - self.num_layers = 2 +class TestCUDNNlstmAPI(unittest.TestCase): + def test_lstm(self): + seq_len = 20 + batch_size = 5 + hidden_size = 20 + dropout_prob = 0.0 + num_layers = 1 + input = fluid.data( + name='input', + shape=[seq_len, batch_size, hidden_size], + dtype='float64') + init_h = layers.fill_constant([num_layers, batch_size, hidden_size], + 'float64', 0.0) + init_c = layers.fill_constant([num_layers, batch_size, hidden_size], + 'float64', 0.0) + rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len, + hidden_size, num_layers, + dropout_prob, False) + exe = fluid.Executor(fluid.CUDAPlace(0)) + exe.run(fluid.default_startup_program()) + input_i = np.random.uniform( + low=-0.1, high=0.1, size=(seq_len, batch_size, + hidden_size)).astype("float64") + out = exe.run(fluid.default_main_program(), + feed={'input': input_i}, + fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0']) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -448,7 +525,7 @@ def test_lstm(self): batch_size = 5 hidden_size = 20 dropout_prob = 0.0 - num_layers = 1 + num_layers = 2 input = fluid.data( name='input', shape=[seq_len, batch_size, hidden_size], From 16999ae49d4f12aaa7b56c519fd16e004fca2fc0 Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Mon, 12 Oct 2020 16:15:35 +0800 Subject: [PATCH 76/91] use IndexList to improve performance of instance_norm op (#25132) 
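Not part of this patch: for orientation, the InstanceNormKernel changed below views the input as (N*C, sample_size) and reduces over the sample dimension; the change only swaps runtime Eigen::DSizes for compile-time Eigen::IndexList where EIGEN_HAS_INDEX_LIST is available. A minimal NumPy sketch of the statistics involved; the function name and the eps default are illustrative, not taken from the patch:

.. code-block:: python

    import numpy as np

    def instance_norm_ref(x, eps=1e-5):
        # x: (N, C, H, W); mean and variance are taken per sample and per
        # channel, i.e. over the spatial dimensions only.
        mean = x.mean(axis=(2, 3), keepdims=True)
        var = x.var(axis=(2, 3), keepdims=True)
        return (x - mean) / np.sqrt(var + eps)

    x = np.random.rand(2, 3, 4, 4).astype('float32')
    print(instance_norm_ref(x).shape)  # (2, 3, 4, 4)
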
* use IndexList to improve performance, test=develop * remove EIGEN_HAS_INDEX_LIST, test=develop * use IndexList only when EIGEN_HAS_INDEX_LIST is true --- paddle/fluid/operators/instance_norm_op.cc | 33 ++++++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index 03279a9b2c15b..1018adcd930a4 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -181,10 +181,22 @@ class InstanceNormKernel auto &dev_ctx = ctx.template device_context(); auto *place = dev_ctx.eigen_device(); + Eigen::DSizes shape(NxC, sample_size); +// Once eigen on Windows is updated, the if branch can be removed. +#ifndef EIGEN_HAS_INDEX_LIST Eigen::DSizes bcast(1, sample_size); Eigen::DSizes C_shape(C, 1); Eigen::DSizes NxC_shape(NxC, 1); - Eigen::DSizes shape(NxC, sample_size); + Eigen::DSizes rdims(1); +#else + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); + Eigen::IndexList> rdims; +#endif math::SetConstant set_constant; @@ -201,8 +213,6 @@ class InstanceNormKernel auto x_e = framework::EigenVector::Flatten(*x); auto x_arr = x_e.reshape(shape); - Eigen::DSizes rdims(1); - saved_mean_e.device(*place) = x_arr.mean(rdims); auto saved_variance_arr = (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; @@ -316,14 +326,25 @@ class InstanceNormGradKernel auto &dev_ctx = ctx.template device_context(); auto *place = dev_ctx.eigen_device(); + Eigen::DSizes rshape(NxC, sample_size); + Eigen::DSizes param_shape(N, C); + Eigen::DSizes shape(NxC, sample_size); +#ifndef EIGEN_HAS_INDEX_LIST Eigen::DSizes rdims(0); Eigen::DSizes mean_rdims(1); - Eigen::DSizes rshape(NxC, sample_size); Eigen::DSizes bcast(1, sample_size); Eigen::DSizes C_shape(C, 1); Eigen::DSizes NxC_shape(NxC, 1); - Eigen::DSizes param_shape(N, C); - Eigen::DSizes shape(NxC, sample_size); +#else + Eigen::IndexList> rdims; + Eigen::IndexList> mean_rdims; + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); +#endif math::SetConstant set_constant; From e8a5aefbbd417441124989515662ea03093d56f1 Mon Sep 17 00:00:00 2001 From: yongqiangma Date: Mon, 12 Oct 2020 16:19:07 +0800 Subject: [PATCH 77/91] update CUDAPlace doc. test=document_fix (#27711) --- paddle/fluid/pybind/pybind.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 8c75db01dd221..0ee725c302215 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1315,9 +1315,6 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace", R"DOC( - **Note**: - For multi-card tasks, please use `FLAGS_selected_gpus` environment variable to set the visible GPU device. - The next version will fix the problem with `CUDA_VISIBLE_DEVICES` environment variable. CUDAPlace is a descriptor of a device. It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. @@ -1336,8 +1333,10 @@ All parameter, weight, gradient are variables in Paddle. Examples: .. 
code-block:: python - import paddle.fluid as fluid - gpu_place = fluid.CUDAPlace(0) + import paddle + + place = paddle.CUDAPlace(0) + paddle.disable_static(place) )DOC") .def("__init__", From 6335e6a0a6a892cce66164a782e0370f82b3a422 Mon Sep 17 00:00:00 2001 From: chen zhiyu Date: Mon, 12 Oct 2020 16:49:13 +0800 Subject: [PATCH 78/91] add musl option (#27798) --- CMakeLists.txt | 1 + cmake/configure.cmake | 10 ++++++++++ paddle/fluid/inference/CMakeLists.txt | 2 +- paddle/fluid/inference/check_symbol.sh | 2 +- paddle/fluid/platform/enforce.h | 11 ++++++++--- paddle/fluid/platform/macros.h | 2 ++ paddle/fluid/platform/port.h | 9 ++++----- python/paddle/fluid/core.py | 26 +++++++++++++++++--------- 8 files changed, 44 insertions(+), 19 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b1554fba5e1fa..fa87cc14f2668 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,7 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) # PY_VERSION if(NOT PY_VERSION) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index cf458d9770675..fc984f5e560ef 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -51,6 +51,16 @@ if(WIN32) endif(NOT MSVC) endif(WIN32) +if(WITH_MUSL) + add_definitions(-DPADDLE_WITH_MUSL) + + message(STATUS, "Set compile option WITH_MKL=OFF when WITH_MUSL=ON") + SET(WITH_MKL OFF) + + message(STATUS, "Set compile option WITH_GPU=OFF when WITH_MUSL=ON") + SET(WITH_GPU OFF) +endif() + if(WITH_PSLIB) add_definitions(-DPADDLE_WITH_PSLIB) endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index f85e1f6511656..6d35d3395ba60 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -88,7 +88,7 @@ if(NOT APPLE AND NOT WIN32) set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") # check symbol hidden FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake - "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" + "execute_process(COMMAND sh -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n" "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" " message(FATAL_ERROR \"Check symbol failed.\")\n" diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh index b6b7d1f20baf7..a0f64796576c8 100755 --- a/paddle/fluid/inference/check_symbol.sh +++ b/paddle/fluid/inference/check_symbol.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh lib=$1 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a3ae9e48eea30..165321d9c87ff 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -47,6 +47,10 @@ limitations under the License. 
*/ #include #include +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) +#include +#endif + #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" #include "paddle/fluid/platform/errors.h" @@ -236,13 +240,14 @@ inline std::string SimplifyDemangleStr(std::string str) { } inline std::string GetCurrentTraceBackString() { - static constexpr int TRACE_STACK_LIMIT = 100; std::ostringstream sout; sout << "\n\n--------------------------------------\n"; sout << "C++ Traceback (most recent call last):"; sout << "\n--------------------------------------\n"; -#if !defined(_WIN32) +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) + static constexpr int TRACE_STACK_LIMIT = 100; + void* call_stack[TRACE_STACK_LIMIT]; auto size = backtrace(call_stack, TRACE_STACK_LIMIT); auto symbols = backtrace_symbols(call_stack, size); @@ -261,7 +266,7 @@ inline std::string GetCurrentTraceBackString() { } free(symbols); #else - sout << "Windows not support stack backtrace yet.\n"; + sout << "Not support stack backtrace yet.\n"; #endif return sout.str(); } diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 32b7efc04c1f2..fb5cf9fb31915 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -25,6 +25,8 @@ limitations under the License. */ classname& operator=(classname&&) = delete #endif +#ifndef PADDLE_WITH_MUSL #if defined(__FLT_MAX__) #define FLT_MAX __FLT_MAX__ #endif // __FLT_MAX__ +#endif // PADDLE_WITH_MUSL diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index c1b81159aca97..c5e8ff807a2d3 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -14,19 +14,18 @@ #pragma once -#include -#include - #include + +#include #include +#include #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" #if !defined(_WIN32) -#include // dladdr -#include // backtrace +#include // dladdr #include #include #include // std::accumulate diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 9a14c4cdf14a4..ad116c2597064 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -205,8 +205,15 @@ def pre_load(dso_name): load_dso(dso_path) -def get_glibc_ver(): - return run_shell_command("ldd --version | awk '/ldd/{print $NF}'") +def get_libc_ver(): + ldd_glibc = run_shell_command("ldd --version | awk '/ldd/{print $NF}'") + if ldd_glibc is not None: + return ("glibc", ldd_glibc) + + ldd_musl = run_shell_command("ldd 2>&1 | awk '/Version/{print $NF}'") + if ldd_musl is not None: + return ("musl", ldd_musl) + return (None, None) def less_than_ver(a, b): @@ -231,13 +238,14 @@ def to_list(s): # For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs. # So, here is a tricky way to solve the problem by pre load 'libgomp' before 'core_avx.so'. # The final solution is to upgrade glibc to > 2.22 on the target system. 
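# Illustration only, not part of core.py: a minimal ctypes sketch of the preload
# trick described in the comments above. The real helper locates the shared
# object on the system before loading it; the soname used here is an assumption.
import ctypes

try:
    # Loading libgomp with RTLD_GLOBAL before core_avx keeps it within glibc's
    # limited static-TLS slots on glibc < 2.23 (see the note above).
    ctypes.CDLL('libgomp.so.1', mode=ctypes.RTLD_GLOBAL)
except OSError:
    # Mirrors the behaviour in this module: failing to preload is not fatal,
    # importing core_avx.so may still succeed.
    pass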
-if platform.system().lower() == 'linux' and less_than_ver(get_glibc_ver(), - '2.23'): - try: - pre_load('libgomp') - except Exception as e: - # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so - sys.stderr.write('Error: Can not preload libgomp.so') +if platform.system().lower() == 'linux': + libc_type, libc_ver = get_libc_ver() + if libc_type == 'glibc' and less_than_ver(libc_ver, '2.23'): + try: + pre_load('libgomp') + except Exception as e: + # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so + sys.stderr.write('Error: Can not preload libgomp.so') load_noavx = False From 55e63763ecc4a74fc7ab2001ff10aa198ee3a1f2 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Mon, 12 Oct 2020 11:22:23 +0200 Subject: [PATCH 79/91] [oneDNN] adaptive pool support (#27747) --- .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 3 ++ paddle/fluid/platform/mkldnn_reuse.h | 24 ++++++++++++++ .../unittests/mkldnn/test_pool2d_mkldnn_op.py | 33 +++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index bf12c61a4d9b1..72d2f779f800b 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -126,6 +126,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, data_dims, strides, ksize); + platform::PoolingMKLDNNHandler::ComputeAdaptivePoolParameters( + ctx, paddle::framework::vectorize(in_x->dims()), ksize, strides); + auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index d1c5480c0f543..785627a09fb27 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -853,6 +853,9 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerTAcquireForwardPrimitiveDescriptor( is_test ? 
mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training, @@ -919,6 +922,27 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT& src_tz, std::vector& ksize, + std::vector& strides) { + if (ctx.Attr("adaptive")) { + // (jczaja): oneDNN is supporting only unchangable in size pool window + PADDLE_ENFORCE_EQ( + src_tz[src_tz.size() - 1] % ksize[1], 0, + platform::errors::Unimplemented( + "Input dim must be divisible by corressponding ksize dim.")); + PADDLE_ENFORCE_EQ( + src_tz[src_tz.size() - 2] % ksize[0], 0, + platform::errors::Unimplemented( + "Input dim must be divisible by corressponding ksize dim.")); + ksize[0] = src_tz[src_tz.size() - 2] / ksize[0]; + ksize[1] = src_tz[src_tz.size() - 1] / ksize[1]; + strides[0] = ksize[0]; + strides[1] = ksize[1]; + } + } + private: static inline int ComputeCeiledOutput(int input_size, int kernel_size, int padding, int stride) { diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py index ee917b059b87c..467bac67051dd 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py @@ -61,6 +61,37 @@ def init_data_type(self): create_test_mkldnn_class(TestCase5) +class TestAvgPoolAdaptive(TestPool2D_Op): + def init_adaptive(self): + self.adaptive = True + + def init_pool_type(self): + self.pool_type = "avg" + self.pool2D_forward_naive = avg_pool2D_forward_naive + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_test_case(self): + self.ksize = [1, 1] + self.strides = [1, 1] + + def init_data_type(self): + self.dtype = np.float32 + + def init_global_pool(self): + self.global_pool = False + + +class TestAvgPoolAdaptive2(TestAvgPoolAdaptive): + def init_test_case(self): + self.ksize = [2, 3] + self.strides = [1, 1] + + def init_shape(self): + self.shape = [2, 3, 6, 6] + + class TestAsymPad(TestPool2D_Op): def init_test_case(self): self.ksize = [3, 3] @@ -160,4 +191,6 @@ def init_shape(self): if __name__ == '__main__': + from paddle import enable_static + enable_static() unittest.main() From 2bcb7c0a2f7212146baebc7f7eb11ff9bd2f33e0 Mon Sep 17 00:00:00 2001 From: joejiong Date: Mon, 12 Oct 2020 17:31:17 +0800 Subject: [PATCH 80/91] Mutiply allows non-tensor data input (#27690) Mutiply allows non-tensor data input --- .../fluid/tests/unittests/test_multiply.py | 75 +++++++++++++++++++ python/paddle/tensor/math.py | 12 +++ 2 files changed, 87 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py index dbf167617a24f..abd0c15dc7235 100755 --- a/python/paddle/fluid/tests/unittests/test_multiply.py +++ b/python/paddle/fluid/tests/unittests/test_multiply.py @@ -26,6 +26,7 @@ class TestMultiplyAPI(unittest.TestCase): def __run_static_graph_case(self, x_data, y_data, axis=-1): with program_guard(Program(), Program()): + paddle.enable_static() x = paddle.static.data( name='x', shape=x_data.shape, dtype=x_data.dtype) y = paddle.static.data( @@ -42,6 +43,21 @@ def __run_static_graph_case(self, x_data, y_data, axis=-1): res = outs[0] return res + def __run_static_graph_case_with_numpy_input(self, x_data, y_data, axis=-1): + with program_guard(Program(), Program()): + paddle.enable_static() + + res = tensor.multiply(x_data, y_data, axis=axis) + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + outs = 
exe.run(fluid.default_main_program(), + feed={'x': x_data, + 'y': y_data}, + fetch_list=[res]) + res = outs[0] + return res + def __run_dynamic_graph_case(self, x_data, y_data, axis=-1): paddle.disable_static() x = paddle.to_tensor(x_data) @@ -49,27 +65,52 @@ def __run_dynamic_graph_case(self, x_data, y_data, axis=-1): res = paddle.multiply(x, y, axis=axis) return res.numpy() + def __run_dynamic_graph_case_with_numpy_input(self, x_data, y_data, + axis=-1): + paddle.disable_static() + res = paddle.multiply(x_data, y_data, axis=axis) + return res.numpy() + def test_multiply(self): """test_multiply.""" np.random.seed(7) + # test static computation graph: 1-d array x_data = np.random.rand(200) y_data = np.random.rand(200) res = self.__run_static_graph_case(x_data, y_data) self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test static computation graph: 1-d array + x_data = np.random.rand(200) + y_data = np.random.rand(200) + res = self.__run_static_graph_case_with_numpy_input(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test static computation graph: 2-d array x_data = np.random.rand(2, 500) y_data = np.random.rand(2, 500) res = self.__run_static_graph_case(x_data, y_data) self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test static computation graph with_primitives: 2-d array + x_data = np.random.rand(2, 500) + y_data = np.random.rand(2, 500) + res = self.__run_static_graph_case_with_numpy_input(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test static computation graph: broadcast x_data = np.random.rand(2, 500) y_data = np.random.rand(500) res = self.__run_static_graph_case(x_data, y_data) self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test static computation graph with_primitives: broadcast + x_data = np.random.rand(2, 500) + y_data = np.random.rand(500) + res = self.__run_static_graph_case_with_numpy_input(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test static computation graph: broadcast with axis x_data = np.random.rand(2, 300, 40) y_data = np.random.rand(300) @@ -77,24 +118,50 @@ def test_multiply(self): expected = np.multiply(x_data, y_data[..., np.newaxis]) self.assertTrue(np.allclose(res, expected)) + # test static computation graph with_primitives: broadcast with axis + x_data = np.random.rand(2, 300, 40) + y_data = np.random.rand(300) + res = self.__run_static_graph_case_with_numpy_input( + x_data, y_data, axis=1) + expected = np.multiply(x_data, y_data[..., np.newaxis]) + self.assertTrue(np.allclose(res, expected)) + # test dynamic computation graph: 1-d array x_data = np.random.rand(200) y_data = np.random.rand(200) res = self.__run_dynamic_graph_case(x_data, y_data) self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test dynamic numpy input computation graph: 1-d array + x_data = np.random.rand(200) + y_data = np.random.rand(200) + res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test dynamic computation graph: 2-d array x_data = np.random.rand(20, 50) y_data = np.random.rand(20, 50) res = self.__run_dynamic_graph_case(x_data, y_data) self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test dynamic numpy input computation graph: 1-d array + x_data = np.random.rand(20, 50) + y_data = np.random.rand(20, 50) + res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data) + 
self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test dynamic computation graph: broadcast x_data = np.random.rand(2, 500) y_data = np.random.rand(500) res = self.__run_dynamic_graph_case(x_data, y_data) self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test dynamic computation graph with numpy tensor: broadcast + x_data = np.random.rand(2, 500) + y_data = np.random.rand(500) + res = self.__run_dynamic_graph_case_with_numpy_input(x_data, y_data) + self.assertTrue(np.allclose(res, np.multiply(x_data, y_data))) + # test dynamic computation graph: broadcast with axis x_data = np.random.rand(2, 300, 40) y_data = np.random.rand(300) @@ -102,6 +169,14 @@ def test_multiply(self): expected = np.multiply(x_data, y_data[..., np.newaxis]) self.assertTrue(np.allclose(res, expected)) + # test dynamic computation graph with numpy tensor: broadcast with axis + x_data = np.random.rand(2, 300, 40) + y_data = np.random.rand(300) + res = self.__run_dynamic_graph_case_with_numpy_input( + x_data, y_data, axis=1) + expected = np.multiply(x_data, y_data[..., np.newaxis]) + self.assertTrue(np.allclose(res, expected)) + class TestMultiplyError(unittest.TestCase): """TestMultiplyError.""" diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 2a370422eed7e..138841fcf074b 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -472,15 +472,27 @@ def multiply(x, y, axis=-1, name=None): """ op_type = 'elementwise_mul' act = None + if x.dtype != y.dtype: raise TypeError( 'Input tensors must be same type, but received type of x: %s, type of y: %s ' % (x.dtype, y.dtype)) if in_dygraph_mode(): + if not isinstance(x, (paddle.Tensor)): + x = paddle.to_tensor(x) + if not isinstance(y, (paddle.Tensor)): + y = paddle.to_tensor(y) return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) + if not isinstance(x, (paddle.Tensor, Variable)): + x = paddle.static.data( + name='x', shape=x.shape, dtype=x.dtype) + if not isinstance(y, (paddle.Tensor, Variable)): + y = paddle.static.data( + name='y', shape=y.shape, dtype=y.dtype) + return _elementwise_op(LayerHelper(op_type, **locals())) def maximum(x, y, axis=-1, name=None): From 9005c5a2604b43589f6c26560f44364d611cdb54 Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 12 Oct 2020 17:32:28 +0800 Subject: [PATCH 81/91] Lite subgraph support arm cpu. (#27827) --- cmake/external/lite.cmake | 92 ++++++++++++++----- cmake/inference_lib.cmake | 2 +- .../analysis/ir_passes/lite_subgraph_pass.cc | 4 + paddle/fluid/inference/lite/tensor_utils.cc | 1 + paddle/fluid/pybind/inference_api.cc | 2 +- 5 files changed, 75 insertions(+), 26 deletions(-) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 3da550519bae2..1da47bba7b6a5 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -if(NOT LINUX OR NOT WITH_MKL) - message("Paddle-lite will not build because the required Linux and MKL do not exist.") +if(NOT LINUX) + message("Paddle-lite will not build because the required Linux do not exist.") set(WITH_LITE OFF) return() endif() @@ -42,30 +42,30 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) endif() # No quotes, so cmake can resolve it as a command with arguments. 
- set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) - set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON - -DLITE_WITH_CUDA=${WITH_GPU} - -DWITH_MKLDNN=OFF - -DLITE_WITH_X86=ON - -DLITE_WITH_PROFILE=OFF - -DWITH_LITE=OFF - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF - -DWITH_PYTHON=OFF - -DWITH_TESTING=OFF - -DLITE_BUILD_EXTRA=ON - -DCUDNN_ROOT=${CUDNN_ROOT} - -DLITE_WITH_STATIC_CUDA=OFF - -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} - -DLITE_WITH_XPU=${LITE_WITH_XPU} - -DXPU_SDK_ROOT=${XPU_SDK_ROOT} - -DLITE_WITH_ARM=OFF) - - ExternalProject_Add( + if(WITH_ARM) + set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) + message(WARNING "BUILD_COMMAND: ${LITE_BUILD_COMMAND}") + set(LITE_OPTIONAL_ARGS -DWITH_MKL=OFF + -DLITE_WITH_CUDA=OFF + -DWITH_MKLDNN=OFF + -DLITE_WITH_X86=OFF + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON + -DLITE_WITH_PROFILE=OFF + -DARM_TARGET_OS=armlinux + -DWITH_LITE=ON + -DWITH_PYTHON=OFF + -DWITH_TESTING=OFF + -DLITE_BUILD_EXTRA=ON + -DLITE_WITH_XPU=${LITE_WITH_XPU} + -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + -DLITE_WITH_ARM=ON) + ExternalProject_Add( ${LITE_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git" GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_SOURCES_DIR} + PATCH_COMMAND mkdir -p ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc UPDATE_COMMAND "" BUILD_COMMAND ${LITE_BUILD_COMMAND} INSTALL_COMMAND "" @@ -81,7 +81,51 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} ${LITE_OPTIONAL_ARGS} - ) + ) + set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8) + else() + set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) + set(LITE_OUTPUT_BIN_DIR inference_lite_lib) + set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON + -DLITE_WITH_CUDA=${WITH_GPU} + -DWITH_MKLDNN=OFF + -DLITE_WITH_X86=ON + -DLITE_WITH_PROFILE=OFF + -DWITH_LITE=OFF + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF + -DWITH_PYTHON=OFF + -DWITH_TESTING=OFF + -DLITE_BUILD_EXTRA=ON + -DCUDNN_ROOT=${CUDNN_ROOT} + -DLITE_WITH_STATIC_CUDA=OFF + -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} + -DLITE_WITH_XPU=${LITE_WITH_XPU} + -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + -DLITE_WITH_ARM=OFF) + + ExternalProject_Add( + ${LITE_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git" + GIT_TAG ${LITE_GIT_TAG} + PREFIX ${LITE_SOURCES_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${LITE_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${LITE_OPTIONAL_ARGS} + ) + endif() ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR) ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR) set(LITE_BINARY_DIR ${BINARY_DIR}) @@ -103,8 +147,8 @@ function(external_lite_libs alias path) endif() endfunction() -external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) -set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so) +external_lite_libs(lite_full_static 
${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) +set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so) add_definitions(-DPADDLE_WITH_LITE) add_definitions(-DLITE_WITH_LOG) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index f4603051a0e7e..d5ef6d85b578f 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -131,7 +131,7 @@ function(copy_part_of_thrid_party TARGET DST) if (LITE_BINARY_DIR) set(dst_dir "${DST}/third_party/install/lite") copy(${TARGET} - SRCS ${LITE_BINARY_DIR}/inference_lite_lib/* + SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/* DSTS ${dst_dir}) endif() endfunction() diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index e78d5ef017b7f..2c454893a6203 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -252,7 +252,11 @@ void LiteSubgraphPass::SetUpEngine( } else if (use_xpu) { target_type = TARGET(kXPU); } else { +#ifdef PADDLE_WITH_ARM + target_type = TARGET(kARM); +#else target_type = TARGET(kX86); +#endif } paddle::lite_api::PrecisionType precision_type = diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 33661594b926f..7b909b3f84205 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -46,6 +46,7 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) { switch (type) { case TargetType::kHost: case TargetType::kX86: + case TargetType::kARM: return platform::CPUPlace(); case TargetType::kCUDA: return platform::CUDAPlace(id); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index be4d90597e1e1..c8e5048421cca 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -481,8 +481,8 @@ void BindAnalysisConfig(py::module *m) { py::arg("disable_trt_plugin_fp16") = false) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, - py::arg("zero_copy") = false, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, + py::arg("zero_copy") = false, py::arg("passes_filter") = std::vector(), py::arg("ops_filter") = std::vector()) .def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled) From e96fc6abb209081579206a9f628ff13833791285 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 12 Oct 2020 18:46:20 +0800 Subject: [PATCH 82/91] Fix/embedding doc (#27816) * fix fluid doc * fix fluid doc * fix fluid doc, test=document_fix * fix fluid doc, test=document_fix --- python/paddle/fluid/input.py | 104 +++++++++++++++++++++++++++++------ 1 file changed, 88 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/input.py b/python/paddle/fluid/input.py index 529588c0846b5..0e3ee46fa46d1 100644 --- a/python/paddle/fluid/input.py +++ b/python/paddle/fluid/input.py @@ -220,24 +220,96 @@ def embedding(input, Returns: Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` . - Examples: + Static Examples: + .. 
code-block:: python + + import paddle + import numpy as np + paddle.enable_static() + + x = paddle.static.data(name="x", shape = [2, 4], dtype=np.int64) + embedding = paddle.nn.Embedding(10, 3, + weight_attr=paddle.nn.initializer.Constant(value=1.0)) + adam = paddle.optimizer.SGD(parameters=[embedding.weight], learning_rate=0.01) + output = embedding(x) + m_output=paddle.mean(output) + + adam.minimize(m_output) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + x = np.array([[7, 2, 4, 5],[4, 3, 2, 9]], dtype=np.int64) + + # x is a Numpy. + # x.data = [[7, 2, 4, 5], [4, 3, 2, 9]] + # x.shape = [2, 4] + + out, = exe.run(paddle.static.default_main_program(), feed={'x':x}, fetch_list=[output]) + + # out is a Numpy. + # out.data = [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]], + # + # [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.], + # [0., 0., 0.]]] + # out.shape = [2, 4, 3] + + + Dygraph Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np - data = fluid.data(name='x', shape=[None, 10], dtype='int64') - - # example 1 - emb_1 = fluid.embedding(input=data, size=[128, 64]) - - # example 2: load custom or pre-trained word vectors - weight_data = np.random.random(size=(128, 100)) # word vectors with numpy format - w_param_attrs = fluid.ParamAttr( - name="emb_weight", - learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer(weight_data), - trainable=True) - emb_2 = fluid.embedding(input=data, size=(128, 100), param_attr=w_param_attrs, dtype='float32') + import paddle + import numpy as np + + paddle.disable_static() + + x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) + + # x is a Tensor. + # x.data = [[3], [4], [5]] + # x.shape = [3, 1] + x = paddle.to_tensor(x_data, stop_gradient=False) + + # embedding weight shape = [10, 3] + embedding = paddle.nn.Embedding(10, 3, sparse=True) + + # embedding weight data = [10, 3] + w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32) + + # embedding.weight.shape = [10, 3] + # embedding.weight.data = + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]] + embedding.weight.set_value(w0) + + adam = paddle.optimizer.Adam( + parameters=[embedding.weight], learning_rate=0.01) + adam.clear_grad() + + # out is Tensor + # out.shape: [3, 1, 3] + # out.layout: NCHW + # out.dtype: float + # out.data: [2 2 2 2 2 2 2 2 2] + out = embedding(x) + + out.backward() + adam.step() + """ helper = LayerHelper('embedding', **locals()) From 84d8e49de82cd26362d356b6ac523c9f6c44e08d Mon Sep 17 00:00:00 2001 From: MRXLT Date: Mon, 12 Oct 2020 18:54:20 +0800 Subject: [PATCH 83/91] refine adam/strided_slice && fix doc for rmsprop/unstack (#27740) * refine parameters order && doc * update rmsprop doc * refine adam/transpose/unstack/stride_slice * fix bug && doc * fix doc * bug fix * bug fix * fix doc * fix doc * fix doc * fix doc * depercate old strided_slice * update doc * set default value for name * update doc --- python/paddle/fluid/layers/nn.py | 10 ++- .../tests/unittests/test_strided_slice_op.py | 13 +++ .../incubate/complex/tensor/manipulation.py | 15 ++-- python/paddle/optimizer/adam.py | 38 ++++---- python/paddle/optimizer/adamw.py | 27 +++--- python/paddle/optimizer/rmsprop.py | 24 ++--- python/paddle/tensor/manipulation.py | 87 ++++++++++++++++++- 7 files changed, 147 insertions(+), 67 
deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8cb0404c18cad..a6402a2852c2a 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10241,9 +10241,9 @@ def unstack(x, axis=0, num=None): Examples: .. code-block:: python - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[2, 3, 5], dtype='float32') # create a tensor with shape=[2, 3, 5] - y = fluid.layers.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5] + import paddle + x = paddle.ones(name='x', shape=[2, 3, 5], dtype='float32') # create a tensor with shape=[2, 3, 5] + y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5] """ helper = LayerHelper('unstack', **locals()) @@ -11017,7 +11017,7 @@ def slice(input, axes, starts, ends): return out -@templatedoc() +@deprecated(since='2.0.0', update_to="paddle.strided_slice") def strided_slice(input, axes, starts, ends, strides): """ :alias_main: paddle.strided_slice @@ -11095,7 +11095,9 @@ def strided_slice(input, axes, starts, ends, strides): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() input = fluid.data( name="input", shape=[3, 4, 5, 6], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index 37f11c449d21f..0fe6cd5e7e753 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -16,6 +16,9 @@ import numpy as np import unittest import paddle.fluid as fluid +import paddle + +paddle.enable_static() def strided_slice_native_forward(input, axes, starts, ends, strides): @@ -498,6 +501,16 @@ def test_1(self): assert np.array_equal(res_6, input[-3:3, 0:100:2, :, -1:2:-1]) assert np.array_equal(res_7, input[-1, 0:100:2, :, -1:2:-1]) + def test_dygraph_op(self): + x = paddle.zeros(shape=[3, 4, 5, 6], dtype="float32") + axes = [1, 2, 3] + starts = [-3, 0, 2] + ends = [3, 2, 4] + strides_1 = [1, 1, 1] + sliced_1 = paddle.strided_slice( + x, axes=axes, starts=starts, ends=ends, strides=strides_1) + assert sliced_1.shape == (3, 2, 2, 2) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/incubate/complex/tensor/manipulation.py b/python/paddle/incubate/complex/tensor/manipulation.py index 7852260a31e3c..d1e0cbed82e99 100644 --- a/python/paddle/incubate/complex/tensor/manipulation.py +++ b/python/paddle/incubate/complex/tensor/manipulation.py @@ -128,16 +128,13 @@ def transpose(x, perm, name=None): .. 
code-block:: python import paddle - import numpy as np - import paddle.fluid.dygraph as dg - with dg.guard(): - a = np.array([[1.0 + 1.0j, 2.0 + 1.0j], [3.0+1.0j, 4.0+1.0j]]) - x = dg.to_variable(a) - y = paddle.complex.transpose(x, [1, 0]) - print(y.numpy()) - # [[1.+1.j 3.+1.j] - # [2.+1.j 4.+1.j]] + x = paddle.to_tensor([[1.0 + 1.0j, 2.0 + 1.0j], [3.0+1.0j, 4.0+1.0j], [5.0+1.0j, 6.0+1.0j]]) + x_transposed = paddle.complex.transpose(x, [1, 0]) + print(x_transposed.numpy()) + #[[1.+1.j 3.+1.j 5.+1.j] + # [2.+1.j 4.+1.j 6.+1.j]] + """ complex_variable_exists([x], "transpose") real = layers.transpose(x.real, perm, name) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 9cbb45ce60d14..366d8b953e3d4 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -29,7 +29,7 @@ class Adam(Optimizer): of section 2 of `Adam paper `_ , it can dynamically adjusts the learning rate of each parameter using the 1st moment estimates and the 2nd moment estimates of the gradient. - + The parameter ``param_out`` update rule with gradient ``grad``: .. math:: @@ -68,13 +68,10 @@ class Adam(Optimizer): the regularization setting here in optimizer will be ignored for this parameter. \ Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. The accumulators are updated at every step. Every element of the two moving-average is updated in both dense mode and sparse mode. If the size of parameter is very large, @@ -82,17 +79,17 @@ class Adam(Optimizer): gradient in current mini-batch, so it will be much more faster. But this mode has different semantics with the original Adam algorithm and may lead to different result. The default value is False. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. Examples: .. 
code-block:: python import paddle - import numpy as np - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) + inp = paddle.rand([10,10], dtype="float32") out = linear(inp) loss = paddle.mean(out) adam = paddle.optimizer.Adam(learning_rate=0.1, @@ -105,12 +102,9 @@ class Adam(Optimizer): # Adam with beta1/beta2 as Tensor and weight_decay as float import paddle - import numpy as np - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) + inp = paddle.rand([10,10], dtype="float32") out = linear(inp) loss = paddle.mean(out) @@ -140,8 +134,8 @@ def __init__(self, parameters=None, weight_decay=None, grad_clip=None, - name=None, - lazy_mode=False): + lazy_mode=False, + name=None): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -258,7 +252,7 @@ def _append_optimize_op(self, block, param_and_grad): def step(self): """ Execute the optimizer and update parameters once. - + Returns: None @@ -266,13 +260,11 @@ def step(self): .. code-block:: python import paddle - import numpy as np - paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) + + a = paddle.rand([2,13], dtype="float32") linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) out = linear(a) out.backward() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0b04f03eb14da..00c197a58b3dd 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -23,7 +23,7 @@ class AdamW(Adam): """ - The AdamW optimizer is implemented based on the AdamW Optimization + The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. it can resolves the problem of L2 regularization failure in the Adam optimizer. @@ -32,7 +32,7 @@ class AdamW(Adam): t & = t + 1 moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad - + moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad learning\_rate & = learning\_rate * \\ @@ -57,16 +57,13 @@ class AdamW(Adam): The default value is 1e-08. weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. apply_decay_param_fun (function|None, optional): If it is not None, - only tensors that makes apply_decay_param_fun(Tensor)==True + only tensors that makes apply_decay_param_fun(Tensor)==True will be updated. It only works when we want to specify tensors. Default: None. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. 
- name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. The accumulators are updated at every step. Every element of the two moving-average is updated in both dense mode and sparse mode. If the size of parameter is very large, @@ -74,18 +71,18 @@ class AdamW(Adam): gradient in current mini-batch, so it will be much more faster. But this mode has different semantics with the original Adam algorithm and may lead to different result. The default value is False. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. **Notes**: **Currently, AdamW doesn't support sparse parameter optimization.** Examples: .. code-block:: python import paddle - import numpy as np - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) + inp = paddle.rand([10,10], dtype="float32") out = linear(inp) loss = paddle.mean(out) @@ -112,8 +109,8 @@ def __init__(self, weight_decay=0.01, apply_decay_param_fun=None, grad_clip=None, - name=None, - lazy_mode=False): + lazy_mode=False, + name=None): assert learning_rate is not None assert beta1 is not None assert beta2 is not None diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 2609972d85ccd..5e17ca34ff218 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -90,9 +90,9 @@ class RMSProp(Optimizer): the regularization setting here in optimizer will be ignored for this parameter. \ Otherwise, the regularization setting here in optimizer will take effect. \ Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. @@ -104,24 +104,18 @@ class RMSProp(Optimizer): .. 
code-block:: python import paddle - import numpy as np - paddle.disable_static() - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + inp = paddle.rand([10,10], dtype="float32") linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) - beta1 = paddle.to_tensor([0.9], dtype="float32") - beta2 = paddle.to_tensor([0.99], dtype="float32") - - adam = paddle.optimizer.RMSProp(learning_rate=0.1, - parameters=linear.parameters(), - weight_decay=0.01) + rmsprop = paddle.optimizer.RMSProp(learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=0.01) out.backward() - adam.step() - adam.clear_grad() + rmsprop.step() + rmsprop.clear_grad() """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 86bf9b31f9a9e..531629c573fb6 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -25,7 +25,6 @@ # TODO: define functions to manipulate a tensor from ..fluid.layers import cast #DEFINE_ALIAS from ..fluid.layers import slice #DEFINE_ALIAS -from ..fluid.layers import strided_slice #DEFINE_ALIAS from ..fluid.layers import transpose #DEFINE_ALIAS from ..fluid.layers import unstack #DEFINE_ALIAS @@ -1461,3 +1460,89 @@ def gather_nd(x, index, name=None): """ return paddle.fluid.layers.gather_nd(input=x, index=index, name=name) + + +def strided_slice(x, axes, starts, ends, strides, name=None): + """ + This operator produces a slice of ``x`` along multiple axes. Similar to numpy: + https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html + Slice uses ``axes``, ``starts`` and ``ends`` attributes to specify the start and + end dimension for each axis in the list of axes and Slice uses this information + to slice the input data tensor. If a negative value is passed to + ``starts`` or ``ends`` such as :math:`-i`, it represents the reverse position of the + axis :math:`i-1` th(here 0 is the initial position). The ``strides`` represents steps of + slicing and if the ``strides`` is negative, slice operation is in the opposite direction. + If the value passed to ``starts`` or ``ends`` is greater than n + (the number of elements in this dimension), it represents n. + For slicing to the end of a dimension with unknown size, it is recommended + to pass in INT_MAX. The size of ``axes`` must be equal to ``starts`` , ``ends`` and ``strides``. + Following examples will explain how strided_slice works: + + .. code-block:: text + + Case1: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [1, 0] + ends = [2, 3] + strides = [1, 1] + Then: + result = [ [5, 6, 7], ] + + Case2: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [0, 1] + ends = [2, 0] + strides = [1, -1] + Then: + result = [ [8, 7, 6], ] + Case3: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [0, 1] + ends = [-1, 1000] + strides = [1, 3] + Then: + result = [ [2], ] + Args: + x (Tensor): An N-D ``Tensor``. The data type is ``float32``, ``float64``, ``int32`` or ``int64``. + axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to. + It's optional. If it is not provides, it will be treated as :math:`[0,1,...,len(starts)-1]`. + starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``starts`` is an Tensor, it should be an 1-D Tensor. It represents starting indices of corresponding axis in ``axes``. 
+ ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of + it should be integers or Tensors with shape [1]. If ``ends`` is an Tensor, it should be an 1-D Tensor . It represents ending indices of corresponding axis in ``axes``. + strides (list|tuple|Tensor): The data type is ``int32`` . If ``strides`` is a list or tuple, the elements of + it should be integers or Tensors with shape [1]. If ``strides`` is an Tensor, it should be an 1-D Tensor . It represents slice step of corresponding axis in ``axes``. + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor: A ``Tensor`` with the same dimension as ``x``. The data type is same as ``x``. + + Examples: + .. code-block:: python + + import paddle + x = paddle.zeros(shape=[3,4,5,6], dtype="float32") + # example 1: + # attr starts is a list which doesn't contain Tensor. + axes = [1, 2, 3] + starts = [-3, 0, 2] + ends = [3, 2, 4] + strides_1 = [1, 1, 1] + strides_2 = [1, 1, 2] + sliced_1 = paddle.strided_slice(x, axes=axes, starts=starts, ends=ends, strides=strides_1) + # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1]. + # example 2: + # attr starts is a list which contain tensor Tensor. + minus_3 = paddle.fill_constant([1], "int32", -3) + sliced_2 = paddle.strided_slice(x, axes=axes, starts=[minus_3, 0, 2], ends=ends, strides=strides_2) + # sliced_2 is x[:, 1:3:1, 0:2:1, 2:4:2]. + """ + + return paddle.fluid.layers.strided_slice( + input=x, axes=axes, starts=starts, ends=ends, strides=strides) From 8fa4c0988991b2e23095f10eed805a4c83cf4b0d Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 12 Oct 2020 19:43:30 +0800 Subject: [PATCH 84/91] add load_op_xpu for Baidu Kunlun (#27817) * add load_op_xpu for Baidu Kunlun, test=kunlun * add is_compiled_with_xpu for unit test, test=kunlun * add is_compiled_with_xpu for unit test, test=kunlun --- paddle/fluid/operators/load_op_xpu.cc | 28 +++++++++ .../fluid/tests/unittests/test_load_op_xpu.py | 63 +++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 paddle/fluid/operators/load_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/test_load_op_xpu.py diff --git a/paddle/fluid/operators/load_op_xpu.cc b/paddle/fluid/operators/load_op_xpu.cc new file mode 100644 index 0000000000000..e56586552e498 --- /dev/null +++ b/paddle/fluid/operators/load_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/load_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + load, ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); + +#endif // PADDLE_WITH_XPU diff --git a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py new file mode 100644 index 0000000000000..1d7f986507ca9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest, randomize_probability +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestLoadOpXpu(unittest.TestCase): + """ Test load operator. + """ + + def setUp(self): + self.ones = np.ones((4, 4)).astype('float32') + main_prog = fluid.Program() + start_prog = fluid.Program() + with fluid.program_guard(main_prog, start_prog): + input = fluid.data('input', shape=[-1, 4], dtype='float32') + output = layers.fc( + input, + 4, + param_attr=fluid.ParamAttr( + name='w', + initializer=fluid.initializer.NumpyArrayInitializer( + self.ones))) + exe = fluid.Executor(fluid.XPUPlace(0)) + exe.run(start_prog) + fluid.io.save_persistables( + exe, dirname="/tmp/model", main_program=main_prog) + + def test_load_xpu(self): + main_prog = fluid.Program() + start_prog = fluid.Program() + with fluid.program_guard(main_prog, start_prog): + var = layers.create_tensor(dtype='float32') + layers.load(var, file_path='/tmp/model/w') + + exe = fluid.Executor(fluid.XPUPlace(0)) + exe.run(start_prog) + ret = exe.run(main_prog, fetch_list=[var.name]) + self.assertTrue(np.array_equal(self.ones, ret[0])) + + +if __name__ == "__main__": + unittest.main() From 2e1bca99cade4129cca043289d93a3cf77384200 Mon Sep 17 00:00:00 2001 From: guofei <52460041+gfwm2013@users.noreply.github.com> Date: Mon, 12 Oct 2020 19:59:10 +0800 Subject: [PATCH 85/91] Refine the gradient calculation errors caused by renaming in while_grad (#27814) test=develop --- .../fluid/operators/controlflow/while_op.cc | 70 ++++++++++++++++--- .../operators/controlflow/while_op_helper.cc | 11 +++ .../operators/controlflow/while_op_helper.h | 4 ++ .../tests/unittests/test_while_loop_op.py | 29 ++++---- .../fluid/tests/unittests/test_while_op.py | 2 + 5 files changed, 93 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index b85e740ada9bd..b8ecbe8ab4a9f 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
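For orientation, the gradient pattern that this while_grad fix is meant to handle can be reproduced from Python with a minimal static-graph sketch like the one below. It mirrors the updated unit test later in this patch; the feeds, the [1] shapes, and the CPUPlace executor are illustrative choices and not part of the change itself.

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.backward import append_backward

paddle.enable_static()

def cond(i, x):
    return i < 3

def body(i, x):
    # x is both an input of the while block and the target of its own update,
    # which is the renaming situation this fix addresses
    x = x * i
    i = i + 1
    return [i, x]

main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
    i = fluid.data(name='i', shape=[1], dtype='float32')
    i.stop_gradient = False
    x = fluid.data(name='x', shape=[1], dtype='float32')
    x.stop_gradient = False
    out_i, out_x = fluid.layers.while_loop(cond, body, [i, x])
    mean = paddle.mean(out_x)
    append_backward(mean)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_program)
res = exe.run(main_program,
              feed={'i': np.ones(1).astype('float32'),
                    'x': np.ones(1).astype('float32')},
              fetch_list=[mean.name, i.grad_name, x.grad_name])
# with i = x = 1 the loop runs for i = 1 and i = 2, so x ends at 2,
# d(mean)/dx = 2 and d(mean)/di = 3, which are the values the updated test asserts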
+#include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -70,6 +72,23 @@ class WhileOp : public framework::OperatorBase { auto *block = Attr(kStepBlock); auto *program = block->Program(); + bool is_test = Attr("is_test"); + + std::set no_copy_var_names; + if (!is_test) { + const std::vector &all_ops = block->AllOps(); + for (const framework::OpDesc *op : all_ops) { + const framework::VariableNameMap &input_var_names = op->Inputs(); + const framework::VariableNameMap &output_var_names = op->Outputs(); + for (auto &ipt : input_var_names) { + for (const std::string &var_name : ipt.second) { + if (StrInVaraiableNameMap(var_name, output_var_names)) { + no_copy_var_names.insert(var_name); + } + } + } + } + } auto step_scopes = scope.FindVar(Output(kStepScopes))->GetMutable(); @@ -89,7 +108,6 @@ class WhileOp : public framework::OperatorBase { "The Output(StepScope) of WhileOp should be empty.")); bool cond_data = GetCondData(cond); - bool is_test = Attr("is_test"); auto &skip_vars = Attr>(kSkipEagerDeletionVars); VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); @@ -98,8 +116,32 @@ class WhileOp : public framework::OperatorBase { while (cond_data) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); + + std::vector rename_vars; + for (const std::string &input_var_name : Inputs(kX)) { + if (no_copy_var_names.find(input_var_name) == + no_copy_var_names.end()) { + std::string input_var_rename = input_var_name + kSuffix; + framework::Variable *input_var = scope.FindVar(input_var_name); + if (input_var->IsType()) { + rename_vars.push_back(input_var_rename); + auto input_var_tensor = input_var->Get(); + auto *rename_input_var_tensor = + current_scope.Var(input_var_rename)->GetMutable(); + framework::TensorCopy(input_var_tensor, dev_place, + rename_input_var_tensor); + rename_input_var_tensor->set_lod(input_var_tensor.lod()); + } + } + } executor.RunPreparedContext(ctx.get(), ¤t_scope, false, true, true); + + for (auto &var_rename : rename_vars) { + std::string input_var_name = + var_rename.substr(0, var_rename.size() - strlen(kSuffix)); + current_scope.Rename(var_rename, input_var_name); + } cond_data = GetCondData(scope.FindVar(Input(kCondition))->Get()); } @@ -312,6 +354,10 @@ class WhileGradOp : public framework::OperatorBase { // continue; // } + auto var_iter = + std::find(outside_og_names.begin(), outside_og_names.end(), + pg_ig_names[param_id]); + // zero gradient variable in step 0 if (cur_scope_iter == step_scopes->rbegin()) { auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); @@ -326,7 +372,8 @@ class WhileGradOp : public framework::OperatorBase { "or LoDTensor, but the received var[%s] is %s.", inside_grad_name, framework::ToTypeName(var->Type()))); - if (var->IsType()) { + if ((var_iter == outside_og_names.end()) && + var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; attrs["dtype"] = inside_tensor.type(); @@ -343,13 +390,18 @@ class WhileGradOp : public framework::OperatorBase { ->set_lod(inside_tensor.lod()); } } - auto new_inside_name = cur_scope.Rename(inside_grad_name); - auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}}, - {{"Out", {pg_ig_names[param_id]}}}, - framework::AttributeMap{{"use_mkldnn", {false}}}); - sum_op->Run(cur_scope, dev_place); - cur_scope.Rename(new_inside_name, inside_grad_name); + auto var_outside = scope.FindVar(pg_ig_names[param_id]); + if 
((var_iter == outside_og_names.end()) || + ((var_iter != outside_og_names.end()) && + var_outside->IsType())) { + auto new_inside_name = cur_scope.Rename(inside_grad_name); + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}}, + {{"Out", {pg_ig_names[param_id]}}}, + framework::AttributeMap{{"use_mkldnn", {false}}}); + sum_op->Run(cur_scope, dev_place); + cur_scope.Rename(new_inside_name, inside_grad_name); + } } dev_ctx.Wait(); const_cast(scope).DeleteScope(&cur_scope); diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index a3fe71f3ec8b3..b8e9f9f36ac81 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -232,5 +232,16 @@ bool GetCondData(const framework::LoDTensor &cond) { return cpu_cond->data()[0]; } +bool StrInVaraiableNameMap(const std::string &name, + const framework::VariableNameMap &var_names) { + for (auto &ipt : var_names) { + if (std::find(ipt.second.begin(), ipt.second.end(), name) != + ipt.second.end()) { + return true; + } + } + return false; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index d2e9953e6477a..8b4a14570b1ef 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -38,6 +38,7 @@ static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; +static constexpr char kSuffix[] = "@TMP_COPY"; void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( const framework::ProgramDesc &program, int block_id, @@ -50,5 +51,8 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( bool GetCondData(const framework::LoDTensor &cond); +bool StrInVaraiableNameMap(const std::string &, + const framework::VariableNameMap &); + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py index aa692eb536736..83ca577faa5c6 100644 --- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py @@ -16,6 +16,7 @@ import numpy as np import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers @@ -24,6 +25,8 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.backward import append_backward +paddle.enable_static() + class TestApiWhileLoop(unittest.TestCase): def test_var_tuple(self): @@ -199,16 +202,10 @@ def test_while_loop_backward(self): def cond(i, x): return layers.less_than(i, eleven) - def body(j, x): - # TODO: In while block, if the var created in parent block - # participates in the calculation of gradient, the result of gradient - # is incorrect because each step scope always returns the same value - # generated by last step. - # Here we call `assign` op in while block to avoid this bug, and working on fixing it in next PR. 
- i = layers.assign(j) + def body(i, x): x = layers.elementwise_mul(x=i, y=i) - j = layers.increment(j) - return [j, x] + i = layers.increment(i) + return [i, x] main_program = Program() startup_program = Program() @@ -244,10 +241,10 @@ def body(j, x): def test_while_loop_backward2(self): def cond(i, x): - return i < 5 + return i < 3 def body(i, x): - x = x + i + x = x * i i = i + 1 return [i, x] @@ -269,17 +266,21 @@ def body(i, x): feed_i = np.ones(1).astype('float32') feed_x = np.ones(1).astype('float32') - data = np.asarray([11]).astype('float32') - i_grad = np.asarray([1]).astype('float32') + data = np.asarray([2]).astype('float32') + i_grad = np.asarray([3]).astype('float32') + x_grad = np.asarray([2]).astype('float32') res = exe.run(main_program, feed={'i': feed_i, 'x': feed_x}, - fetch_list=[mean.name, i.grad_name]) + fetch_list=[mean.name, i.grad_name, x.grad_name]) self.assertTrue(np.allclose(np.asarray(res[0]), data)) self.assertTrue( np.allclose(np.asarray(res[1]), i_grad), msg=" \nres = \n{} \n\n ans = \n{}".format(res[1], i_grad)) + self.assertTrue( + np.allclose(np.asarray(res[2]), x_grad), + msg=" \nres = \n{} \n\n ans = \n{}".format(res[2], x_grad)) class TestApiWhileLoop_NestedWithBackwardAndLoDTensorArray(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index ee01bfb21f820..d6d52b7d604aa 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -24,6 +24,8 @@ import numpy from paddle.fluid import compiler, Program, program_guard +paddle.enable_static() + class TestWhileOp(unittest.TestCase): def simple_net(self): From e388e603125f72852e073dbd9dbeae790c46f739 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Mon, 12 Oct 2020 20:12:47 +0800 Subject: [PATCH 86/91] Refine cond API English Doc for 2.0RC (#27708) As the title --- python/paddle/fluid/layers/control_flow.py | 63 +++++++++------------- 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 411ac6e51b1c8..0c77917c78190 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -2297,11 +2297,6 @@ def copy_var_to_parent_block(var, layer_helper): def cond(pred, true_fn=None, false_fn=None, name=None): """ - :api_attr: Static Graph - :alias_main: paddle.nn.cond - :alias: paddle.nn.cond,paddle.nn.control_flow.cond - :old_api: paddle.fluid.layers.cond - This API returns ``true_fn()`` if the predicate ``pred`` is true else ``false_fn()`` . Users could also set ``true_fn`` or ``false_fn`` to ``None`` if do nothing and this API will treat the callable simply returns @@ -2323,17 +2318,18 @@ def cond(pred, true_fn=None, false_fn=None, name=None): semantics. For example: .. code-block:: python - - import paddle.fluid as fluid - a = fluid.data(name='a', shape=[-1, 1], dtype='float32') - b = fluid.data(name='b', shape=[-1, 1], dtype='float32') + + import paddle + + a = paddle.zeros((1, 1)) + b = paddle.zeros((1, 1)) c = a * b - out = fluid.layers.cond(a < b, lambda: a + c, lambda: b * b) + out = paddle.nn.cond(a < b, lambda: a + c, lambda: b * b) No matter whether ``a < b`` , ``c = a * b`` will run. Args: - pred(Variable): A boolean tensor whose numel should be 1. The boolean + pred(Tensor): A boolean tensor whose numel should be 1. 
The boolean value determines whether to return the result of ``true_fn`` or ``false_fn`` . true_fn(callable, optional): A callable to be performed if ``pred`` is @@ -2345,7 +2341,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None): refer to :ref:`api_guide_Name` . Returns: - Variable|list(Variable)|tuple(Variable): returns ``true_fn()`` if the + Tensor|list(Tensor)|tuple(Tensor): returns ``true_fn()`` if the predicate ``pred`` is true else ``false_fn()`` . Raises: @@ -2356,10 +2352,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - from paddle.fluid.executor import Executor - from paddle.fluid.framework import Program, program_guard + import paddle # # pseudocode: @@ -2369,32 +2362,28 @@ def cond(pred, true_fn=None, false_fn=None, name=None): # return 3, 2 # + def true_func(): - return layers.fill_constant( - shape=[1, 2], dtype='int32', value=1), layers.fill_constant( - shape=[2, 3], dtype='bool', value=True) + return paddle.fill_constant(shape=[1, 2], dtype='int32', + value=1), paddle.fill_constant(shape=[2, 3], + dtype='bool', + value=True) + def false_func(): - return layers.fill_constant( - shape=[3, 4], dtype='float32', value=3), layers.fill_constant( - shape=[4, 5], dtype='int64', value=2) - - main_program = Program() - startup_program = Program() - with program_guard(main_program, startup_program): - x = layers.fill_constant(shape=[1], dtype='float32', value=0.1) - y = layers.fill_constant(shape=[1], dtype='float32', value=0.23) - pred = layers.less_than(x, y) - out = layers.cond(pred, true_func, false_func) - # out is a tuple containing 2 tensors - - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - exe = fluid.Executor(place) - ret = exe.run(main_program, fetch_list=out) + return paddle.fill_constant(shape=[3, 4], dtype='float32', + value=3), paddle.fill_constant(shape=[4, 5], + dtype='int64', + value=2) + + x = paddle.fill_constant(shape=[1], dtype='float32', value=0.1) + y = paddle.fill_constant(shape=[1], dtype='float32', value=0.23) + pred = paddle.less_than(x=x, y=y, name=None) + ret = paddle.nn.cond(pred, true_func, false_func) + # ret is a tuple containing 2 tensors # ret[0] = [[1 1]] # ret[1] = [[ True True True] - # [ True True True]] + # [ True True True]] """ if in_dygraph_mode(): From 6cdf2c9604065ea3c48639348e31dfc57ca0aa7c Mon Sep 17 00:00:00 2001 From: Bai Yifan Date: Mon, 12 Oct 2020 20:50:12 +0800 Subject: [PATCH 87/91] mig deformable_conv to deform_conv2d (#27841) * mig deformable_conv to deform_conv2d --- .../unittests/test_deformable_conv_op.py | 34 ++++ .../unittests/test_directory_migration.py | 2 +- python/paddle/static/nn/__init__.py | 4 +- python/paddle/static/nn/common.py | 179 +++++++++++++++++- 4 files changed, 215 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py index e685d7b5f53b0..eed637b1d5da1 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from op_test import OpTest @@ -260,6 +261,7 @@ def init_group(self): class TestModulatedDeformableConvInvalidInput(unittest.TestCase): def test_error(self): def test_invalid_input(): + paddle.enable_static() 
input = [1, 3, 32, 32] offset = fluid.data( name='offset', shape=[None, 3, 32, 32], dtype='float32') @@ -271,6 +273,7 @@ def test_invalid_input(): self.assertRaises(TypeError, test_invalid_input) def test_invalid_offset(): + paddle.enable_static() input = fluid.data( name='input', shape=[None, 3, 32, 32], dtype='int32') offset = fluid.data( @@ -283,5 +286,36 @@ def test_invalid_offset(): self.assertRaises(TypeError, test_invalid_offset) +class TestDeformConv2dAPI(unittest.TestCase): + def test_api(self): + def test_deform_conv2d_v1(): + paddle.enable_static() + input = paddle.static.data( + name='input_v1', shape=[None, 3, 32, 32], dtype='float32') + offset = paddle.static.data( + name='offset_v1', shape=[None, 4, 32, 32], dtype='float32') + out = paddle.static.nn.deform_conv2d( + input, offset, None, num_filters=4, filter_size=1) + + assert (out.shape == (-1, 4, 32, 32)) + + test_deform_conv2d_v1() + + def test_deform_conv2d_v2(): + paddle.enable_static() + input = paddle.static.data( + name='input_v2', shape=[None, 3, 32, 32], dtype='float32') + offset = paddle.static.data( + name='offset_v2', shape=[None, 4, 32, 32], dtype='float32') + mask = paddle.static.data( + name='mask_v2', shape=[None, 2, 32, 32], dtype='float32') + out = paddle.static.nn.deform_conv2d( + input, offset, mask, num_filters=4, filter_size=1) + + assert (out.shape == (-1, 4, 32, 32)) + + test_deform_conv2d_v2() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index fd014f3b4ecaf..28232e9ba4dc0 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -63,7 +63,7 @@ def test_new_directory(self): 'paddle.static.nn.conv3d', 'paddle.static.nn.conv3d_transpose', 'paddle.static.nn.create_parameter', 'paddle.static.nn.crf_decoding', 'paddle.static.nn.data_norm', - 'paddle.static.nn.deformable_conv', 'paddle.static.nn.group_norm', + 'paddle.static.nn.deform_conv2d', 'paddle.static.nn.group_norm', 'paddle.static.nn.instance_norm', 'paddle.static.nn.layer_norm', 'paddle.static.nn.multi_box_head', 'paddle.static.nn.nce', 'paddle.static.nn.prelu', 'paddle.static.nn.row_conv', diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index cd089432b1ca3..3ae65e879f723 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -25,7 +25,7 @@ 'create_parameter', 'crf_decoding', 'data_norm', - 'deformable_conv', + 'deform_conv2d', 'group_norm', 'instance_norm', 'layer_norm', @@ -39,6 +39,7 @@ ] from .common import fc #DEFINE_ALIAS +from .common import deform_conv2d #DEFINE_ALIAS from ...fluid.layers import batch_norm #DEFINE_ALIAS from ...fluid.layers import bilinear_tensor_product #DEFINE_ALIAS @@ -50,7 +51,6 @@ from ...fluid.layers import create_parameter #DEFINE_ALIAS from ...fluid.layers import crf_decoding #DEFINE_ALIAS from ...fluid.layers import data_norm #DEFINE_ALIAS -from ...fluid.layers import deformable_conv #DEFINE_ALIAS from ...fluid.layers import group_norm #DEFINE_ALIAS from ...fluid.layers import instance_norm #DEFINE_ALIAS from ...fluid.layers import layer_norm #DEFINE_ALIAS diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 59ffacdaebed5..93a603f4770a7 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -15,7 +15,7 @@ import paddle from 
paddle.fluid.framework import static_only -__all__ = ['fc'] +__all__ = ['fc', 'deform_conv2d'] @static_only @@ -163,3 +163,180 @@ def fc(x, bias_attr=bias_attr, act=activation, name=name) + + +@static_only +def deform_conv2d(x, + offset, + mask, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=1, + weight_attr=None, + bias_attr=None, + name=None): + """ + + Compute 2-D deformable convolution on 4-D input. + Given input image x, output feature map y, the deformable convolution operation can be expressed as follow: + + + Deformable Convolution v2: + + .. math:: + + y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k} + + Deformable Convolution v1: + + .. math:: + + y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k)} + + Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location, + Which :math:`\Delta m_k` is one in deformable convolution v1. Please refer to `Deformable ConvNets v2: More Deformable, Better Results + `_ and `Deformable Convolutional Networks `_. + + Example: + - Input: + + X shape: :math:`(N, C_{in}, H_{in}, W_{in})` + + Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` + + Offset shape: :math:`(N, 2 * deformable\_groups * H_f * H_w, H_{in}, W_{in})` + + Mask shape: :math:`(N, deformable\_groups * H_f * H_w, H_{in}, W_{in})` + + - Output: + + Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + + Args: + x (Tensor): The input image with [N, C, H, W] format. A Tensor with type + float32, float64. + offset (Tensor): The input coordinate offset of deformable convolution layer. + A Tensor with type float32, float64. + Mask (Tensor, Optional): The input mask of deformable convolution layer. + A Tensor with type float32, float64. It should be None when you use + deformable convolution v1. + num_filters(int): The number of filter. It is as same as the output + image channel. + filter_size (int|tuple): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + stride (int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + padding (int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. Default: padding = 0. + dilation (int|tuple): The dilation size. If dilation is a tuple, it must + contain two integers, (dilation_H, dilation_W). Otherwise, the + dilation_H = dilation_W = dilation. Default: dilation = 1. + groups (int): The groups number of the deformable conv layer. According to + grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. Default: groups=1. + deformable_groups (int): The number of deformable group partitions. + Default: deformable_groups = 1. 
+ im2col_step (int): Maximum number of images per im2col computation; + The total batch size should be devisable by this value or smaller + than this value; if you face out of memory problem, you can try + to use a smaller value here. + Default: im2col_step = 1. + weight_attr (ParamAttr, Optional): The parameter attribute for learnable parameters/weights + of deformable conv. If it is set to None or one attribute of ParamAttr, + deformable conv will create ParamAttr as weight_attr. + If the Initializer of the weight_attr is not set, the parameter is + initialized with :math:`Normal(0.0, std)`, and the + :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + bias_attr (ParamAttr|bool, Optional): The parameter attribute for the bias of + deformable conv layer. If it is set to False, no bias will be added + to the output units. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. + name(str, Optional): For details, please refer to :ref:`api_guide_Name`. + Generally, no setting is required. Default: None. + Returns: + Tensor: The tensor storing the deformable convolution \ + result. A Tensor with type float32, float64. + Raises: + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. + Examples: + .. code-block:: python + + #deformable conv v2: + + import paddle + paddle.enable_static() + + C_in, H_in, W_in = 3, 32, 32 + filter_size, deformable_groups = 3, 1 + data = paddle.static.data(name='data', shape=[None, C_in, H_in, W_in], dtype='float32') + offset = paddle.static.data(name='offset', shape=[None, 2*deformable_groups*filter_size**2, H_in, W_in], dtype='float32') + mask = paddle.static.data(name='mask', shape=[None, deformable_groups*filter_size**2, H_in, W_in], dtype='float32') + out = paddle.static.nn.deform_conv2d(x=data, offset=offset, mask=mask, + num_filters=2, filter_size=filter_size, padding=1) + + #deformable conv v1: + + import paddle + paddle.enable_static() + + C_in, H_in, W_in = 3, 32, 32 + filter_size, deformable_groups = 3, 1 + data = paddle.static.data(name='data', shape=[None, C_in, H_in, W_in], dtype='float32') + offset = paddle.static.data(name='offset', shape=[None, 2*deformable_groups*filter_size**2, H_in, W_in], dtype='float32') + out = paddle.static.nn.deform_conv2d(x=data, offset=offset, mask=None, + num_filters=2, filter_size=filter_size, padding=1) + """ + + if mask is None: + return paddle.fluid.layers.deformable_conv( + input=x, + offset=offset, + mask=mask, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + deformable_groups=deformable_groups, + im2col_step=im2col_step, + param_attr=weight_attr, + bias_attr=bias_attr, + modulated=False, + name=name) + else: + return paddle.fluid.layers.deformable_conv( + input=x, + offset=offset, + mask=mask, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + deformable_groups=deformable_groups, + im2col_step=im2col_step, + param_attr=weight_attr, + bias_attr=bias_attr, + modulated=True, + name=name) From 445634fa8b62863afaf679beed5b4c672353f21b Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Mon, 12 Oct 2020 22:08:16 +0800 Subject: [PATCH 88/91] polish Return doc of DataLoader (#27808) * polish Return doc of DataLoader. 
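As a quick illustration of the sentence being added to the Return section, a minimal dygraph sketch is given below. The toy dataset, the [10] feature shape, the batch size, and the explicit CPUPlace are arbitrary stand-ins used only to show that each element yielded by the loader is already a Tensor.

import numpy as np
import paddle
from paddle.io import Dataset, DataLoader

paddle.disable_static()

class RandomDataset(Dataset):
    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        image = np.random.random([10]).astype('float32')
        label = np.random.randint(0, 2, [1]).astype('int64')
        return image, label

    def __len__(self):
        return self.num_samples

loader = DataLoader(RandomDataset(8), places=paddle.CPUPlace(), batch_size=4)
for image, label in loader:
    # image and label are paddle Tensors here, not numpy arrays
    print(type(image), image.shape, label.shape)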
test=develop --- python/paddle/fluid/reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 6cc00a7fd3734..35dcd45223419 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -215,7 +215,7 @@ class DataLoader(object): None. Returns: - DataLoader: an iterable object for data iterating + DataLoader: an iterable object for data iterating, each elemnet of the generated data is a Tensor. Examples: From bbc837ee72cc57e089c4e13f610ac5aae1fa81b6 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Tue, 13 Oct 2020 08:53:11 +0800 Subject: [PATCH 89/91] add info log for trt input dynamic shape check (#27796) * add info log for trt input dynamic shape check * fix error msg error --- .../fluid/inference/tensorrt/convert/op_converter.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index ac0a04b9a116d..4a386ac1d81c5 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -164,6 +164,7 @@ class OpConverter { const std::unordered_set& parameters, const std::vector& outputs, TensorRTEngine* engine) { engine->InitNetwork(); + bool all_dynamic_shape_set = true; for (auto& input : inputs) { if (parameters.count(input)) continue; auto* var = block_desc->FindVar(input); @@ -181,6 +182,13 @@ class OpConverter { auto max_input_shape = engine->max_input_shape()[input]; auto optim_input_shape = engine->optim_input_shape()[input]; size_t ranks = min_input_shape.size(); + if (ranks == 0) { + all_dynamic_shape_set = false; + LOG(INFO) << "trt input [" << input.c_str() + << "] dynamic shape info not set, please check and retry."; + // check other input + continue; + } std::vector input_shape; input_shape.push_back(-1); for (size_t i = 1; i < ranks; i++) { @@ -207,6 +215,10 @@ class OpConverter { Vec2TRT_Dims(var_shape, input)); } } + PADDLE_ENFORCE_EQ(all_dynamic_shape_set, true, + platform::errors::InvalidArgument( + "some trt inputs dynamic shape info not set, " + "check the INFO log above for more details.")); framework::proto::BlockDesc* block_proto = block_desc->Proto(); ConvertBlock(*block_proto, parameters, scope, engine); for (auto& output : outputs) { From c5f2802d56743b45def13f509a8c6a8cb133345b Mon Sep 17 00:00:00 2001 From: Chengmo Date: Tue, 13 Oct 2020 09:52:59 +0800 Subject: [PATCH 90/91] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91Update=20fl?= =?UTF-8?q?eetrun=20&=20ps-heter=20(#27472)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refine fleetrun.ps_launch * update fleet run for multi device support * ps_graph support ps-gpu * fix heter save * add heter save unittest * fix unittest & simple code * update fleetrun * fix fleetrun * fix launch barrier * fix role maker * add paddlecloud rolemaker unittest * rename heter_worker_device_guard --- .../framework/distributed_strategy.proto | 1 + .../distributed/fleet/base/role_maker.py | 174 +++--- python/paddle/distributed/fleet/launch.py | 351 +++-------- .../paddle/distributed/fleet/launch_utils.py | 555 +++++++++++++++++- .../parameter_server_optimizer.py | 4 + .../fleet/runtime/parameter_server_runtime.py | 43 +- .../fleet/parameter_server/ir/public.py | 26 +- .../tests/unittests/dist_fleet_heter_ctr.py | 4 + .../tests/unittests/fleet_ps_training.py | 6 +- .../unittests/test_dist_fleet_heter_base.py 
| 2 +- .../test_dist_fleet_heter_program.py | 4 + .../tests/unittests/test_fleet_launch.sh | 18 +- .../unittests/test_fleet_rolemaker_init.py | 149 +++++ 13 files changed, 962 insertions(+), 375 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_rolemaker_init.py diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 21e28d7ac86d0..881ef30ffe690 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -98,6 +98,7 @@ message AsyncConfig { optional int32 send_wait_times = 7 [ default = 1 ]; optional bool runtime_split_send_recv = 8 [ default = false ]; optional bool launch_barrier = 9 [ default = true ]; + optional string heter_worker_device_guard = 10 [ default = 'cpu' ]; } message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; } diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index deba3b4a17d1b..ce9826d7e59ae 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -530,13 +530,6 @@ def _get_heter_worker_endpoint(self): return self._heter_trainer_endpoints[(self._current_id) % self._heter_worker_num()] - def _get_heter_worker_device(self): - """ - Returns: - string: heter_trainer's device of current node, e.g: CPU/GPU/XPU - """ - return self._heter_trainer_device.upper() - class PaddleCloudRoleMaker(RoleMakerBase): def __init__(self, is_collective=False, **kwargs): @@ -677,88 +670,99 @@ def _is_heter_worker(self): return self._role == Role.HETER_WORKER def _ps_env(self): - try: - # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set - # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002 - self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST") - - if self._server_endpoints is None: - # back to non_distributed execution. - self._server_endpoints = "" - self._trainers_num = 1 - self._role = Role.WORKER - self._current_id = 0 - self._nodes_num = 1 - self._heter_trainers_num = 0 - self._heter_trainer_endpoints = None - self._non_distributed = True - return - - self._server_endpoints = self._server_endpoints.split(",") - - self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") - if self._worker_endpoints: - self._worker_endpoints = self._worker_endpoints.split(",") - else: - self._worker_endpoints = [] + # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set + # format: string(ip:port,ip:port), eg. 127.0.0.1:6001,127.0.0.1:6002 + self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST", None) + + if self._server_endpoints is None: + # back to non_distributed execution. + self._server_endpoints = "" + self._trainers_num = 1 + self._role = Role.WORKER + self._current_id = 0 + self._nodes_num = 1 + self._heter_trainers_num = 0 + self._heter_trainer_endpoints = None + self._non_distributed = True + return + + self._server_endpoints = self._server_endpoints.split(",") + + self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None) + if self._worker_endpoints != None: + self._worker_endpoints = self._worker_endpoints.split(",") + else: + self._worker_endpoints = [] + + trainers_num = os.getenv("PADDLE_TRAINERS_NUM", None) + if trainers_num == None: + raise ValueError( + "Can not find PADDLE_TRAINERS_NUM, please check your environment." 
+ ) + trainers_num = int(trainers_num) - trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"]) - training_role = os.environ["TRAINING_ROLE"] + training_role = os.getenv("TRAINING_ROLE", None) + if training_role == None: + raise ValueError( + "Can not find TRAINING_ROLE, please check your environment.") - if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]: + if training_role not in ["TRAINER", "PSERVER", "HETER_TRAINER"]: + raise ValueError( + "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.". + format(training_role)) + + # For heter parameter server env setting + heter_trainer_eplist = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST", + "") + if heter_trainer_eplist != "": + try: + heter_trainer_eplist = os.environ[ + "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",") + except: raise ValueError( - "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER, but get {}, please check your environment.". - format(training_role)) - - # For heter parameter server env setting - heter_trainer_eplist = os.getenv( - "PADDLE_HETER_TRAINER_IP_PORT_LIST", None) - heter_trainer_device = os.getenv("PADDLE_HETER_TRAINER_DEVICE", - None) - if heter_trainer_eplist and heter_trainer_device: - try: - heter_trainer_eplist = os.environ[ - "PADDLE_HETER_TRAINER_IP_PORT_LIST"].split(",") - except: - raise ValueError( - "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ." - ) - - self._is_heter_parameter_server_mode = True - heter_trainers_num = len(heter_trainer_eplist) - current_node_device = heter_trainer_device.upper() - if current_node_device not in ["CPU", "GPU", "XPU"]: - raise ValueError( - "Heter Trainer doesn't support {} device now, please use CPU / GPU / XPU(KunLun)". - format(heter_trainer_device)) - self._heter_trainer_device = current_node_device - else: - self._is_heter_parameter_server_mode = False - heter_trainers_num = 0 - - if training_role == "TRAINER": - role = Role.WORKER - current_id = int(os.environ["PADDLE_TRAINER_ID"]) - if len(self._worker_endpoints) > 0: - self._cur_endpoint = self._worker_endpoints[current_id] - elif training_role == "PSERVER": - role = Role.SERVER - port = os.environ["PADDLE_PORT"] - ip = os.environ["POD_IP"] - self._cur_endpoint = ip + ":" + port - current_id = self._server_endpoints.index(self._cur_endpoint) - elif training_role == "HETER_TRAINER": - role = Role.HETER_WORKER - cur_ip = os.environ["POD_IP"] - cur_port = os.environ["PADDLE_PORT"] - curr_endpoint = ":".join([cur_ip, cur_port]) - current_id = heter_trainer_eplist.index(curr_endpoint) - else: + "Can not Find PADDLE_HETER_TRAINER_IP_PORT_LIST in env or its format doesn't match the requirement: 'IP:PORT,IP:PORT' ." + ) + + self._is_heter_parameter_server_mode = True + heter_trainers_num = len(heter_trainer_eplist) + else: + self._is_heter_parameter_server_mode = False + heter_trainers_num = 0 + + if training_role == "TRAINER": + role = Role.WORKER + current_id = os.getenv("PADDLE_TRAINER_ID", None) + if current_id == None: raise ValueError( - "TRAINING_ROLE must be PSERVER or TRAINER or HETER_TRAINER") - except ValueError as e: - raise ValueError( - "Something wrong with PaddleCloud, please check environment") + "Can not find PADDLE_TRAINER_ID, please check your environment." 
+ ) + current_id = int(current_id) + if len(self._worker_endpoints) > 0: + self._cur_endpoint = self._worker_endpoints[current_id] + elif training_role == "PSERVER": + role = Role.SERVER + port = os.getenv("PADDLE_PORT", None) + if port == None: + raise ValueError( + "Can not find PADDLE_PORT, please check your environment.") + ip = os.getenv("POD_IP", None) + if ip == None: + raise ValueError( + "Can not find POD_IP, please check your environment.") + self._cur_endpoint = ip + ":" + port + current_id = self._server_endpoints.index(self._cur_endpoint) + elif training_role == "HETER_TRAINER": + role = Role.HETER_WORKER + cur_port = os.getenv("PADDLE_PORT", None) + if cur_port == None: + raise ValueError( + "Can not find PADDLE_PORT, please check your environment.") + cur_ip = os.getenv("POD_IP", None) + if cur_ip == None: + raise ValueError( + "Can not find POD_IP, please check your environment.") + curr_endpoint = ":".join([cur_ip, cur_port]) + current_id = heter_trainer_eplist.index(curr_endpoint) self._trainers_num = trainers_num self._role = role diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 015d59b516e94..2e23a915454fa 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -89,14 +89,16 @@ def _parse_args(): description='''start paddle training using multi-process mode. see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2- ''') + base_group = parser.add_argument_group("Base Parameters") - # Optional arguments for the launch helper - parser.add_argument( - "--ips", + base_group.add_argument( + "--log_dir", type=str, - default="127.0.0.1", - help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..") - parser.add_argument( + default="log", + help="The path for each process's log.If it's not set, the log will printed to default pipe." + ) + + base_group.add_argument( "--gpus", type=str, default=None, @@ -104,22 +106,7 @@ def _parse_args(): "each process is bound to a single GPU. And if it's not set, this module will use all the gpu cards for training." ) - parser.add_argument( - "--servers", type=str, default="", help="User defined servers ip:port") - parser.add_argument( - "--workers", type=str, default="", help="User defined workers ip:port") - parser.add_argument("--worker_num", type=int, help="number of workers") - - parser.add_argument("--server_num", type=int, help="number of servers") - - parser.add_argument( - "--log_dir", - type=str, - default="log", - help="The path for each process's log.If it's not set, the log will printed to default pipe." 
- ) - # positional - parser.add_argument( + base_group.add_argument( "training_script", type=str, help="The full path to the single GPU training " @@ -127,8 +114,34 @@ def _parse_args(): "followed by all the arguments for the " "training script") - # rest from the training program - parser.add_argument('training_script_args', nargs=REMAINDER) + base_group.add_argument('training_script_args', nargs=REMAINDER) + + # Optional arguments for the launch helper + # for collective + collective_group = parser.add_argument_group("Collective Parameters") + collective_group.add_argument( + "--ips", + type=str, + default="127.0.0.1", + help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..") + + ps_group = parser.add_argument_group("Parameter-Server Parameters") + # for parameter server + ps_group.add_argument( + "--servers", type=str, default="", help="User defined servers ip:port") + ps_group.add_argument( + "--workers", type=str, default="", help="User defined workers ip:port") + ps_group.add_argument( + "--heter_workers", + type=str, + default="", + help="User defined heter workers ip:port") + + ps_group.add_argument("--worker_num", type=int, help="number of workers") + ps_group.add_argument("--server_num", type=int, help="number of servers") + ps_group.add_argument( + "--heter_worker_num", type=int, help="number of heter_workers") + return parser.parse_args() @@ -166,35 +179,6 @@ def get_cluster_from_args(args, gpus): return get_cluster(node_ips, node_ip, trainer_endpoints, gpus) -def get_gpus(gpus): - if gpus is None: - gpus_num = fluid.core.get_cuda_device_count() - res_gpus = [str(x) for x in range(0, gpus_num)] - else: - cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_visible_devices is None or cuda_visible_devices == "": - res_gpus = [x.strip() for x in gpus.split(',')] - else: - # change gpus into relative values - # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7; - # therefore gpus=0,1,2,3 - cuda_visible_devices_list = cuda_visible_devices.split(',') - for x in gpus.split(','): - assert x in cuda_visible_devices_list, "Can't find "\ - "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\ - % (x, cuda_visible_devices) - res_gpus = [ - cuda_visible_devices_list.index(x.strip()) - for x in gpus.split(',') - ] - logger.info("Change selected_gpus into reletive values. --ips:{} " - "will change into relative_ips:{} according to your " - "CUDA_VISIBLE_DEVICES:{}".format( - gpus, res_gpus, cuda_visible_devices_list)) - - return res_gpus - - def launch_collective(args): # parse arguments, used for cloud-single-machine and local gpus = get_gpus(args.gpus) @@ -245,209 +229,37 @@ def launch_collective(args): shutil.rmtree(gloo_rendezvous_dir) -def launch_ps(args): - ports = None - start_port = 6170 - if args.server_num: - server_num = args.server_num - ports = get_ports(server_num, 0) - server_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports]) - else: - assert args.servers != "", "The setting of CPU mode must be either server_num or servers." 
- server_endpoints = args.servers - server_endpoints_ips = [ - x.strip().split(":")[0] for x in server_endpoints.split(",") - ] - server_endpoints_port = [ - x.strip().split(":")[1] for x in server_endpoints.split(",") +def launch_ps(args, distribute_mode): + cloud_flag = cloud_utils.use_paddlecloud() + + # for ps-cpu on paddlecloud + if cloud_flag and distribute_mode == DistributeMode.PS: + direct_start(args) + return + elif cloud_flag and distribute_mode == DistributeMode.PS_HETER: + cloud_ps_heter_env_set(args) + args.workers = os.getenv("PADDLE_TRAINER_ENDPOINTS") + args.servers = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST") + args.heter_workers = os.getenv("PADDLE_HETER_TRAINER_IP_PORT_LIST") + + ps_launcher = ParameterServerLauncher(args, distribute_mode) + ps_launcher.start_ps() + return + + +def which_distributed_mode(args): + ps_args = [ + '--worker_num', + '--server_num', + '--heter_worker_num', + '--servers', + '--workers', + '--heter_workers', ] - server_num = len(server_endpoints_ips) - - if args.worker_num: - worker_num = args.worker_num - ports = get_ports(worker_num, server_num) - worker_endpoints = ",".join(["127.0.0.1:" + str(x) for x in ports]) - else: - assert args.workers != "", "The setting of CPU mode must be either worker_num or workers." - worker_endpoints = args.workers - worker_endpoints_ips = [ - x.strip().split(":")[0] for x in worker_endpoints.split(",") - ] - worker_num = len(worker_endpoints_ips) - node_ips = list(set(server_endpoints_ips + worker_endpoints_ips)) - worker_endpoints_len = [ - len(x.strip().split(":")) for x in worker_endpoints.split(",") - ] - if 1 in worker_endpoints_len: - # if no port value in worker_endpoints, will set default port values. - worker_endpoints_port = range(start_port + server_num, - start_port + server_num + worker_num, 1) - else: - worker_endpoints_port = [ - x.strip().split(":")[1] for x in worker_endpoints.split(",") - ] - - # local train - if len(set(node_ips)) == 1: - current_node_ip = node_ips[0] - else: - _, current_node_ip = get_host_name_ip() - - assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ - % (current_node_ip, node_ips) - node_rank = node_ips.index(current_node_ip) - logger.debug( - "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}". 
- format(node_ips, current_node_ip, node_rank, server_endpoints_port)) - - cluster = Cluster(hdfs=None) - server_rank = 0 - worker_rank = 0 - for node_rank, ip in enumerate(node_ips): - pod = Pod() - pod.rank = node_rank - pod.addr = ip - for i in range(len(server_endpoints_ips)): - if ip == server_endpoints_ips[i]: - server = Trainer() - server.endpoint = "%s:%s" % (ip, server_endpoints_port[i]) - server.rank = server_rank - server_rank += 1 - pod.servers.append(server) - for j in range(len(worker_endpoints_ips)): - if ip == worker_endpoints_ips[j]: - worker = Trainer() - worker.endpoint = "%s:%s" % (ip, worker_endpoints_port[i]) - worker.rank = worker_rank - worker_rank += 1 - pod.workers.append(worker) - - cluster.pods.append(pod) - - pod_rank = node_ips.index(current_node_ip) - pod = cluster.pods[pod_rank] - - default_env = os.environ.copy() - current_env = copy.copy(default_env) - - gloo_rendezvous_dir = tempfile.mkdtemp() - # add gloo env - current_env["PADDLE_WITH_GLOO"] = "1" - current_env["PADDLE_GLOO_RENDEZVOUS"] = "3" - current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir - - current_env.pop("http_proxy", None) - current_env.pop("https_proxy", None) - procs = [] - cmds = [] - log_fns = [] - for idx, cur_server in enumerate(pod.servers): - proc_env = { - "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, - "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, - "PADDLE_PORT": cur_server.endpoint.split(":")[1], - "TRAINING_ROLE": "PSERVER", - "PADDLE_TRAINERS_NUM": str(worker_num), - "POD_IP": cur_server.endpoint.split(":")[0] - } - current_env.update(proc_env) - - cmd = [sys.executable, "-u", args.training_script - ] + args.training_script_args - cmds.append(cmd) - - if idx == 0: - logger.info( - "Local server start {} processes. First process distributed " - "environment info (Only For Debug): {}".format( - len(pod.servers), - pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) - - if args.log_dir is not None: - os.system("mkdir -p {}".format(args.log_dir)) - fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w") - log_fns.append(fn) - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) - else: - proc = subprocess.Popen(cmd, env=current_env) - - tp = TrainerProc() - tp.proc = proc - tp.rank = cur_server.rank - tp.local_rank = idx - tp.log_fn = fn - tp.log_offset = fn.tell() if fn else None - tp.cmd = cmd - - procs.append(tp) - - for idx, cur_worker in enumerate(pod.workers): - proc_env = { - "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, - "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, - "PADDLE_TRAINERS_NUM": str(worker_num), - "TRAINING_ROLE": "TRAINER", - "PADDLE_TRAINER_ID": str(cur_worker.rank) - } - current_env.update(proc_env) - - cmd = [sys.executable, "-u", args.training_script - ] + args.training_script_args - cmds.append(cmd) - - if idx == 0: - logger.info( - "Local worker start {} processes. 
First process distributed " - "environment info (Only For Debug): {}".format( - len(pod.workers), - pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) - - if args.log_dir is not None: - os.system("mkdir -p {}".format(args.log_dir)) - fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w") - log_fns.append(fn) - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) - else: - proc = subprocess.Popen(cmd, env=current_env) - - tp = TrainerProc() - tp.proc = proc - tp.rank = cur_worker.rank - tp.local_rank = idx - tp.log_fn = fn - tp.log_offset = fn.tell() if fn else None - tp.cmd = cmd - - procs.append(tp) - - logger.info( - "Please check servers and workers logs in {}/workerlog.* and {}/serverlog.*". - format(args.log_dir, args.log_dir)) - # only wait worker to finish here - for i, proc in enumerate(procs): - if i < len(pod.servers): - continue - procs[i].proc.wait() - if len(log_fns) > 0: - log_fns[i].close() - - print("all workers exit, going to finish parameter server", file=sys.stderr) - for i in range(len(pod.servers)): - if len(log_fns) > 0: - log_fns[i].close() - procs[i].proc.terminate() - print("all parameter server are killed", file=sys.stderr) - - if os.path.exists(gloo_rendezvous_dir): - shutil.rmtree(gloo_rendezvous_dir) + collective_args = ['--ips'] + ps_heter_args = ["--heter_worker_num", "--heter_workers"] -def launch(): - args = _parse_args() - logger = get_logger() - _print_arguments(args) - ps_args = ['--worker_num', '--server_num', '--servers', '--workers'] - collective_args = ['--ips', '--gpus'] has_ps_args = [ ps_arg for ps_arg in ps_args if ps_arg in " ".join(sys.argv[1:-1]) ] @@ -455,23 +267,46 @@ def launch(): co_arg for co_arg in collective_args if co_arg in " ".join(sys.argv[1:-1]) ] + + if len(has_ps_args) > 1 and len(has_collective_args) > 1: + raise ValueError( + "Only one mode(Collective or Parameter-Server) can be selected at the same time, but more than one configuration was received." + ) + if fluid.core.is_compiled_with_cuda(): cuda_device_num = fluid.core.get_cuda_device_count() else: cuda_device_num = 0 - if len(has_ps_args) > 0 or cuda_device_num == 0: - logger.info("Run parameter-sever cpu mode. pserver arguments:{}".format( - has_ps_args)) - launch_ps(args) + if len(has_ps_args) > 0: + logger.info( + "Run parameter-sever mode. pserver arguments:{}, cuda count:{}". + format(has_ps_args, cuda_device_num)) + has_ps_heter_args = list(set(has_ps_args) & set(ps_heter_args)) + if len(has_ps_heter_args) > 0: + return DistributeMode.PS_HETER + else: + return DistributeMode.PS elif len(has_collective_args) > 0: logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}". format(has_collective_args, cuda_device_num)) - launch_collective(args) + return DistributeMode.COLLECTIVE else: logger.warning( "Not found distinct arguments. 
Default use gpu collective mode") + return DistributeMode.COLLECTIVE + + +def launch(): + args = _parse_args() + logger = get_logger() + _print_arguments(args) + + distribute_mode = which_distributed_mode(args) + if distribute_mode == DistributeMode.COLLECTIVE: launch_collective(args) + else: + launch_ps(args, distribute_mode) if __name__ == "__main__": diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 7540cd9f4c1f3..35782e0b04c5a 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -21,13 +21,27 @@ import copy import sys import subprocess +import tempfile +import shutil from contextlib import closing import socket +import warnings +import paddle +import paddle.fluid as fluid logger = logging.getLogger("root") logger.propagate = False +class DistributeMode: + """ + There are various mode for fleetrun, each of them is designed for different model. + """ + COLLECTIVE = 0 + PS = 1 + PS_HETER = 2 + + class Cluster(object): def __init__(self, hdfs): self.job_server = None @@ -144,14 +158,16 @@ def __init__(self): self.trainers = [] self.servers = [] self.workers = [] + self.heter_workers = [] self.gpus = [] def __str__(self): return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \ - workers:{}".format(self.rank, self.id, self.addr, self.port, - self.gpus, [str(t) for t in self.trainers], - [str(s) for s in self.servers], - [str(w) for w in self.workers]) + workers:{} heter_workers:{}".format( + self.rank, self.id, self.addr, self.port, self.gpus, [ + str(t) for t in self.trainers + ], [str(s) for s in self.servers], [str(w) for w in self.workers], + [str(h) for h in self.heter_workers]) def __eq__(self, pod): if self.rank != pod.rank or \ @@ -262,7 +278,7 @@ def terminate_local_procs(procs): p.log_fn.close() logger.debug("terminate process id:{}".format(p.proc.pid)) - #wait all process terminiated + # wait all process terminiated time.sleep(3) for step in range(0, 50): alive = False @@ -406,10 +422,10 @@ def start_local_trainers(cluster, else: current_env = copy.copy(envs) - #paddle broadcast ncclUniqueId use socket, and - #proxy maybe make trainers unreachable, so delete them. - #if we set them to "", grpc will log error message "bad uri" - #so just delete them. + # paddle broadcast ncclUniqueId use socket, and + # proxy maybe make trainers unreachable, so delete them. + # if we set them to "", grpc will log error message "bad uri" + # so just delete them. current_env.pop("http_proxy", None) current_env.pop("https_proxy", None) @@ -518,3 +534,524 @@ def watch_local_trainers(procs, nranks): raise return alive + + +def get_gpus(gpus): + if gpus is None: + gpus_num = fluid.core.get_cuda_device_count() + res_gpus = [str(x) for x in range(0, gpus_num)] + else: + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_visible_devices is None or cuda_visible_devices == "": + res_gpus = [x.strip() for x in gpus.split(',')] + else: + # change gpus into relative values + # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7; + # therefore gpus=0,1,2,3 + cuda_visible_devices_list = cuda_visible_devices.split(',') + for x in gpus.split(','): + assert x in cuda_visible_devices_list, "Can't find "\ + "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\ + % (x, cuda_visible_devices) + res_gpus = [ + cuda_visible_devices_list.index(x.strip()) + for x in gpus.split(',') + ] + logger.info("Change selected_gpus into reletive values. 
--ips:{} "
+                "will change into relative_ips:{} according to your "
+                "CUDA_VISIBLE_DEVICES:{}".format(
+                    gpus, res_gpus, cuda_visible_devices_list))
+
+    return res_gpus
+
+
+def direct_start(args):
+    # run ps-cpu mode on paddlecloud, using given envs
+    cmd = [sys.executable, "-u", args.training_script] + \
+        args.training_script_args
+    proc = subprocess.Popen(cmd)
+    proc.wait()
+    return
+
+
+def get_custom_endpoints(origin_endpoints, offset=0):
+    """
+    origin_endpoint: ip:port
+    user_define_endpoint: ip:(port+offset)
+    """
+    assert origin_endpoints != None
+    paddle_user_define_endpoints_list = []
+    for ip_port in origin_endpoints.split(","):
+        ip = ip_port.split(":")[0]
+        port = ip_port.split(":")[1]
+        new_port = int(port) + offset
+        paddle_user_define_endpoints_list.append(":".join((ip, str(new_port))))
+    paddle_user_define_endpoints = ",".join(paddle_user_define_endpoints_list)
+    return paddle_user_define_endpoints
+
+
+def cloud_ps_heter_env_set(args):
+    environs = {}
+
+    paddle_trainer_endpoints = os.getenv("TRAINER_IP_PORT_LIST", "")
+    assert paddle_trainer_endpoints != None
+
+    paddle_pserver_endpoints = os.getenv("PSERVER_IP_PORT_LIST", "")
+    assert paddle_pserver_endpoints != None
+
+    # hard code for paddlecloud custom-framework
+    available_ports = os.getenv("TRAINER_PORTS", "").split(",")
+    assert len(
+        available_ports
+    ) > 3, "set paddle_ports_num >= 2 in config.ini for paddlecloud job submit"
+
+    # hard code for paddlecloud custom-framework
+    trainers_num = len(paddle_pserver_endpoints.split(","))
+    assert trainers_num != 0
+    environs["PADDLE_TRAINERS_NUM"] = trainers_num
+    environs["TRAINERS_NUM"] = trainers_num
+
+    # hard code for paddlecloud custom-framework
+    environs["PADDLE_HETER_TRAINER_IP_PORT_LIST"] = paddle_trainer_endpoints
+    environs["PADDLE_PSERVERS_IP_PORT_LIST"] = paddle_pserver_endpoints
+    environs["PADDLE_TRAINER_ENDPOINTS"] = get_custom_endpoints(
+        paddle_pserver_endpoints, 1)
+    heter_worker_num = len(paddle_trainer_endpoints.split(","))
+    if (args.heter_worker_num != None) and (
+            heter_worker_num != args.heter_worker_num):
+        warnings.warn(
+            "Your fleetrun setting: heter_worker_num is {}, but {} devices can be used, so this setting has been changed.".
+            format(args.heter_worker_num, heter_worker_num))
+        args.heter_worker_num = heter_worker_num
+
+    for k, v in environs.items():
+        os.environ[k] = str(v)
+    logger.info("Set heter parameter server env: {}".format(
+        pretty_print_envs(environs)))
+
+
+class ParameterServerLauncher(object):
+    def __init__(self, args, distribute_mode):
+        self.args = args
+        self.distribute_mode = distribute_mode
+        self.server_num = 0
+        self.worker_num = 0
+        self.heter_worker_num = 0
+
+        self.server_endpoints = ""
+        self.server_endpoints_ips = []
+        self.server_endpoints_port = []
+
+        self.worker_endpoints = ""
+        self.worker_endpoints_ips = []
+        self.worker_endpoints_port = []
+
+        self.heter_worker_endpoints = ""
+        self.heter_worker_endpoints_ips = []
+        self.heter_worker_endpoints_port = []
+
+        self.is_local = True
+        self.current_node_ip = ""
+
+        self.get_role_endpoints(args)
+
+    def get_role_endpoints(self, args):
+        # get server envs
+        if args.server_num:
+            self.server_num = args.server_num
+            if args.servers:
+                assert len(
+                    args.servers.split(",")
+                ) == self.server_num, "The server_num and servers don't match. 
Expected the number of server endpoints to equal server_num, but received {} endpoints and server_num {}".format(
+                    len(args.servers.split(",")), self.server_num)
+                self.server_endpoints = args.servers
+            else:
+                ports = get_ports(self.server_num, 0)
+                self.server_endpoints = ",".join(
+                    ["127.0.0.1:" + str(x) for x in ports])
+        else:
+            assert args.servers != "", "The setting of Parameter-Server must have server_num or servers."
+            self.server_endpoints = args.servers
+            self.server_num = len(self.server_endpoints.split(","))
+
+        # get worker envs
+        if args.worker_num:
+            self.worker_num = args.worker_num
+            if args.workers:
+                assert len(
+                    args.workers.split(",")
+                ) == self.worker_num, "The worker_num and workers don't match. Expected the number of worker endpoints to equal worker_num, but received {} endpoints and worker_num {}".format(
+                    len(args.workers.split(",")), self.worker_num)
+
+                self.worker_endpoints = args.workers
+            else:
+                ports = get_ports(self.worker_num, self.server_num)
+                self.worker_endpoints = ",".join(
+                    ["127.0.0.1:" + str(x) for x in ports])
+        else:
+            assert args.workers != "", "The setting of Parameter-Server must have worker_num or workers."
+            worker_endpoints_ips = [
+                x.strip().split(":")[0] for x in args.workers.split(",")
+            ]
+            self.worker_num = len(worker_endpoints_ips)
+            worker_endpoints_len = [
+                len(x.strip().split(":")) for x in args.workers.split(",")
+            ]
+
+            if 1 in worker_endpoints_len:
+                # if no port value is given in worker_endpoints, set default port values.
+                start_port = 6170
+                worker_endpoints_port = range(
+                    start_port + self.server_num,
+                    start_port + self.server_num + self.worker_num, 1)
+                # create endpoints str
+                worker_endpoints = []
+                for i in range(self.worker_num):
+                    worker_endpoints.append(":".join((worker_endpoints_ips[
+                        i], str(worker_endpoints_port[i]))))
+                self.worker_endpoints = ",".join(worker_endpoints)
+            else:
+                self.worker_endpoints = args.workers
+
+        # get heter worker envs
+        if self.distribute_mode == DistributeMode.PS_HETER:
+            if args.heter_worker_num:
+                self.heter_worker_num = args.heter_worker_num
+                if args.heter_workers:
+                    assert len(
+                        args.heter_workers.split(",")
+                    ) == self.heter_worker_num, "The heter_worker_num and heter_workers don't match. Expected the number of heter_worker endpoints to equal heter_worker_num, but received {} endpoints and heter_worker_num {}".format(
+                        len(args.heter_workers.split(",")),
+                        self.heter_worker_num)
+                    self.heter_worker_endpoints = args.heter_workers
+                else:
+                    ports = get_ports(self.heter_worker_num,
+                                      self.server_num + self.worker_num)
+                    self.heter_worker_endpoints = ",".join(
+                        ["127.0.0.1:" + str(x) for x in ports])
+            else:
+                assert args.heter_workers != "", "The setting of Parameter-Server heter mode must have heter_worker_num or heter_workers."
+ self.heter_worker_endpoints = args.heter_workers + self.heter_worker_num = len( + self.heter_worker_endpoints.split(",")) + + # check local or user define + self.server_endpoints_ips = [ + x.strip().split(":")[0] for x in self.server_endpoints.split(",") + ] + self.worker_endpoints_ips = [ + x.strip().split(":")[0] for x in self.worker_endpoints.split(",") + ] + self.server_endpoints_port = [ + x.strip().split(":")[1] for x in self.server_endpoints.split(",") + ] + self.worker_endpoints_port = [ + x.strip().split(":")[1] for x in self.worker_endpoints.split(",") + ] + self.node_ips = list( + set(self.server_endpoints_ips + self.worker_endpoints_ips)) + if self.distribute_mode == DistributeMode.PS_HETER: + self.heter_worker_endpoints_ips = [ + x.strip().split(":")[0] + for x in self.heter_worker_endpoints.split(",") + ] + self.heter_worker_endpoints_port = [ + x.strip().split(":")[1] + for x in self.heter_worker_endpoints.split(",") + ] + self.node_ips = list( + set(self.node_ips + self.heter_worker_endpoints_ips)) + + if len(set(self.node_ips)) == 1: + self.is_local = True + self.current_node_ip = self.node_ips[0] + else: + self.is_local = False + pod_ip = os.getenv("POD_IP", None) + if pod_ip == None: + _, self.current_node_ip = get_host_name_ip() + else: + self.current_node_ip = pod_ip + assert self.current_node_ip in self.node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ + % (self.current_node_ip, self.node_ips) + self.node_rank = self.node_ips.index(self.current_node_ip) + + logger.debug( + "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}". + format(self.node_ips, self.current_node_ip, self.node_rank)) + + def start_ps(self): + cluster = Cluster(hdfs=None) + server_rank = 0 + worker_rank = 0 + heter_worker_rank = 0 + + for node_rank, ip in enumerate(self.node_ips): + pod = Pod() + pod.rank = node_rank + pod.addr = ip + for i in range(len(self.server_endpoints_ips)): + if ip == self.server_endpoints_ips[i]: + server = Trainer() + server.endpoint = "%s:%s" % (ip, + self.server_endpoints_port[i]) + server.rank = server_rank + server_rank += 1 + pod.servers.append(server) + for j in range(len(self.worker_endpoints_ips)): + if ip == self.worker_endpoints_ips[j]: + worker = Trainer() + worker.endpoint = "%s:%s" % (ip, + self.worker_endpoints_port[j]) + worker.rank = worker_rank + worker_rank += 1 + pod.workers.append(worker) + for k in range(len(self.heter_worker_endpoints_ips)): + if ip == self.heter_worker_endpoints_ips[k]: + heter_worker = Trainer() + heter_worker.endpoint = "%s:%s" % ( + ip, self.heter_worker_endpoints_port[k]) + heter_worker.rank = heter_worker_rank + heter_worker_rank += 1 + pod.heter_workers.append(heter_worker) + + cluster.pods.append(pod) + + pod = cluster.pods[self.node_rank] + self.gloo_rendezvous_dir = tempfile.mkdtemp() + + # 3. subproces start + self.procs = {"worker": [], "server": [], "heter_worker": []} + self.cmds = {"worker": [], "server": [], "heter_worker": []} + self.log_fns = {"worker": [], "server": [], "heter_worker": []} + + self.start_pod_server(self.args, pod) + self.start_pod_worker(self.args, pod) + self.start_pod_heter_worker(self.args, pod) + + logger.info( + "Please check servers, workers and heter_worker logs in {}/workerlog.*, {}/serverlog.* and {}/heterlog.*". + format(self.args.log_dir, self.args.log_dir, self.args.log_dir)) + + # 4. 
wait for finish training + if len(self.procs["worker"]) > 0: + # if node has worker procs + # only wait worker to finish here + for i, proc in enumerate(self.procs["worker"]): + self.procs["worker"][i].proc.wait() + if len(self.log_fns["worker"]) > 0: + self.log_fns["worker"][i].close() + logger.info( + "all workers exit, going to finish parameter server and heter_worker." + ) + if len(self.procs["heter_worker"]) > 0: + for i, proc in enumerate(self.procs["heter_worker"]): + self.log_fns["heter_worker"][i].close() + self.procs["heter_worker"][i].proc.terminate() + logger.info("all heter_worker are killed") + + if len(self.procs["server"]) > 0: + for i, proc in enumerate(self.procs["server"]): + self.log_fns["server"][i].close() + self.procs["server"][i].proc.terminate() + logger.info("all parameter server are killed") + + else: + # if node has not worker procs + # blocking training process + if len(self.procs["server"]) > 0: + for i, proc in enumerate(self.procs["server"]): + self.procs["server"][i].proc.wait() + + if len(self.procs["heter_worker"]) > 0: + for i, proc in enumerate(self.procs["heter_worker"]): + self.procs["heter_worker"][i].proc.wait() + + if os.path.exists(self.gloo_rendezvous_dir): + shutil.rmtree(self.gloo_rendezvous_dir) + + def start_pod_server(self, args, pod): + default_env = os.environ.copy() + current_env = copy.copy(default_env) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + for idx, cur_server in enumerate(pod.servers): + proc_env = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints, + "PADDLE_HETER_TRAINER_IP_PORT_LIST": + self.heter_worker_endpoints, + "PADDLE_PORT": cur_server.endpoint.split(":")[1], + "TRAINING_ROLE": "PSERVER", + "PADDLE_TRAINERS_NUM": str(self.worker_num), + "POD_IP": cur_server.endpoint.split(":")[0], + "PADDLE_WITH_GLOO": "1", + "PADDLE_GLOO_RENDEZVOUS": "2", + "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir + } + current_env.update(proc_env) + + cmd = [sys.executable, "-u", args.training_script + ] + args.training_script_args + self.cmds["server"].append(cmd) + + if idx == 0: + logger.info( + "Local server start {} processes. 
First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.servers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value" + )))) + + if args.log_dir is not None: + os.system("mkdir -p {}".format(args.log_dir)) + fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w") + self.log_fns["server"].append(fn) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn) + else: + proc = subprocess.Popen(cmd, env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = cur_server.rank + tp.local_rank = idx + tp.log_fn = fn + tp.log_offset = fn.tell() if fn else None + tp.cmd = cmd + + self.procs["server"].append(tp) + + def start_pod_worker(self, args, pod): + default_env = os.environ.copy() + current_env = copy.copy(default_env) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + heter_device_num = 0 + device_list = [] + if fluid.core.is_compiled_with_cuda(): + device_list = get_gpus(args.gpus) + heter_device_num = len(device_list) + elif fluid.core.is_compiled_with_xpu(): + heter_device_num = fluid.core.get_xpu_device_count() + device_list = [str(x) for x in range(0, heter_device_num)] + + for idx, cur_worker in enumerate(pod.workers): + device_id = str(device_list[idx % heter_device_num]) + proc_env = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints, + "PADDLE_TRAINERS_NUM": str(self.worker_num), + "PADDLE_HETER_TRAINER_IP_PORT_LIST": + self.heter_worker_endpoints, + "TRAINING_ROLE": "TRAINER", + "PADDLE_TRAINER_ID": str(cur_worker.rank), + "PADDLE_WITH_GLOO": "1", + "PADDLE_GLOO_RENDEZVOUS": "2", + "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, + "FLAGS_selected_gpus": "0", + "FLAGS_selected_xpus": "0", + "CUDA_VISIBLE_DEVICES": device_id, + "XPU_VISIBLE_DEVICES": device_id, + } + current_env.update(proc_env) + + cmd = [sys.executable, "-u", args.training_script + ] + args.training_script_args + self.cmds["worker"].append(cmd) + + if idx == 0: + logger.info( + "Local worker start {} processes. 
First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.workers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value" + )))) + + if args.log_dir is not None: + os.system("mkdir -p {}".format(args.log_dir)) + fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w") + self.log_fns["worker"].append(fn) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn) + else: + proc = subprocess.Popen(cmd, env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = cur_worker.rank + tp.local_rank = idx + tp.log_fn = fn + tp.log_offset = fn.tell() if fn else None + tp.cmd = cmd + + self.procs["worker"].append(tp) + + def start_pod_heter_worker(self, args, pod): + default_env = os.environ.copy() + current_env = copy.copy(default_env) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + heter_device_num = 0 + device_list = [] + if fluid.core.is_compiled_with_cuda(): + device_list = get_gpus(args.gpus) + heter_device_num = len(device_list) + elif fluid.core.is_compiled_with_xpu(): + heter_device_num = fluid.core.get_xpu_device_count() + device_list = [str(x) for x in range(0, heter_device_num)] + assert heter_device_num != 0 + + for idx, cur_heter_worker in enumerate(pod.heter_workers): + device_id = str(device_list[idx % heter_device_num]) + proc_env = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": self.worker_endpoints, + "PADDLE_HETER_TRAINER_IP_PORT_LIST": + self.heter_worker_endpoints, + "PADDLE_PORT": cur_heter_worker.endpoint.split(":")[1], + "TRAINING_ROLE": "HETER_TRAINER", + "PADDLE_TRAINERS_NUM": str(self.worker_num), + "POD_IP": cur_heter_worker.endpoint.split(":")[0], + "PADDLE_WITH_GLOO": "1", + "PADDLE_GLOO_RENDEZVOUS": "2", + "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, + "FLAGS_selected_gpus": "0", + "FLAGS_selected_xpus": "0", + "CUDA_VISIBLE_DEVICES": device_id, + "XPU_VISIBLE_DEVICES": device_id, + } + current_env.update(proc_env) + + cmd = [sys.executable, "-u", args.training_script + ] + args.training_script_args + self.cmds["heter_worker"].append(cmd) + + if idx == 0: + logger.info( + "Local heter_worker start {} processes. 
First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.heter_workers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value" + )))) + + if args.log_dir is not None: + os.system("mkdir -p {}".format(args.log_dir)) + fn = open("%s/heterlog.%d" % (args.log_dir, idx), "w") + self.log_fns["heter_worker"].append(fn) + proc = subprocess.Popen( + cmd, env=current_env, stdout=fn, stderr=fn) + else: + proc = subprocess.Popen(cmd, env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = cur_heter_worker.rank + tp.local_rank = idx + tp.log_fn = fn + tp.log_offset = fn.tell() if fn else None + tp.cmd = cmd + + self.procs["heter_worker"].append(tp) diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index 38ad41f8836b4..83345cb6f623e 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -74,6 +74,8 @@ def _build_trainer_programs(self, compiled_config): _startup = worker.delet_extra_optimizes_pass(_startup, compiled_config) + compiled_config.set_origin_ps_main_program(_main) + compiled_config.set_origin_ps_startup_program(_startup) # for heter program if self.role_maker._is_heter_parameter_server_mode: from paddle.fluid.incubate.fleet.parameter_server.ir import heter_trainer_pass as heter_worker @@ -91,6 +93,8 @@ def _build_trainer_programs(self, compiled_config): else: _main = worker.append_send_ops_pass(_main, compiled_config) _startup = _startup + compiled_config.set_origin_ps_main_program(_main) + compiled_config.set_origin_ps_startup_program(_startup) return _main, _startup diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 42be7e869d9a7..266c7d0f405bf 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -210,18 +210,23 @@ def get_sparse_attrs(): warnings.warn("communicator has been initialized, skip") def _get_executor(self): - if self.role_maker._is_heter_worker(): - if self.role_maker._get_heter_worker_device() == "GPU": - gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) - executor = Executor(fluid.CUDAPlace(gpu_id)) - elif self.role_maker._get_heter_worker_device() == "XPU": - xpu_id = int(os.getenv("FLAGS_selected_xpus", "0")) - executor = Executor(fluid.XPUPlace(xpu_id)) - else: - raise ValueError("Not Support Device {}".format( - self.role_maker._get_heter_worker_device())) - else: - executor = fluid.Executor(fluid.CPUPlace()) + executor = fluid.Executor(fluid.CPUPlace()) + if self.role_maker._is_heter_parameter_server_mode: + heter_worker_device_guard = self.context[ + "valid_strategy"].a_sync_configs[ + "heter_worker_device_guard"].upper() + if heter_worker_device_guard not in ["GPU", "XPU", "CPU"]: + raise ValueError("Heter Worker Not Support Device {}".format( + heter_worker_device_guard)) + if self.role_maker._is_heter_worker(): + if heter_worker_device_guard == "GPU": + executor = Executor( + fluid.CUDAPlace( + int(os.getenv("FLAGS_selected_gpus", "0")))) + elif heter_worker_device_guard == "XPU": + executor = Executor( + fluid.XPUPlace( + int(os.getenv("FLAGS_selected_xpus", "0")))) return executor def _init_server(self, *args, **kwargs): @@ -233,12 +238,14 @@ def _init_server(self, 
*args, **kwargs): model_dirname = None executor = self._get_executor() + if self.role_maker._is_heter_worker() and self.context[ + "valid_strategy"].a_sync_configs["launch_barrier"]: + # for heter trainer wait server ready + wait_server_ready(self.role_maker._get_pserver_endpoints()) executor.run(fluid.default_startup_program()) if self.role_maker._is_heter_worker(): self._init_worker() - - if self.role_maker._is_heter_worker(): return if not model_dirname: @@ -470,13 +477,13 @@ def _save_distributed_params(self, executor, dirname, context, def _save_distributed_persistables(self, executor, dirname, main_program): dense_ctx = self.compiled_strategy.get_communicator_recv_context( - recv_type=1) + recv_type=1, use_origin_program=True) sparse_ctx = self.compiled_strategy.get_communicator_recv_context( - recv_type=2) + recv_type=2, use_origin_program=True) distributed_ctx = self.compiled_strategy.get_communicator_recv_context( - recv_type=3) + recv_type=3, use_origin_program=True) recv_dense_varnames = self._save_dense_params(executor, dirname, dense_ctx, main_program) @@ -528,7 +535,7 @@ def _ps_inference_save_persistables(self, ) if main_program is None: - main_program = fluid.default_main_program() + main_program = self.compiled_strategy.get_origin_ps_main_program() if isinstance(main_program, CompiledProgram): raise TypeError( diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index e348c67ae0461..90847382c86e1 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -133,6 +133,8 @@ def __init__(self, main_program, startup_program, strategy, role_maker): self.origin_main_program = main_program self.origin_startup_program = startup_program + self.origin_ps_main_program = main_program + self.origin_ps_startup_program = startup_program self.strategy = strategy self.role_maker = role_maker @@ -153,6 +155,11 @@ def __init__(self, main_program, startup_program, strategy, role_maker): self._build_var_distributed() + # for heter-ps save variables + self.origin_merged_variables_pairs = list(self.merged_variables_pairs) + self.origin_merged_dense_pairs = list(self.merged_dense_pairs) + self.origin_merged_sparse_pairs = list(self.merged_sparse_pairs) + def get_distributed_mode(self): trainer = self.strategy.get_trainer_runtime_config() return trainer.mode @@ -214,6 +221,18 @@ def get_origin_main_program(self): def get_origin_startup_program(self): return self.origin_startup_program + def set_origin_ps_main_program(self, program): + self.origin_ps_main_program = program + + def set_origin_ps_startup_program(self, program): + self.origin_ps_startup_program = program + + def get_origin_ps_main_program(self): + return self.origin_ps_main_program + + def get_origin_ps_startup_program(self): + return self.origin_ps_startup_program + def get_sparse_varname_on_ps(self, is_distributed, endpoint=None): if not endpoint: endpoint = self.get_ps_endpoint() @@ -378,7 +397,9 @@ def get_communicator_send_context(self): send_ctx[name] = ctx return send_ctx - def get_communicator_recv_context(self, recv_type=1): + def get_communicator_recv_context(self, + recv_type=1, + use_origin_program=False): # recv_type # 1 : DENSE 2. SPARSE 3. DISTRIBUTED 4. 
ALL distibuted_varnames = get_sparse_tablenames(self.origin_main_program, @@ -392,7 +413,8 @@ def get_communicator_recv_context(self, recv_type=1): sparse_recv_ctx = {} distributed_recv_ctx = {} - for merged in self.merged_variables_pairs: + variables_pairs = self.merged_variables_pairs if not use_origin_program else self.origin_merged_variables_pairs + for merged in variables_pairs: params = merged[0] if params.merged_var.name in sparse_varnames: continue diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py index fefaecd3b8979..7fc66e8e84961 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py @@ -169,6 +169,10 @@ def do_pyreader_training(self, fleet): except fluid.core.EOFException: self.reader.reset() + if fleet.is_first_worker(): + model_path = tempfile.mkdtemp() + fleet.save_persistables(executor=exe, dirname=model_path) + shutil.rmtree(model_path) fleet.stop_worker() def do_dataset_training(self, fleet): diff --git a/python/paddle/fluid/tests/unittests/fleet_ps_training.py b/python/paddle/fluid/tests/unittests/fleet_ps_training.py index a9e9675a61160..65fa1ef935ef1 100644 --- a/python/paddle/fluid/tests/unittests/fleet_ps_training.py +++ b/python/paddle/fluid/tests/unittests/fleet_ps_training.py @@ -20,8 +20,12 @@ input_x = fluid.layers.data(name="x", shape=[32], dtype='float32') input_y = fluid.layers.data(name="y", shape=[1], dtype='int64') +input_y = fluid.layers.cast(input_y, dtype="float32") + +with fluid.device_guard("gpu"): + input_y = fluid.layers.cast(input_y, dtype="int64") + cost = mlp(input_x, input_y) -cost = mlp(input_x, input_y) optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) role = role_maker.PaddleCloudRoleMaker() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py index 6c5a1d6e36c25..071b68bf9e856 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py @@ -288,7 +288,7 @@ def _run_cluster(self, model, envs): print("tr end communicate") tr0_ret = tr0.returncode - tr1_ret = tr0.returncode + tr1_ret = tr1.returncode # close trainer file tr0_pipe.close() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py index 7f4e5d99e0208..eed8d5f1a496e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py @@ -50,6 +50,10 @@ def build_role(self): def build_strategy(self): self.strategy = paddle.distributed.fleet.DistributedStrategy() self.strategy.a_sync = True + self.strategy.a_sync_configs = { + "launch_barrier": False, + "heter_worker_device_guard": "gpu" + } return self.strategy def build_input(self): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh index e717962ead2e2..4cd8dc3d945e1 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh @@ -28,13 +28,27 @@ function test_launch_ps(){ fi } +function test_launch_ps_heter(){ + fleetrun --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py 2> ut.elog + if grep -q "server are killed" ut.elog; then + 
echo "test heter pserver launch succeed" + else + echo "test pserver launch failed" + exit -1 + fi +} + if [[ ${WITH_GPU} == "OFF" ]]; then + echo "in cpu test mode" test_launch_ps exit 0 fi +echo "No.1 unittest" test_launch_ps +test_launch_ps_heter # use default values +echo "No.2 unittest" fleetrun multi_process.py fleetrun # use paddlecloud @@ -48,6 +62,7 @@ export PADDLE_TRAINER_ID=0 export PADDLE_PORT=35789 export TRAINER_PORTS_NUM=2 +echo "No.3 unittest" distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog" CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun @@ -83,7 +98,7 @@ fi unset PADDLE_PORT export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 -echo "" +echo "No.4 unittest" echo "paddle.distributed.launch async poll process test" if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then echo "train abort as planned" @@ -112,5 +127,6 @@ rm -rf $file_0_0 $file_0_1 distributed_args="--gpus=0,1 --log_dir=testlog" export PADDLE_LAUNCH_LOG="test_launch_filelock_0" +echo "No.5 unittest" CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} find_ports.py str_0="worker_endpoints:127.0.0.1:6070,127.0.0.1:6071" diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_init.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_init.py new file mode 100644 index 0000000000000..9f8ee1b46e827 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_init.py @@ -0,0 +1,149 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test cloud role maker.""" + +from __future__ import print_function +import os +import platform +import shutil +import tempfile +import unittest +import paddle +import paddle.distributed.fleet.base.role_maker as role_maker + + +class TestPSCloudRoleMakerCase1(unittest.TestCase): + """ + Test cases for PaddleCloudRoleMake Parameter Server. + """ + + def setUp(self): + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + + def test_paddle_trainers_num(self): + # PADDLE_TRAINERS_NUM + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertRaises(ValueError, ro._generate_role) + + +class TestPSCloudRoleMakerCase2(unittest.TestCase): + """ + Test cases for PaddleCloudRoleMake Parameter Server. + """ + + def setUp(self): + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + os.environ["PADDLE_TRAINERS_NUM"] = str(2) + + def test_training_role(self): + # TRAINING_ROLE + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertRaises(ValueError, ro._generate_role) + + +class TestPSCloudRoleMakerCase3(unittest.TestCase): + """ + Test cases for PaddleCloudRoleMake Parameter Server. 
+ """ + + def setUp(self): + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + os.environ["PADDLE_TRAINERS_NUM"] = str(2) + os.environ["TRAINING_ROLE"] = 'TRAINER' + + def test_trainer_id(self): + # PADDLE_TRAINER_ID + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertRaises(ValueError, ro._generate_role) + + +class TestPSCloudRoleMakerCase4(unittest.TestCase): + """ + Test cases for PaddleCloudRoleMake Parameter Server. + """ + + def setUp(self): + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + os.environ["PADDLE_TRAINERS_NUM"] = str(2) + os.environ["TRAINING_ROLE"] = 'PSERVER' + + def test_ps_port(self): + # PADDLE_PORT + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertRaises(ValueError, ro._generate_role) + + +class TestPSCloudRoleMakerCase5(unittest.TestCase): + """ + Test cases for PaddleCloudRoleMake Parameter Server. + """ + + def setUp(self): + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + os.environ["PADDLE_TRAINERS_NUM"] = str(2) + os.environ["TRAINING_ROLE"] = 'PSERVER' + os.environ["PADDLE_PORT"] = str(4001) + + def test_ps_ip(self): + # POD_IP + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertRaises(ValueError, ro._generate_role) + + +class TestPSCloudRoleMakerCase6(unittest.TestCase): + """ + Test cases for PaddleCloudRoleMake Parameter Server. + """ + + def setUp(self): + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + os.environ[ + "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:4003,127.0.0.1:4004" + os.environ["PADDLE_TRAINERS_NUM"] = str(2) + os.environ["TRAINING_ROLE"] = 'HETER_TRAINER' + + def test_heter_port(self): + # PADDLE_PORT + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertRaises(ValueError, ro._generate_role) + + +class TestPSCloudRoleMakerCase7(unittest.TestCase): + """ + Test cases for PaddleCloudRoleMake Parameter Server. 
+ """ + + def setUp(self): + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:4001,127.0.0.1:4002" + os.environ[ + "PADDLE_HETER_TRAINER_IP_PORT_LIST"] = "127.0.0.1:4003,127.0.0.1:4004" + os.environ["PADDLE_TRAINERS_NUM"] = str(2) + os.environ["TRAINING_ROLE"] = 'HETER_TRAINER' + os.environ["PADDLE_PORT"] = str(4003) + + def test_heter_ip(self): + # POD_IP + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertRaises(ValueError, ro._generate_role) + + +if __name__ == "__main__": + unittest.main() From af57537ec78594e74db3acadf8107bd646e01db8 Mon Sep 17 00:00:00 2001 From: Peihan Date: Tue, 13 Oct 2020 09:58:33 +0800 Subject: [PATCH 91/91] remove dy2static test_lac predictor run case (#27844) * remove test_lac predictor run case --- .../unittests/dygraph_to_static/test_lac.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index d8cb3854d3e23..c9bc8cc647df3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -28,8 +28,6 @@ from paddle.fluid.dygraph import declarative, ProgramTranslator from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from predictor_utils import PredictorTools - SEED = 2020 program_translator = ProgramTranslator() @@ -540,7 +538,6 @@ def verify_predict(self): dy_pre = self.predict_dygraph(batch) st_pre = self.predict_static(batch) dy_jit_pre = self.predict_dygraph_jit(batch) - predictor_pre = self.predict_analysis_inference(batch) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -548,10 +545,6 @@ def verify_predict(self): np.allclose(dy_jit_pre, st_pre), msg="dy_jit_pre:\n {}\n, st_pre: \n{}.".format(dy_jit_pre, st_pre)) - self.assertTrue( - np.allclose(predictor_pre, st_pre), - msg="predictor_pre:\n {}\n, st_pre: \n{}.".format(predictor_pre, - st_pre)) def predict_dygraph(self, batch): words, targets, length = batch @@ -602,15 +595,6 @@ def predict_dygraph_jit(self, batch): return pred_res.numpy() - def predict_analysis_inference(self, batch): - words, targets, length = batch - - output = PredictorTools(self.args.model_save_dir, - self.args.model_filename, - self.args.params_filename, [words, length]) - out = output() - return out - if __name__ == "__main__": unittest.main()
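
Usage sketch (for reference; the commands mirror the unit tests added above, and fleet_ps_training.py stands in for any user training script):

    # plain parameter-server mode: 2 servers and 2 CPU workers on localhost
    fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py

    # heterogeneous parameter-server mode: 2 additional heter workers placed on GPU/XPU devices
    fleetrun --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py

    # explicit endpoints may be passed instead of counts (addresses here are illustrative)
    fleetrun --servers="127.0.0.1:6170,127.0.0.1:6171" --workers="127.0.0.1:6172,127.0.0.1:6173" fleet_ps_training.py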