fuse L2Decay and momentum when param.regularizer is set #32845

Merged · 5 commits · Jun 10, 2021
100 changes: 93 additions & 7 deletions python/paddle/fluid/optimizer.py
@@ -33,7 +33,6 @@
from .initializer import Constant
from .layer_helper import LayerHelper
from .layers import ops
from .regularizer import append_regularization_ops
from .dygraph import base as imperative_base
from .dygraph import no_grad
from .dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
@@ -884,6 +883,93 @@ def backward(self,
act_no_grad_set, callbacks)
return params_grads

def _create_regularization_of_grad(self, param, grad, regularization=None):
""" Create and add backward regularization Operators

Function helper of append_regularization_ops.
"""
# If no gradient or no regularization is specified, then we don't need to do anything
if grad is None or ((not hasattr(param, 'regularizer') or
(hasattr(param, 'regularizer') and
param.regularizer is None)) and
regularization is None):
return grad
regularization_term = None
if hasattr(param, 'regularizer') and param.regularizer is not None:
# Add variable for regularization term in grad block
regularization_term = param.regularizer(param, grad, grad.block)
elif regularization is not None:
regularization_term = regularization(param, grad, grad.block)

assert regularization_term is not None

new_grad = grad
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
# FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
# the grad's type and name will be changed. But the gradient's name
# is used in ParallelExecutor Reduce mode, so I add a flag for
# the new_grad here.
new_grad = grad.block.create_var(
name=grad.name + core.kNewGradSuffix(),
dtype=param.dtype,
shape=param.shape,
lod_level=param.lod_level,
type=core.VarDesc.VarType.LOD_TENSOR)

inputs = {"X": [grad, regularization_term]}
outputs = {"Out": [new_grad]}
if framework.in_dygraph_mode():
new_grad = core.ops.sum([grad, regularization_term])
else:
grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)

return new_grad

def append_regularization_ops(self,
parameters_and_grads,
regularization=None):
r"""Create and add backward regularization Operators

Creates and adds backward regularization operators in the BlockDesc.
This will add gradients of the regularizer function to the gradients
of the parameters and return these modified gradients. This is the
same as implementing weight decay in optimizers for regularization.

Args:
parameters_and_grads: A list of (parameters, gradients) pairs
that need to be regularized.
regularization: A global regularizer. It is applied only to
parameters that do not have a regularizer of their own.

Returns:
list[(Variable, Variable)]: list of (parameter, gradient) \
pairs with the regularized gradients

Raises:
Exception: Unknown regularization type
"""
params_and_grads = []
if framework.in_dygraph_mode():
for param, grad in parameters_and_grads:
new_grad = self._create_regularization_of_grad(param, grad,
regularization)
params_and_grads.append((param, new_grad))
else:
repeate_regularizer = False
with framework.name_scope('regularization'):
for param, grad in parameters_and_grads:
if not repeate_regularizer and param.regularizer is not None and regularization is not None:
repeate_regularizer = True
logging.info(
"If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
"The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% regularization.__str__())
with param.block.program._optimized_guard([param, grad]):
new_grad = self._create_regularization_of_grad(
param, grad, regularization)
params_and_grads.append((param, new_grad))
return params_and_grads

def apply_gradients(self, params_grads):
"""
Second part of `minimize`, appending optimization operators for
@@ -916,8 +1002,8 @@ def apply_gradients(self, params_grads):
params_grads = append_gradient_clip_ops(params_grads)

# Add regularization if any
params_grads = append_regularization_ops(params_grads,
self.regularization)
params_grads = self.append_regularization_ops(params_grads,
self.regularization)

optimize_ops = self._create_optimization_pass(params_grads)
return optimize_ops
@@ -939,8 +1025,8 @@ def apply_optimize(self, loss, startup_program, params_grads):
framework.default_startup_program()):
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
params_grads = append_regularization_ops(params_grads,
self.regularization)
params_grads = self.append_regularization_ops(
params_grads, self.regularization)
optimize_ops = self._create_optimization_pass(params_grads)
else:
program = loss.block.program
@@ -1674,8 +1760,8 @@ def apply_gradients(self, params_grads):
not_dgc_params_grads = append_gradient_clip_ops(
not_dgc_params_grads)

not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads,
self.regularization)
not_dgc_params_grads = self.append_regularization_ops(
not_dgc_params_grads, self.regularization)

params_grads = not_dgc_params_grads + dgc_params_grads
params_grads = sorted(params_grads, key=lambda x: x[0].name)
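
For orientation, a minimal static-graph sketch (an assumption, not part of this diff) of the relocated API: append_regularization_ops now lives on the optimizer instance, which lets subclasses such as Momentum override _create_regularization_of_grad and skip the appended sum op for parameters whose L2Decay will be fused into the optimizer op.

import paddle

paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name='x', shape=[4, 10], dtype='float32')
    loss = paddle.mean(paddle.nn.Linear(10, 10)(x))
    # Build (param, grad) pairs, then let the optimizer append its regularization ops.
    params_grads = paddle.static.append_backward(loss)
    opt = paddle.optimizer.Adam()
    # The global L2Decay is applied only to parameters without their own regularizer.
    params_grads = opt.append_regularization_ops(
        params_grads, paddle.regularizer.L2Decay(0.1))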
86 changes: 0 additions & 86 deletions python/paddle/fluid/regularizer.py
@@ -22,92 +22,6 @@
__all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']


def _create_regularization_of_grad(param, grad, regularization=None):
""" Create and add backward regularization Operators

Function helper of append_regularization_ops.
"""
# If no gradient or no regularization is specified, then we don't need to do anything
if grad is None or ((not hasattr(param, 'regularizer') or (
hasattr(param, 'regularizer') and param.regularizer is None)) and
regularization is None):
return grad
regularization_term = None
if hasattr(param, 'regularizer') and param.regularizer is not None:
# Add variable for regularization term in grad block
regularization_term = param.regularizer(param, grad, grad.block)
elif regularization is not None:
regularization_term = regularization(param, grad, grad.block)

assert regularization_term is not None

new_grad = grad
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
# FIXME(zcd): If the grad is SELECTED_ROWS, after regularization,
# the grad's type and name will be changed. But the gradient's name
# is used in ParallelExecutor Reduce mode, so I add a flag for
# the new_grad here.
new_grad = grad.block.create_var(
name=grad.name + core.kNewGradSuffix(),
dtype=param.dtype,
shape=param.shape,
lod_level=param.lod_level,
type=core.VarDesc.VarType.LOD_TENSOR)

inputs = {"X": [grad, regularization_term]}
outputs = {"Out": [new_grad]}
if in_dygraph_mode():
new_grad = core.ops.sum([grad, regularization_term])
else:
grad.block.append_op(type='sum', inputs=inputs, outputs=outputs)

return new_grad


def append_regularization_ops(parameters_and_grads, regularization=None):
r"""Create and add backward regularization Operators

Creates and adds backward regularization operators in the BlockDesc.
This will add gradients of the regularizer function to the gradients
of the parameters and return these modified gradients. This is the
same as implementing weight decay in optimizers for regularization.

Args:
parameters_and_grads: A list of (parameters, gradients) pairs
that need to be regularized.
regularization: A global regularizer. If the parameter is not
set. It will be applied with regularizer.

Returns:
list[(Variable, Variable)]: list of (parameters, gradients) \
pair with the regularized gradient

Raises:
Exception: Unknown regularization type
"""
params_and_grads = []
if in_dygraph_mode():
for param, grad in parameters_and_grads:
new_grad = _create_regularization_of_grad(param, grad,
regularization)
params_and_grads.append((param, new_grad))
else:
repeate_regularizer = False
with framework.name_scope('regularization'):
for param, grad in parameters_and_grads:
if not repeate_regularizer and param.regularizer is not None and regularization is not None:
repeate_regularizer = True
logging.info(
"If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. "
"The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
% regularization.__str__())
with param.block.program._optimized_guard([param, grad]):
new_grad = _create_regularization_of_grad(param, grad,
regularization)
params_and_grads.append((param, new_grad))
return params_and_grads


class WeightDecayRegularizer(object):
"""Base class for weight decay regularizers

71 changes: 71 additions & 0 deletions python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -613,6 +613,77 @@ def test_momentum_static(self):
exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)


class TestFusedMomentumWithDecayAPI(unittest.TestCase):
def get_program(self, weight_attr, bias_attr=False):
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(
main_program=main_program, startup_program=startup_program):
x = paddle.static.data(name='x', shape=[10, 10])
linear = paddle.nn.Linear(
10, 10, weight_attr=weight_attr, bias_attr=bias_attr)
out = linear(x)
loss = paddle.mean(out)
optimizer = paddle.optimizer.Momentum(
learning_rate=0.01,
momentum=0.9,
weight_decay=paddle.regularizer.L2Decay(0.5))
optimizer.minimize(loss)
return main_program

def test_param_has_l2decay(self):
paddle.enable_static()
weight_attr = paddle.ParamAttr(
name="weight",
initializer=paddle.nn.initializer.Constant(value=0.5),
regularizer=paddle.regularizer.L2Decay(0.1))
program = self.get_program(weight_attr, bias_attr=False)
ops = program.global_block().ops

self.assertEqual(ops[-1].attr('regularization_method'), 'l2_decay')
self.assertEqual(ops[-1].attr('regularization_coeff'), np.float32(0.1))
for i in range(len(ops)):
self.assertTrue('sum' not in ops[i].type)
self.assertTrue('scale' not in ops[i].type)

def test_param_has_l1decay(self):
paddle.enable_static()
weight_attr = paddle.ParamAttr(
name="weight",
initializer=paddle.nn.initializer.Constant(value=0.5),
regularizer=paddle.regularizer.L1Decay(0.1))
bias_attr = paddle.ParamAttr(
name="bias",
initializer=paddle.nn.initializer.Constant(value=0.),
regularizer=None)
program = self.get_program(weight_attr, bias_attr)
ops = program.global_block().ops

self.assertEqual(ops[-1].type, 'momentum')
self.assertEqual(ops[-2].type, 'momentum')
self.assertEqual(ops[-3].type, 'sum')
self.assertEqual(ops[-4].type, 'scale')
self.assertEqual(ops[-5].type, 'sign')
self.assertEqual(ops[-6].type, 'matmul_grad')
if 'weight' in ops[-1].input('Param'):
self.assertEqual(ops[-1].attr('regularization_method'), '')
self.assertEqual(ops[-1].attr('regularization_coeff'), 0)
if 'bias' in ops[-2].input('Param'):
self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay')
self.assertEqual(ops[-2].attr('regularization_coeff'),
np.float32(0.5))

def test_param_has_no_regularizer(self):
paddle.enable_static()
program = self.get_program(weight_attr=None)
ops = program.global_block().ops
self.assertEqual(ops[-1].attr('regularization_method'), 'l2_decay')
self.assertEqual(ops[-1].attr('regularization_coeff'), np.float32(0.5))
for i in range(len(ops)):
self.assertTrue('sum' not in ops[i].type)
self.assertTrue('scale' not in ops[i].type)


class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase):
def __update_params(self, momentum, linear):
for i in range(10):
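
For reference, a hedged dygraph sketch of the configuration these static-mode tests exercise: a per-parameter L2Decay set through paddle.ParamAttr takes precedence over the optimizer-level weight_decay and, with this PR, is folded into the momentum op instead of being appended as separate scale/sum ops.

import paddle

paddle.disable_static()  # dygraph mode
linear = paddle.nn.Linear(
    10, 10,
    weight_attr=paddle.ParamAttr(
        regularizer=paddle.regularizer.L2Decay(0.1)))
opt = paddle.optimizer.Momentum(
    learning_rate=0.01,
    momentum=0.9,
    # Global decay; used only for parameters without their own regularizer.
    weight_decay=paddle.regularizer.L2Decay(0.5),
    parameters=linear.parameters())
loss = paddle.mean(linear(paddle.rand([4, 10])))
loss.backward()
opt.step()        # the weight's decay (0.1) is applied inside the fused momentum kernel
opt.clear_grad()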
2 changes: 2 additions & 0 deletions python/paddle/fluid/tests/unittests/test_regularizer.py
@@ -59,6 +59,7 @@ def test_l2decay_regularizer(self):
params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1)
count_ops = len(block.ops)
optimizer = paddle.optimizer.Adam()
params_grads = optimizer.append_regularization_ops(params_grads)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(block.ops), count_ops + 2)
@@ -97,6 +98,7 @@ def test_l2decay_regularizer(self):
params_grads = append_backward(mean_out)
self.assertEqual(len(params_grads), 1)
count_ops = len(block.ops)
optimizer = paddle.optimizer.Adam()
params_grads = optimizer.append_regularization_ops(params_grads)
self.assertEqual(len(params_grads), 1)
self.assertEqual(len(block.ops), count_ops + 3)
35 changes: 31 additions & 4 deletions python/paddle/optimizer/momentum.py
@@ -252,6 +252,19 @@ def _create_accumulators(self, block, parameters):
)
self._add_accumulator(self._velocity_acc_str, p)

def _create_regularization_of_grad(self, param, grad, regularization=None):
""" Create and add backward regularization Operators

Function helper of append_regularization_ops.
"""
# If the parameter's regularizer (set via ParamAttr) is L2Decay, skip regularization here;
# the L2Decay is instead fused into the momentum op, see _append_optimize_op below.
if hasattr(param, 'regularizer') and isinstance(param.regularizer,
L2DecayRegularizer):
return grad
return super(Momentum, self)._create_regularization_of_grad(
param, grad, regularization)

def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
if isinstance(param_and_grad, dict):
@@ -261,15 +274,29 @@ def _append_optimize_op(self, block, param_and_grad):
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)

# For fusion of momentum and l2decay
param = param_and_grad[0]
regularization_method = self._regularization_method
regularization_coeff = self._regularization_coeff
if hasattr(param, 'regularizer'):
# The param's L2Decay was skipped in _create_regularization_of_grad, so fuse it into the momentum op here.
if isinstance(param.regularizer, L2DecayRegularizer):
regularization_method = "l2_decay"
regularization_coeff = param.regularizer._regularization_coeff
# The param's own regularizer has already been applied, so disable L2Decay inside the momentum op.
elif param.regularizer is not None:
regularization_method = ""
regularization_coeff = 0

if framework.in_dygraph_mode():
if isinstance(param_and_grad, dict):
self._update_regularization(param_and_grad['weight_decay'])
_, _ = core.ops.momentum(
param_and_grad[0], param_and_grad[1], velocity_acc, lr,
param_and_grad[0], velocity_acc, 'mu', self._momentum,
'use_nesterov', self._use_nesterov, 'regularization_method',
self._regularization_method, 'regularization_coeff',
self._regularization_coeff)
regularization_method, 'regularization_coeff',
regularization_coeff)
return None

find_master = self._multi_precision and param_and_grad[
@@ -280,8 +307,8 @@ def _append_optimize_op(self, block, param_and_grad):
attrs = {
"mu": self._momentum,
"use_nesterov": self._use_nesterov,
"regularization_method": self._regularization_method,
"regularization_coeff": self._regularization_coeff,
"regularization_method": regularization_method,
"regularization_coeff": regularization_coeff,
"multi_precision": find_master,
"rescale_grad": self._rescale_grad
}
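
In plain terms, a NumPy sketch (assuming the standard semantics of the momentum op, not code from this PR) of the update performed when regularization_method is 'l2_decay': the decay term is added to the gradient inside the kernel, which is why the tests above expect no separate sum or scale ops.

import numpy as np

def fused_momentum_step(param, grad, velocity, lr, mu, l2_coeff, use_nesterov=False):
    # L2 decay fused into the op: regularize the gradient in the kernel itself.
    grad = grad + l2_coeff * param
    velocity_out = mu * velocity + grad
    if use_nesterov:
        param_out = param - lr * (grad + mu * velocity_out)
    else:
        param_out = param - lr * velocity_out
    return param_out, velocity_out

p = np.full((2, 2), 0.5, dtype=np.float32)
g = np.ones((2, 2), dtype=np.float32)
v = np.zeros_like(p)
p, v = fused_momentum_step(p, g, v, lr=0.01, mu=0.9, l2_coeff=0.1)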