diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
index 2de5242831835..2082f8bb76fb6 100644
--- a/paddle/framework/grad_op_desc_maker.h
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -87,7 +87,11 @@ class GradOpDescMakerBase {
     auto onames = this->Output(name);
     ret_val.reserve(onames.size());
     std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
-                   GradVarName);
+                   [this](const std::string& fwd_var_name) -> std::string {
+                     auto g_name = GradVarName(fwd_var_name);
+                     (*this->grad_to_var_)[g_name] = fwd_var_name;
+                     return g_name;
+                   });
     return ret_val;
   }

diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index 4cf784a0d0d31..a5ffb162928bf 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -129,7 +129,7 @@ class OpDesc {
   }

   proto::OpDesc desc_;
-  // input arg name => output variable names
+  // input arg name => input variable names
   VariableNameMap inputs_;
   // output arg name => output variable names
   VariableNameMap outputs_;
diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc
index b198b76cd49ca..0eeafcaae0a01 100644
--- a/paddle/operators/norm_op.cc
+++ b/paddle/operators/norm_op.cc
@@ -39,7 +39,7 @@ class NormOpMaker : public framework::OpProtoAndCheckerMaker {
              "M = C * H * W");
     AddComment(R"DOC(
  "Input shape: $(N, C, H, W)$
-    Sclae shape: $(C, 1)$
+    Scale shape: $(C, 1)$
     Output shape: $(N, C, H, W)$
     Where
       forward
diff --git a/paddle/operators/norm_op.h b/paddle/operators/norm_op.h
index 7bee48919e9fd..5759d6f1f07d5 100644
--- a/paddle/operators/norm_op.h
+++ b/paddle/operators/norm_op.h
@@ -66,7 +66,7 @@ class NormKernel : public framework::OpKernel<T> {
                                  context.GetPlace());
     auto tmp = framework::EigenVector<T>::Flatten(tmp_tensor);
-    // get colsum and sqrt , inverse
+    // get colsum and sqrt, inverse
     auto dim = Eigen::array<int, 1>({{0}});
     tmp.device(*place) = x_square_batch_eigen.sum(dim);
     tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
index 8d0eb53b8e1ff..cea2d1e09068d 100644
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -1,8 +1,9 @@
 from paddle.v2.fluid import framework as framework
 from . import core
 import collections
+import copy

-__all__ = ['append_backward']
+__all__ = ['append_backward', 'calc_gradient']


 def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
@@ -65,6 +66,18 @@ def _all_in_set_(cands, s):
     return True


+def _some_in_set_(cands, s):
+    """
+    Test if some elements of 'cands' are in set 's'
+    """
+    if len(cands) == 0:
+        return False
+    for c in cands:
+        if c in s:
+            return True
+    return False
+
+
 def _strip_grad_suffix_(name):
     """
     Strip the grad suffix from the given varibale name
@@ -169,8 +182,8 @@ def _op_can_be_removed_(op_desc, no_grad_set):
     return op_descs


-def _append_backward_ops_(target,
-                          block,
+def _append_backward_ops_(block,
+                          ops,
                           target_block,
                           no_grad_dict,
                           grad_to_var,
@@ -179,8 +192,8 @@ def _append_backward_ops_(target,
     Create all grad ops, and insert them into given block

     Args:
-        target(Variable): the target variable of forward pass
         block(Block): the block where forward ops are
+        ops(Op): the forward operators whose backward ops need to be added
         target_block(Block): the block which is going to hold new generated grad ops
         no_grad_dict(dict):
             key(int) block index
@@ -202,14 +215,14 @@ def empty_callback(block, context):
     # grad_op_descs holds created grad_op, and will be appended to target_block
     grad_op_descs = []
     program = block.program
-    for op in reversed(block.ops):
+    for op in reversed(ops):
         grad_sub_block_list = []
         # If the op has its own sub-block, deal with the sub-block first
         if op.has_attr("sub_block"):
             sub_block = program.block(op.block_attr("sub_block"))
             grad_sub_block = program.create_block(parent_idx=sub_block.idx)
-            _append_backward_ops_(target, sub_block, grad_sub_block,
-                                  no_grad_dict, grad_to_var, callback)
+            _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
+                                  no_grad_dict, grad_to_var)
             grad_sub_block_list.append(grad_sub_block.desc)

         # Getting op's corresponding grad_op
@@ -224,14 +237,6 @@ def empty_callback(block, context):
     grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
                                             no_grad_dict[block.idx])

-    if target_block.idx == 0:
-        grad_op_descs.insert(
-            0,
-            _create_op_desc_("fill_constant", {}, {
-                "Out": [_append_grad_suffix_(target.name)]
-            }, {"shape": [1],
-                "value": 1.0,
-                "dtype": target.dtype}))
     # append op_desc in grad_op_descs to target_block
     for op_desc in grad_op_descs:
         new_op_desc = target_block.desc.append_op()
@@ -252,7 +257,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
             In most cases, this dict is generated by _append_backward_ops_()
         grad_info_map(dict)(output argument):
             key(str): forward variable name
-            val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+            val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable
     """
     for op_idx in range(start_op_idx, block.desc.op_size()):
         op_desc = block.desc.op(op_idx)
@@ -279,41 +284,63 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
         _infer_var_data_type_(arg, block)


+def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
+    var_map = copy.copy(target_grad_map)
+    for op_idx in range(start_op_idx, block.desc.op_size()):
+        op_desc = block.desc.op(op_idx)
+        for name in op_desc.input_arg_names():
+            if name in var_map:
+                op_desc.rename_input(name, var_map[name])
+
+        for name in op_desc.output_arg_names():
+            if block.desc.find_var(name.encode("ascii")):
+                new_name = "%s_%s" % (name, core.unique_integer(name))
+                op_desc.rename_output(name, new_name)
+                var_map[name] = new_name
+
+    for g, ng in var_map.iteritems():
+        if g in grad_to_var:
+            grad_to_var[ng] = grad_to_var[g]
+            grad_to_var.pop(g)
+
+
+def _get_stop_gradients_(program):
+    no_grad_dict = dict()
+    assert isinstance(program, framework.Program)
+    for block in program.blocks:
+        assert isinstance(block, framework.Block)
+        block_no_grad_set = set()
+        for var in block.vars.itervalues():
+            assert isinstance(var, framework.Variable)
+            if var.stop_gradient:
+                block_no_grad_set.add(_append_grad_suffix_(var.name))
+        no_grad_dict[block.idx] = block_no_grad_set
+    return no_grad_dict
+
+
 def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
     """
     Append backward part to main_program

     Args:
         loss(Variable): The variable generated by cost function.
-        parameter_list(list): Parameters that need to be updated by optimizer.
-            If None, it means all parameters need to be updated.
+        parameter_list(list[string]): Parameters that need to be updated by
+            optimizer. If None, it means all parameters need to be updated.
         no_grad_set(set): Variables that have no gradients in Block 0.
-            If None, the set will be generated inside the function and
-            contains all variables with `step_gradient=True` from all blocks.
+            All variables with `stop_gradient=True` from all blocks will be
+            automatically added.

     Return:
-        (list[Variable]): list of (parameters, gradients) pair.
+        (list[(Variable,Variable)]): list of (parameter, gradient) pairs.
     """
     assert isinstance(loss, framework.Variable)

     program = loss.block.program
-    no_grad_dict = dict()
     if no_grad_set is None:
-        assert isinstance(program, framework.Program)
-        for block in program.blocks:
-            assert isinstance(block, framework.Block)
-            block_no_grad_set = set()
-            for var in block.vars.itervalues():
-                assert isinstance(var, framework.Variable)
-                if var.stop_gradient:
-                    block_no_grad_set.add(_append_grad_suffix_(var.name))
-            no_grad_dict[block.idx] = block_no_grad_set
-    elif isinstance(no_grad_set, set):
-        no_grad_dict = {
-            0: set([_append_grad_suffix_(name) for name in no_grad_set])
-        }
-    else:
-        raise ValueError("'no_grad_set' should be a set or None.")
+        no_grad_set = set()
+    no_grad_set = copy.copy(no_grad_set)
+    no_grad_dict = _get_stop_gradients_(program)
+    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))

     grad_info_map = dict()
     root_block = program.block(0)
@@ -322,8 +349,25 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
     current_block_idx = program.current_block_idx
     grad_to_var = dict()

-    _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
+    op_desc = _create_op_desc_("fill_constant", {}, {
+        "Out": [_append_grad_suffix_(loss.name)]
+    }, {"shape": [1],
+        "value": 1.0,
+        "dtype": loss.dtype})
+    root_block.desc.append_op().copy_from(op_desc)
+
+    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
+    op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
+    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+
+    _append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
                           grad_to_var, callback)
+
+    # Because calc_gradient may be called multiple times,
+    # we need to rename the internal gradient variables so that they have
+    # different names.
+    _rename_grad_(root_block, fwd_op_num, grad_to_var, {})
+
     _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)

     program.current_block_idx = current_block_idx
@@ -334,6 +378,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
     else:
         params = program.global_block().all_parameters()
         parameters = [param.name for param in params]
+
     params_and_grads = []
     for param in parameters:
         if param not in grad_info_map:
@@ -351,3 +396,147 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
         else:
             params_and_grads.append((param_var, None))
     return params_and_grads
+
+
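For context on the reworked `append_backward` above, here is a minimal usage sketch (the toy network, shapes, and variable names are illustrative only and not part of this patch):

    import paddle.v2.fluid as fluid
    import paddle.v2.fluid.layers as layers
    from paddle.v2.fluid.backward import append_backward

    # Toy forward program: loss = mean(x * w).
    x = layers.data(name='x', shape=[13], dtype='float32')
    w = layers.create_parameter(shape=[13, 1], dtype='float32')
    y = layers.mul(x=x, y=w)
    loss = layers.mean(x=y)

    # append_backward builds the grad ops for `loss` and returns a list of
    # (parameter, gradient) pairs; a parameter that receives no gradient is
    # paired with None.
    params_and_grads = append_backward(loss)
    for param, grad in params_and_grads:
        print("%s -> %s" % (param.name, "None" if grad is None else grad.name))
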
+def _as_list(x):
+    if x is None:
+        return []
+    return list(x) if isinstance(x, collections.Sequence) else [x]
+
+
+def _find_op_path_(block, outputs, inputs, no_grad_set):
+    """
+    no_grad_set will also be changed
+    """
+    input_names = set([inp.name for inp in inputs])
+    output_names = set([out.name for out in outputs])
+
+    relevant_op_flags = [True] * len(block.ops)
+
+    # All the inputs of the block are used if inputs is empty.
+    if inputs:
+        for i, op in enumerate(block.ops):
+            if _some_in_set_(op.desc.input_arg_names(), input_names):
+                for name in op.desc.output_arg_names():
+                    if name not in no_grad_set:
+                        input_names.add(name)
+            else:
+                relevant_op_flags[i] = False
+
+    for i, op in reversed(list(enumerate(block.ops))):
+        if _some_in_set_(op.desc.output_arg_names(), output_names):
+            for name in op.desc.input_arg_names():
+                if name not in no_grad_set:
+                    output_names.add(name)
+        else:
+            relevant_op_flags[i] = False
+
+    op_path = [
+        block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i]
+    ]
+
+    if inputs:
+        for op in op_path:
+            for name in op.desc.input_arg_names():
+                if name not in input_names:
+                    no_grad_set.add(name)
+
+    return op_path
+
+
+def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
+    """
+    Backpropagate the gradients of targets to inputs.
+
+    Args:
+        targets(Variable|list[Variable]): The target variables
+        inputs(Variable|list[Variable]): The input variables
+        no_grad_set(set[string]): The names of variables that have no gradients
+            in Block 0. All variables with `stop_gradient=True` from all blocks
+            will be automatically added.
+
+    Return:
+        (list[Variable]): list of gradients for inputs
+        If an input does not affect targets, the corresponding gradient variable
+        will be None
+    """
+    targets = _as_list(targets)
+    inputs = _as_list(inputs)
+    target_gradients = _as_list(target_gradients)
+
+    block = targets[0].block
+    prog = block.program
+    block_idx = block.idx
+
+    if not target_gradients:
+        target_gradients = [None] * len(targets)
+
+    if len(targets) != len(target_gradients):
+        raise ValueError(
+            "Should have the same number of target_gradients as targets")
+
+    if no_grad_set is None:
+        no_grad_set = set()
+    no_grad_set = copy.copy(no_grad_set)
+    no_grad_dict = _get_stop_gradients_(prog)
+    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
+
+    fwd_op_num = block.desc.op_size()
+
+    target_grad_map = {}
+    for i, grad in enumerate(target_gradients):
+        target = targets[i]
+        if grad is None:
+            grad_name = _append_grad_suffix_(target.name)
+            op_desc = _create_op_desc_("fill_constant_batch_size_like",
+                                       {"Input": [target.name]},
+                                       {"Out": [grad_name]}, {
+                                           "shape": target.shape,
+                                           "value": 1.0,
+                                           "dtype": target.dtype,
+                                           'input_dim_idx': 0,
+                                           'output_dim_idx': 0
+                                       })
+            block.desc.append_op().copy_from(op_desc)
+        else:
+            if target.block.idx != block_idx or target.block.program != prog:
+                raise ValueError("all targets must be in the same block")
+            if target.shape != grad.shape:
+                raise ValueError(
+                    "The shapes of target and grad are different: %s %s" % (
+                        target.name, grad.name))
+            target_grad_map[_append_grad_suffix_(target.name)] = grad.name
+
+    for input in inputs:
+        if input.block.program != prog:
+            raise ValueError("input must be in the same program as targets")
+
+    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
+    op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
+    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+    grad_to_var = dict()
+    grad_info_map = dict()
+    _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
+
+    # Because calc_gradient may be called multiple times,
+    # we need to rename the internal gradient variables so that they have
+    # different names.
+    _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map)
+
+    _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map)
+    prog.sync_with_cpp()
+
+    grad_vars = []
+    for input_var in inputs:
+        if input_var.name not in grad_info_map:
+            grad_vars.append(None)
+        else:
+            grad_info = grad_info_map[input_var.name]
+            grad_block = grad_info[1]
+            grad_var = grad_block.var(grad_info[0])
+            grad_vars.append(grad_var)
+
+    if len(grad_vars) == 1:
+        return grad_vars[0]
+    else:
+        return grad_vars
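To make the new `calc_gradient` API concrete, a hedged usage sketch follows (it mirrors the unit test added at the end of this patch; the variable names are illustrative). When `target_gradients` is not given, a gradient of ones is filled in for each target via `fill_constant_batch_size_like`, as the code above shows:

    import paddle.v2.fluid as fluid
    import paddle.v2.fluid.layers as layers
    from paddle.v2.fluid.backward import calc_gradient

    x = layers.create_parameter(dtype='float32', shape=[5, 10])
    y = layers.create_parameter(dtype='float32', shape=[10, 8])
    out = layers.mul(x=x, y=y)
    loss = layers.mean(x=out)

    # Gradients w.r.t. an intermediate variable and w.r.t. the parameters.
    # A single input yields a single Variable; a list yields a list.
    d_out = calc_gradient(loss, out)
    d_x, d_y = calc_gradient(loss, [x, y])

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    out_grad, x_grad, y_grad = exe.run(fluid.default_main_program(),
                                       fetch_list=[d_out, d_x, d_y])
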
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index 23fe13f9bbf3e..544623c4bce0c 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -1,7 +1,17 @@
 from ..registry import register_layer

 __activations__ = [
-    'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round'
+    'abs',
+    'ceil',
+    'exp',
+    'floor',
+    'log',
+    'relu',
+    'round',
+    'sigmoid',
+    'sqrt',
+    'square',
+    'tanh',
 ]

 __all__ = [
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index 9ce25a9e0831a..5f12ecfc14f75 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -1,7 +1,8 @@
 from ..layer_helper import LayerHelper
+from ..param_attr import ParamAttr

 __all__ = [
-    'create_tensor', 'cast', 'concat', 'sums', 'assign',
+    'create_tensor', 'create_parameter', 'cast', 'concat', 'sums', 'assign',
     'fill_constant_batch_size_like', 'fill_constant', 'ones', 'zeros'
 ]

@@ -11,6 +12,33 @@ def create_tensor(dtype, name=None):
     return helper.create_variable(name=helper.name, dtype=dtype)


+def create_parameter(shape,
+                     dtype,
+                     attr=None,
+                     is_bias=False,
+                     default_initializer=None):
+    """
+    Create a parameter
+    Args:
+        shape(list[int]): shape of the parameter
+        dtype(string): element type of the parameter
+        attr(ParamAttr): attributes of the parameter
+        is_bias(bool): This can affect which default initializer is chosen
+                       when default_initializer is None. If is_bias,
+                       initializer.Constant(0.0) will be used. Otherwise,
+                       Xavier() will be used.
+        default_initializer(Initializer): initializer for the parameter
+
+    Returns:
+        Parameter: the created parameter
+    """
+    helper = LayerHelper("create_parameter")
+    if attr is None:
+        attr = ParamAttr()
+    return helper.create_parameter(attr, shape, dtype, is_bias,
+                                   default_initializer)
+
+
 def cast(x, dtype):
     """
     This function takes in the input with input_dtype
@@ -180,7 +208,8 @@ def fill_constant_batch_size_like(input,
     Examples:
         .. code-block:: python

-          data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
+          data = fluid.layers.fill_constant_batch_size_like(
+              input=like, shape=[1], value=0, dtype='int64')
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
     out = helper.create_tmp_variable(dtype=dtype)
diff --git a/python/paddle/v2/fluid/tests/test_calc_gradient.py b/python/paddle/v2/fluid/tests/test_calc_gradient.py
new file mode 100644
index 0000000000000..c34c8ff6d143f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_calc_gradient.py
@@ -0,0 +1,25 @@
+import unittest
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.backward import calc_gradient
+
+
+class TestCalcGradient(unittest.TestCase):
+    def test_calc_gradient(self):
+        x = layers.create_parameter(dtype="float32", shape=[5, 10])
+        y = layers.create_parameter(dtype="float32", shape=[10, 8])
+        mul_out = layers.mul(x=x, y=y)
+        mean_out = layers.mean(x=mul_out)
+        a = calc_gradient(mean_out, mul_out)
+        b = calc_gradient(mean_out, x)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
+
+
+if __name__ == "__main__":
+    unittest.main()
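As a usage note for the new `create_parameter` layer introduced in tensor.py above, here is a short hedged sketch (the parameter name and attribute values are made up for illustration):

    import paddle.v2.fluid.layers as layers
    from paddle.v2.fluid.param_attr import ParamAttr

    # Weight-style parameter: the Xavier initializer is picked by default.
    w = layers.create_parameter(shape=[10, 8], dtype='float32')

    # Bias-style parameter: Constant(0.0) becomes the default initializer.
    b = layers.create_parameter(shape=[8], dtype='float32', is_bias=True)

    # Parameter with explicit attributes supplied via ParamAttr.
    w_named = layers.create_parameter(
        shape=[10, 8], dtype='float32', attr=ParamAttr(name='my_fc_w'))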