diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py index 3072c375123a2..bb535f0926061 100644 --- a/demo/seqToseq/api_train_v2.py +++ b/demo/seqToseq/api_train_v2.py @@ -21,9 +21,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): size=word_vector_dim, param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) src_forward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size) + name='src_forward_gru', input=src_embedding, size=encoder_size) src_backward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size, reverse=True) + name='src_backward_gru', + input=src_embedding, + size=encoder_size, + reverse=True) encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) #### Decoder @@ -34,7 +37,9 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): backward_first = paddle.layer.first_seq(input=src_backward) with paddle.layer.mixed( - size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot: + name="decoder_boot_mixed", + size=decoder_size, + act=paddle.activation.Tanh()) as decoder_boot: decoder_boot += paddle.layer.full_matrix_projection( input=backward_first) @@ -44,11 +49,17 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word): name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) context = paddle.networks.simple_attention( + name="simple_attention", encoded_sequence=enc_vec, encoded_proj=enc_proj, decoder_state=decoder_mem) - with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs: + with paddle.layer.mixed( + name="input_recurrent", + size=decoder_size * 3, + # enable error clipping + layer_attr=paddle.attr.ExtraAttr( + error_clipping_threshold=100.0)) as decoder_inputs: decoder_inputs += paddle.layer.full_matrix_projection(input=context) decoder_inputs += paddle.layer.full_matrix_projection( input=current_word) @@ -57,9 +68,12 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word): name='gru_decoder', input=decoder_inputs, output_mem=decoder_mem, + # uncomment to enable local threshold for gradient clipping + # param_attr=paddle.attr.ParamAttr(gradient_clipping_threshold=9.9), size=decoder_size) with paddle.layer.mixed( + name="gru_step_output", size=target_dict_dim, bias_attr=True, act=paddle.activation.Softmax()) as out: @@ -125,7 +139,13 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word): def main(): - paddle.init(use_gpu=False, trainer_count=1) + paddle.init( + use_gpu=False, + trainer_count=1, + # log gradient clipping info + log_clipping=True, + # log error clipping info + log_error_clipping=True) is_generating = False # source and target dict dim. @@ -140,6 +160,8 @@ def main(): # define optimize method and trainer optimizer = paddle.optimizer.Adam( learning_rate=5e-5, + # uncomment to enable global threshold for gradient clipping + # gradient_clipping_threshold=10.0, regularization=paddle.optimizer.L2Regularization(rate=8e-4)) trainer = paddle.trainer.SGD(cost=cost, parameters=parameters,