diff --git a/nmt_without_attention/README.md b/nmt_without_attention/README.md index a54b715102..2fd43bbdda 100644 --- a/nmt_without_attention/README.md +++ b/nmt_without_attention/README.md @@ -51,14 +51,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN 在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现: ```python -#### Encoder src_word_id = paddle.layer.data( name='source_language_word', type=paddle.data_type.integer_value_sequence(source_dict_dim)) + # source embedding src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim) -# use bidirectional_gru + +# bidirectional GRU as encoder encoded_vector = paddle.networks.bidirectional_gru( input=src_embedding, size=encoder_size, @@ -84,19 +85,17 @@ encoded_vector = paddle.networks.bidirectional_gru( ### 无注意力机制的解码器 -PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。 +- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例介绍的则是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。 对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下: ```python -#### Decoder +# the initialization state for decoder GRU encoder_last = paddle.layer.last_seq(input=encoded_vector) -encoder_last_projected = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(input=encoder_last)) +encoder_last_projected = paddle.layer.fc( + size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last) -# gru step +# the step function for decoder GRU def gru_decoder_without_attention(enc_vec, current_word): ''' Step function for gru decoder @@ -106,33 +105,29 @@ def gru_decoder_without_attention(enc_vec, current_word): :type current_word: layer object ''' decoder_mem = paddle.layer.memory( - name='gru_decoder', - size=decoder_size, - boot_layer=encoder_last_projected) + name="gru_decoder", + size=decoder_size, + boot_layer=encoder_last_projected) context = paddle.layer.last_seq(input=enc_vec) - decoder_inputs = paddle.layer.mixed( - size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) + decoder_inputs = paddle.layer.fc( + size=decoder_size * 3, input=[context, current_word]) gru_step = paddle.layer.gru_step( - name='gru_decoder', + name="gru_decoder", act=paddle.activation.Tanh(), gate_act=paddle.activation.Sigmoid(), input=decoder_inputs, output_mem=decoder_mem, size=decoder_size) - out = paddle.layer.mixed( + out = paddle.layer.fc( size=target_dict_dim, bias_attr=True, act=paddle.activation.Softmax(), - input=paddle.layer.full_matrix_projection(input=gru_step)) - return out + input=gru_step) + return out ``` 在模型训练和测试阶段,解码器的行为有很大的不同: @@ -143,34 +138,14 @@ def gru_decoder_without_attention(enc_vec, current_word): 训练和生成的逻辑分别实现在如下的`if-else`条件分支中: ```python -decoder_group_name = "decoder_group" -group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) +group_input1 = paddle.layer.StaticInput(input=encoded_vector) group_inputs = [group_input1] -if not generating: - trg_embedding = 
paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_without_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
-else:
+decoder_group_name = "decoder_group"
+if is_generating:
     trg_embedding = paddle.layer.GeneratedInput(
         size=target_dict_dim,
-        embedding_name='_target_language_embedding',
+        embedding_name="_target_language_embedding",
         embedding_size=word_vector_dim)
     group_inputs.append(trg_embedding)
@@ -184,6 +159,26 @@
         max_length=max_length)

     return beam_gen
+else:
+    trg_embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name="target_language_word",
+            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
+    group_inputs.append(trg_embedding)
+
+    decoder = paddle.layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_without_attention,
+        input=group_inputs)
+
+    lbl = paddle.layer.data(
+        name="target_language_next_word",
+        type=paddle.data_type.integer_value_sequence(target_dict_dim))
+    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+    return cost
 ```

## 数据准备
@@ -191,29 +186,31 @@

## 模型的训练与测试

-在定义好网络结构后，就可以进行模型训练与测试了。根据用户运行时传递的参数是`--train` 还是 `--generate`，Python 脚本的 `main()` 函数分别调用函数`train()`和`generate()`来完成模型的训练与测试。
-
 ### 模型训练
-模型训练阶段，函数 `train()` 依次完成了如下的逻辑：
+
+启动模型训练十分简单，只需在命令行窗口中执行 `python train.py`。模型训练阶段，`train.py` 脚本中的 `train()` 函数依次完成了如下逻辑：

**a) 由网络定义，解析网络结构，初始化模型参数**

-```
-# initialize model
+```python
+# define the network topology
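+# seq2seq_net() returns the cost layer when is_generating is False (the
+# default); paddle.parameters.create() then traverses this topology and
+# creates and initializes all trainable parameters of the model.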
 cost = seq2seq_net(source_dict_dim, target_dict_dim)
 parameters = paddle.parameters.create(cost)
 ```

**b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**

-```
-# define optimize method and trainer
+```python
+# define optimization method
 optimizer = paddle.optimizer.RMSProp(
     learning_rate=1e-3,
     gradient_clipping_threshold=10.0,
     regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+# define the trainer instance
 trainer = paddle.trainer.SGD(
     cost=cost, parameters=parameters, update_equation=optimizer)
+
 # define data reader
 wmt14_reader = paddle.batch(
     paddle.reader.shuffle(
@@ -223,40 +220,33 @@

**c) 定义事件句柄，打印训练中间结果、保存模型快照**

-```
-# define event_handler callback
+```python
+# define the event_handler callback
 def event_handler(event):
     if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 100 == 0 and event.batch_id > 0:
-            with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' %
-                           event.batch_id, 'w') as f:
+        if not event.batch_id % 100 and event.batch_id:
+            with gzip.open(
+                    os.path.join(save_dir_path,
+                                 "nmt_without_att_%05d_batch_%05d.tar.gz" %
+                                 (event.pass_id, event.batch_id)), "w") as f:
                 parameters.to_tar(f)

-        if event.batch_id % 10 == 0:
-            print "\nPass %d, Batch %d, Cost%f, %s" % (
-                event.pass_id, event.batch_id, event.cost, event.metrics)
-        else:
-            sys.stdout.write('.')
-            sys.stdout.flush()
+        if event.batch_id and not event.batch_id % 10:
+            logger.info("Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics))
 ```

**d) 开始训练**

-```
-# start to train
+```python
+# start training
 trainer.train(
     reader=wmt14_reader, event_handler=event_handler, num_passes=2)
 ```

-启动模型训练的十分简单，只需在命令行窗口中执行
-
-```
-python nmt_without_attention_v2.py --train
-```
-
 输出样例为

-```
+```text
 Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0}
 .........
 Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498}
@@ -268,81 +258,80 @@
 Pass 0, Batch 30, Cost 153.633665, {'classification_error_evaluator': 0.86438035
 Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.8348183631896973}
 ```

+### 生成翻译结果
+利用训练好的模型生成翻译文本也十分简单。
+
+1. 首先请修改 `generate.py` 脚本中 `main` 函数传递给 `generate` 函数的参数，以选择使用哪一个保存的模型来生成。默认参数如下所示：
+
+    ```python
+    generate(
+        source_dict_dim=30000,
+        target_dict_dim=30000,
+        batch_size=20,
+        beam_size=3,
+        model_path="models/nmt_without_att_params_batch_00100.tar.gz")
+    ```
+
+2. 在终端执行命令 `python generate.py`，脚本中的 `generate()` 函数依次执行了如下逻辑：
+
+    **a) 加载测试样本**
+
+    ```python
+    # load data samples for generation
+    gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
+    gen_data = []
+    for item in gen_creator():
+        gen_data.append((item[0], ))
+    ```
+
+    **b) 初始化模型，执行 `infer()` 为每个输入样本生成 `beam search` 的翻译结果**
+
+    ```python
+    beam_gen = seq2seq_net(
+        source_dict_dim,
+        target_dict_dim,
+        beam_size=beam_size,
+        max_length=100,
+        is_generating=True)
+    with gzip.open(model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    # prob is the prediction probabilities, and id is the prediction word.
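+    # beam_result[0] holds the beam scores (sentence log probabilities) of
+    # each translation; beam_result[1] is one flat id sequence in which -1
+    # marks the end of every generated sentence, as used in step c) below.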
+ beam_result = paddle.infer( + output_layer=beam_gen, + parameters=parameters, + input=gen_data, + field=['prob', 'id']) + ``` + + **c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** + + ```python + beam_result = inferer.infer(input=test_batch, field=["prob", "id"]) + + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(test_batch) * beam_size + + start_pos, end_pos = 1, 0 + for i, sample in enumerate(test_batch): + print(" ".join([ + src_dict[w] for w in sample[0][1:-1] + ])) # skip the start and ending mark when print the source sentence + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") + ``` + +设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下: + +```text +Elles connaissent leur entreprise mieux que personne . +-3.754819 They know their business better than anyone . +-4.445528 They know their businesses better than anyone . +-5.026885 They know their business better than anybody . -### 模型测试 -模型测试阶段,函数`generate()`执行了依次如下逻辑: - -**a) 加载测试样本** - -``` -# load data samples for generation -gen_creator = paddle.dataset.wmt14.gen(source_dict_dim) -gen_data = [] -for item in gen_creator(): - gen_data.append((item[0], )) ``` - -**b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果** - -``` -beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True) -with gzip.open(init_models_path) as f: - parameters = paddle.parameters.Parameters.from_tar(f) -# prob is the prediction probabilities, and id is the prediction word. -beam_result = paddle.infer( - output_layer=beam_gen, - parameters=parameters, - input=gen_data, - field=['prob', 'id']) -``` - -**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** - -``` -# get the dictionary -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim) - -# the delimited element of generated sequences is -1, -# the first element of each generated sequence is the sequence length -seq_list = [] -seq = [] -for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - -prob = beam_result[0] -for i in xrange(len(gen_data)): - print "\n*******************************************************\n" - print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] -``` - -模型测试的执行与模型训练类似,只需执行 - -``` -python nmt_without_attention_v2.py --generate -``` -则自动为测试数据生成了对应的翻译结果。 -设置beam search的宽度为3,输入某个法文句子 - -``` -src: Elles connaissent leur entreprise mieux que personne . -``` - -其对应的英文翻译结果为 - -``` -prob = -3.754819: They know their business better than anyone . -prob = -4.445528: They know their businesses better than anyone . -prob = -5.026885: They know their business better than anybody . 
-``` - -* `prob`表示生成句子的得分,随之其后则是翻译生成的句子; -* `` 表示句子的开始,``表示一个句子的结束,如果出现了在词典中未包含的词,则用``替代。 +- 第一行为输入的源语言句子。 +- 第二 ~ beam_size + 1 行是柱搜索生成的 `beam_size` 条翻译结果 + - 相同行的输出以“\t”分隔为两列,第一列是句子的log 概率,第二列是翻译结果的文本。 + - 符号`` 表示句子的开始,符号``表示一个句子的结束,如果出现了在词典中未包含的词,则用符号``替代。 至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。 diff --git a/nmt_without_attention/generate.py b/nmt_without_attention/generate.py new file mode 100644 index 0000000000..1de4f46264 --- /dev/null +++ b/nmt_without_attention/generate.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +import os +import logging +import numpy as np + +from network_conf import seq2seq_net + +logger = logging.getLogger("paddle") +logger.setLevel(logging.WARNING) + + +def infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict): + beam_result = inferer.infer(input=test_batch, field=["prob", "id"]) + + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(test_batch) * beam_size + + start_pos, end_pos = 1, 0 + for i, sample in enumerate(test_batch): + print(" ".join([ + src_dict[w] for w in sample[0][1:-1] + ])) # skip the start and ending mark when print the source sentence + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") + + +def generate(source_dict_dim, target_dict_dim, model_path, beam_size, + batch_size): + """ + Sequence generation for NMT. + + :param source_dict_dim: size of source dictionary + :type source_dict_dim: int + :param target_dict_dim: size of target dictionary + :type target_dict_dim: int + :param model_path: path for inital model + :type model_path: string + :param beam_size: the expanson width in each generation setp + :param beam_size: int + :param batch_size: the number of training examples in one forward pass + :param batch_size: int + """ + + assert os.path.exists(model_path), "trained model does not exist." 
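+    # model_path is expected to point to a gzipped parameter tarball
+    # written by parameters.to_tar in the event_handler of train.py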
+ + # step 1: prepare dictionary + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim) + + # step 2: load the trained model + paddle.init(use_gpu=False, trainer_count=1) + with gzip.open(model_path) as f: + parameters = paddle.parameters.Parameters.from_tar(f) + beam_gen = seq2seq_net( + source_dict_dim, + target_dict_dim, + beam_size=beam_size, + max_length=100, + is_generating=True) + inferer = paddle.inference.Inference( + output_layer=beam_gen, parameters=parameters) + + # step 3: iterating over the testing dataset + test_batch = [] + for idx, item in enumerate(paddle.dataset.wmt14.gen(source_dict_dim)()): + test_batch.append([item[0]]) + if len(test_batch) == batch_size: + infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict) + test_batch = [] + + if len(test_batch): + infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict) + test_batch = [] + + +if __name__ == "__main__": + generate( + source_dict_dim=30000, + target_dict_dim=30000, + batch_size=20, + beam_size=3, + model_path="models/nmt_without_att_params_batch_00100.tar.gz") diff --git a/nmt_without_attention/index.html b/nmt_without_attention/index.html index 35177ee5a6..d9287ecb41 100644 --- a/nmt_without_attention/index.html +++ b/nmt_without_attention/index.html @@ -93,14 +93,15 @@ 在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现: ```python -#### Encoder src_word_id = paddle.layer.data( name='source_language_word', type=paddle.data_type.integer_value_sequence(source_dict_dim)) + # source embedding src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim) -# use bidirectional_gru + +# bidirectional GRU as encoder encoded_vector = paddle.networks.bidirectional_gru( input=src_embedding, size=encoder_size, @@ -126,19 +127,17 @@ ### 无注意力机制的解码器 -PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。 +- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例介绍的则是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。 对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下: ```python -#### Decoder +# the initialization state for decoder GRU encoder_last = paddle.layer.last_seq(input=encoded_vector) -encoder_last_projected = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(input=encoder_last)) +encoder_last_projected = paddle.layer.fc( + size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last) -# gru step +# the step function for decoder GRU def gru_decoder_without_attention(enc_vec, current_word): ''' Step function for gru decoder @@ -148,33 +147,29 @@ :type current_word: layer object ''' decoder_mem = paddle.layer.memory( - name='gru_decoder', - size=decoder_size, - boot_layer=encoder_last_projected) + name="gru_decoder", + size=decoder_size, + boot_layer=encoder_last_projected) context = paddle.layer.last_seq(input=enc_vec) - decoder_inputs = paddle.layer.mixed( - size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - 
paddle.layer.full_matrix_projection(input=current_word)
-        ])
+    decoder_inputs = paddle.layer.fc(
+        size=decoder_size * 3, input=[context, current_word])

     gru_step = paddle.layer.gru_step(
-        name='gru_decoder',
+        name="gru_decoder",
         act=paddle.activation.Tanh(),
         gate_act=paddle.activation.Sigmoid(),
         input=decoder_inputs,
         output_mem=decoder_mem,
         size=decoder_size)

-    out = paddle.layer.mixed(
+    out = paddle.layer.fc(
         size=target_dict_dim,
         bias_attr=True,
         act=paddle.activation.Softmax(),
-        input=paddle.layer.full_matrix_projection(input=gru_step))
-    return out
+        input=gru_step)
+    return out
 ```

在模型训练和测试阶段，解码器的行为有很大的不同：

@@ -185,34 +180,14 @@

训练和生成的逻辑分别实现在如下的`if-else`条件分支中：

```python
-decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector)
 group_inputs = [group_input1]

-if not generating:
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_without_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
-else:
+decoder_group_name = "decoder_group"
+if is_generating:
     trg_embedding = paddle.layer.GeneratedInput(
         size=target_dict_dim,
-        embedding_name='_target_language_embedding',
+        embedding_name="_target_language_embedding",
         embedding_size=word_vector_dim)
     group_inputs.append(trg_embedding)
@@ -226,6 +201,26 @@
         max_length=max_length)

     return beam_gen
+else:
+    trg_embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name="target_language_word",
+            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
+    group_inputs.append(trg_embedding)
+
+    decoder = paddle.layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_without_attention,
+        input=group_inputs)
+
+    lbl = paddle.layer.data(
+        name="target_language_next_word",
+        type=paddle.data_type.integer_value_sequence(target_dict_dim))
+    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+    return cost
 ```

## 数据准备
@@ -233,29 +228,31 @@

## 模型的训练与测试

-在定义好网络结构后，就可以进行模型训练与测试了。根据用户运行时传递的参数是`--train` 还是 `--generate`，Python 脚本的 `main()` 函数分别调用函数`train()`和`generate()`来完成模型的训练与测试。
-
 ### 模型训练
-模型训练阶段，函数 `train()` 依次完成了如下的逻辑：
+
+启动模型训练十分简单，只需在命令行窗口中执行 `python train.py`。模型训练阶段，`train.py` 脚本中的 `train()` 函数依次完成了如下逻辑：

**a) 由网络定义，解析网络结构，初始化模型参数**

-```
-# initialize model
+```python
+# define the network topology
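+# seq2seq_net() returns the cost layer when is_generating is False (the
+# default); paddle.parameters.create() then traverses this topology and
+# creates and initializes all trainable parameters of the model.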
 cost = seq2seq_net(source_dict_dim, target_dict_dim)
 parameters = paddle.parameters.create(cost)
 ```

**b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**

-```
-# define optimize method and trainer
+```python
+# define optimization method
 optimizer = paddle.optimizer.RMSProp(
     learning_rate=1e-3,
     gradient_clipping_threshold=10.0,
     regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+# define the trainer instance
 trainer = paddle.trainer.SGD(
     cost=cost, parameters=parameters, update_equation=optimizer)
+
 # define data reader
 wmt14_reader = paddle.batch(
     paddle.reader.shuffle(
@@ -265,40 +262,33 @@

**c) 定义事件句柄，打印训练中间结果、保存模型快照**

-```
-# define event_handler callback
+```python
+# define the event_handler callback
 def event_handler(event):
     if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 100 == 0 and event.batch_id > 0:
-            with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' %
-                           event.batch_id, 'w') as f:
+        if not event.batch_id % 100 and event.batch_id:
+            with gzip.open(
+                    os.path.join(save_dir_path,
                                 "nmt_without_att_%05d_batch_%05d.tar.gz" %
+                                 (event.pass_id, event.batch_id)), "w") as f:
                 parameters.to_tar(f)

-        if event.batch_id % 10 == 0:
-            print "\nPass %d, Batch %d, Cost%f, %s" % (
-                event.pass_id, event.batch_id, event.cost, event.metrics)
-        else:
-            sys.stdout.write('.')
-            sys.stdout.flush()
+        if event.batch_id and not event.batch_id % 10:
+            logger.info("Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics))
 ```

**d) 开始训练**

-```
-# start to train
+```python
+# start training
 trainer.train(
     reader=wmt14_reader, event_handler=event_handler, num_passes=2)
 ```

-启动模型训练的十分简单，只需在命令行窗口中执行
-
-```
-python nmt_without_attention_v2.py --train
-```
-
 输出样例为

-```
+```text
 Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0}
 .........
 Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498}
@@ -310,81 +300,80 @@
 Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.8348183631896973}
 ```

+### 生成翻译结果
+利用训练好的模型生成翻译文本也十分简单。
+
+1. 首先请修改 `generate.py` 脚本中 `main` 函数传递给 `generate` 函数的参数，以选择使用哪一个保存的模型来生成。默认参数如下所示：
+
+    ```python
+    generate(
+        source_dict_dim=30000,
+        target_dict_dim=30000,
+        batch_size=20,
+        beam_size=3,
+        model_path="models/nmt_without_att_params_batch_00100.tar.gz")
+    ```
+
+2. 在终端执行命令 `python generate.py`，脚本中的 `generate()` 函数依次执行了如下逻辑：
+
+    **a) 加载测试样本**
+
+    ```python
+    # load data samples for generation
+    gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
+    gen_data = []
+    for item in gen_creator():
+        gen_data.append((item[0], ))
+    ```
+
+    **b) 初始化模型，执行 `infer()` 为每个输入样本生成 `beam search` 的翻译结果**
+
+    ```python
+    beam_gen = seq2seq_net(
+        source_dict_dim,
+        target_dict_dim,
+        beam_size=beam_size,
+        max_length=100,
+        is_generating=True)
+    with gzip.open(model_path) as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+    # prob is the prediction probabilities, and id is the prediction word.
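+    # beam_result[0] holds the beam scores (sentence log probabilities) of
+    # each translation; beam_result[1] is one flat id sequence in which -1
+    # marks the end of every generated sentence, as used in step c) below.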
+ beam_result = paddle.infer( + output_layer=beam_gen, + parameters=parameters, + input=gen_data, + field=['prob', 'id']) + ``` + + **c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** + + ```python + beam_result = inferer.infer(input=test_batch, field=["prob", "id"]) + + gen_sen_idx = np.where(beam_result[1] == -1)[0] + assert len(gen_sen_idx) == len(test_batch) * beam_size + + start_pos, end_pos = 1, 0 + for i, sample in enumerate(test_batch): + print(" ".join([ + src_dict[w] for w in sample[0][1:-1] + ])) # skip the start and ending mark when print the source sentence + for j in xrange(beam_size): + end_pos = gen_sen_idx[i * beam_size + j] + print("%.4f\t%s" % (beam_result[0][i][j], " ".join( + trg_dict[w] for w in beam_result[1][start_pos:end_pos]))) + start_pos = end_pos + 2 + print("\n") + ``` + +设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下: + +```text +Elles connaissent leur entreprise mieux que personne . +-3.754819 They know their business better than anyone . +-4.445528 They know their businesses better than anyone . +-5.026885 They know their business better than anybody . -### 模型测试 -模型测试阶段,函数`generate()`执行了依次如下逻辑: - -**a) 加载测试样本** - -``` -# load data samples for generation -gen_creator = paddle.dataset.wmt14.gen(source_dict_dim) -gen_data = [] -for item in gen_creator(): - gen_data.append((item[0], )) ``` - -**b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果** - -``` -beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True) -with gzip.open(init_models_path) as f: - parameters = paddle.parameters.Parameters.from_tar(f) -# prob is the prediction probabilities, and id is the prediction word. -beam_result = paddle.infer( - output_layer=beam_gen, - parameters=parameters, - input=gen_data, - field=['prob', 'id']) -``` - -**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** - -``` -# get the dictionary -src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim) - -# the delimited element of generated sequences is -1, -# the first element of each generated sequence is the sequence length -seq_list = [] -seq = [] -for w in beam_result[1]: - if w != -1: - seq.append(w) - else: - seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]])) - seq = [] - -prob = beam_result[0] -for i in xrange(len(gen_data)): - print "\n*******************************************************\n" - print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n" - for j in xrange(beam_size): - print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] -``` - -模型测试的执行与模型训练类似,只需执行 - -``` -python nmt_without_attention_v2.py --generate -``` -则自动为测试数据生成了对应的翻译结果。 -设置beam search的宽度为3,输入某个法文句子 - -``` -src: Elles connaissent leur entreprise mieux que personne . -``` - -其对应的英文翻译结果为 - -``` -prob = -3.754819: They know their business better than anyone . -prob = -4.445528: They know their businesses better than anyone . -prob = -5.026885: They know their business better than anybody . 
-``` - -* `prob`表示生成句子的得分,随之其后则是翻译生成的句子; -* `` 表示句子的开始,``表示一个句子的结束,如果出现了在词典中未包含的词,则用``替代。 +- 第一行为输入的源语言句子。 +- 第二 ~ beam_size + 1 行是柱搜索生成的 `beam_size` 条翻译结果 + - 相同行的输出以“\t”分隔为两列,第一列是句子的log 概率,第二列是翻译结果的文本。 + - 符号`` 表示句子的开始,符号``表示一个句子的结束,如果出现了在词典中未包含的词,则用符号``替代。 至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。 diff --git a/nmt_without_attention/network_conf.py b/nmt_without_attention/network_conf.py new file mode 100644 index 0000000000..77a1dc77c3 --- /dev/null +++ b/nmt_without_attention/network_conf.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +import paddle.v2 as paddle +import sys +import gzip + + +def seq2seq_net(source_dict_dim, + target_dict_dim, + word_vector_dim=620, + rnn_hidden_size=1000, + beam_size=1, + max_length=50, + is_generating=False): + """ + Define the network structure of NMT, including encoder and decoder. + + :param source_dict_dim: size of source dictionary + :type source_dict_dim : int + :param target_dict_dim: size of target dictionary + :type target_dict_dim: int + :param word_vector_dim: size of source language word embedding + :type word_vector_dim: int + :param rnn_hidden_size: size of hidden state of encoder and decoder RNN + :type rnn_hidden_size: int + :param beam_size: expansion width in each step when generating + :type beam_size: int + :param max_length: max iteration number in generation + :type max_length: int + :param generating: whether to generate sequence or to train + :type generating: bool + """ + + decoder_size = encoder_size = rnn_hidden_size + + src_word_id = paddle.layer.data( + name="source_language_word", + type=paddle.data_type.integer_value_sequence(source_dict_dim)) + src_embedding = paddle.layer.embedding( + input=src_word_id, size=word_vector_dim) + + # use bidirectional_gru as the encoder + encoded_vector = paddle.networks.bidirectional_gru( + input=src_embedding, + size=encoder_size, + fwd_act=paddle.activation.Tanh(), + fwd_gate_act=paddle.activation.Sigmoid(), + bwd_act=paddle.activation.Tanh(), + bwd_gate_act=paddle.activation.Sigmoid(), + return_seq=True) + #### Decoder + encoder_last = paddle.layer.last_seq(input=encoded_vector) + encoder_last_projected = paddle.layer.fc( + size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last) + + # gru step + def gru_decoder_without_attention(enc_vec, current_word): + """ + Step function for gru decoder + + :param enc_vec: encoded vector of source language + :type enc_vec: layer object + :param current_word: current input of decoder + :type current_word: layer object + """ + decoder_mem = paddle.layer.memory( + name="gru_decoder", + size=decoder_size, + boot_layer=encoder_last_projected) + + context = paddle.layer.last_seq(input=enc_vec) + + decoder_inputs = paddle.layer.fc( + size=decoder_size * 3, input=[context, current_word]) + + gru_step = paddle.layer.gru_step( + name="gru_decoder", + act=paddle.activation.Tanh(), + gate_act=paddle.activation.Sigmoid(), + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + out = paddle.layer.fc( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=gru_step) + return out + + group_input1 = paddle.layer.StaticInput(input=encoded_vector) + group_inputs = [group_input1] + + decoder_group_name = "decoder_group" + if is_generating: + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name="_target_language_embedding", + 
embedding_size=word_vector_dim) + group_inputs.append(trg_embedding) + + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_without_attention, + input=group_inputs, + bos_id=0, + eos_id=1, + beam_size=beam_size, + max_length=max_length) + + return beam_gen + else: + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name="target_language_word", + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name="_target_language_embedding")) + group_inputs.append(trg_embedding) + + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_without_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name="target_language_next_word", + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + + return cost diff --git a/nmt_without_attention/nmt_without_attention.py b/nmt_without_attention/nmt_without_attention.py deleted file mode 100644 index 5a61b525e6..0000000000 --- a/nmt_without_attention/nmt_without_attention.py +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/env python - -import sys -import gzip -import paddle.v2 as paddle - -### Parameters -word_vector_dim = 620 -latent_chain_dim = 1000 - -beam_size = 5 -max_length = 50 - - -def seq2seq_net(source_dict_dim, target_dict_dim, generating=False): - ''' - Define the network structure of NMT, including encoder and decoder. - - :param source_dict_dim: size of source dictionary - :type source_dict_dim : int - :param target_dict_dim: size of target dictionary - :type target_dict_dim: int - ''' - - decoder_size = encoder_size = latent_chain_dim - - #### Encoder - src_word_id = paddle.layer.data( - name='source_language_word', - type=paddle.data_type.integer_value_sequence(source_dict_dim)) - src_embedding = paddle.layer.embedding( - input=src_word_id, size=word_vector_dim) - # use bidirectional_gru - encoded_vector = paddle.networks.bidirectional_gru( - input=src_embedding, - size=encoder_size, - fwd_act=paddle.activation.Tanh(), - fwd_gate_act=paddle.activation.Sigmoid(), - bwd_act=paddle.activation.Tanh(), - bwd_gate_act=paddle.activation.Sigmoid(), - return_seq=True) - #### Decoder - encoder_last = paddle.layer.last_seq(input=encoded_vector) - encoder_last_projected = paddle.layer.mixed( - size=decoder_size, - act=paddle.activation.Tanh(), - input=paddle.layer.full_matrix_projection(input=encoder_last)) - - # gru step - def gru_decoder_without_attention(enc_vec, current_word): - ''' - Step function for gru decoder - - :param enc_vec: encoded vector of source language - :type enc_vec: layer object - :param current_word: current input of decoder - :type current_word: layer object - ''' - decoder_mem = paddle.layer.memory( - name='gru_decoder', - size=decoder_size, - boot_layer=encoder_last_projected) - - context = paddle.layer.last_seq(input=enc_vec) - - decoder_inputs = paddle.layer.mixed( - size=decoder_size * 3, - input=[ - paddle.layer.full_matrix_projection(input=context), - paddle.layer.full_matrix_projection(input=current_word) - ]) - - gru_step = paddle.layer.gru_step( - name='gru_decoder', - act=paddle.activation.Tanh(), - gate_act=paddle.activation.Sigmoid(), - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) - - out = paddle.layer.mixed( - size=target_dict_dim, - bias_attr=True, - act=paddle.activation.Softmax(), - input=paddle.layer.full_matrix_projection(input=gru_step)) - return out - - 
decoder_group_name = "decoder_group" - group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) - group_inputs = [group_input1] - - if not generating: - trg_embedding = paddle.layer.embedding( - input=paddle.layer.data( - name='target_language_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)), - size=word_vector_dim, - param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) - group_inputs.append(trg_embedding) - - decoder = paddle.layer.recurrent_group( - name=decoder_group_name, - step=gru_decoder_without_attention, - input=group_inputs) - - lbl = paddle.layer.data( - name='target_language_next_word', - type=paddle.data_type.integer_value_sequence(target_dict_dim)) - cost = paddle.layer.classification_cost(input=decoder, label=lbl) - - return cost - else: - - trg_embedding = paddle.layer.GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) - group_inputs.append(trg_embedding) - - beam_gen = paddle.layer.beam_search( - name=decoder_group_name, - step=gru_decoder_without_attention, - input=group_inputs, - bos_id=0, - eos_id=1, - beam_size=beam_size, - max_length=max_length) - - return beam_gen - - -def train(source_dict_dim, target_dict_dim): - ''' - Training function for NMT - - :param source_dict_dim: size of source dictionary - :type source_dict_dim: int - :param target_dict_dim: size of target dictionary - :type target_dict_dim: int - ''' - # initialize model - cost = seq2seq_net(source_dict_dim, target_dict_dim) - parameters = paddle.parameters.create(cost) - - # define optimize method and trainer - optimizer = paddle.optimizer.RMSProp( - learning_rate=1e-3, - gradient_clipping_threshold=10.0, - regularization=paddle.optimizer.L2Regularization(rate=8e-4)) - trainer = paddle.trainer.SGD( - cost=cost, parameters=parameters, update_equation=optimizer) - # define data reader - wmt14_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192), - batch_size=55) - - # define event_handler callback - def event_handler(event): - if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0 and event.batch_id > 0: - with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' % - event.batch_id, 'w') as f: - parameters.to_tar(f) - - if event.batch_id % 10 == 0: - print "\nPass %d, Batch %d, Cost %f, %s" % ( - event.pass_id, event.batch_id, event.cost, event.metrics) - else: - sys.stdout.write('.') - sys.stdout.flush() - - # start to train - trainer.train( - reader=wmt14_reader, event_handler=event_handler, num_passes=2) - - -def generate(source_dict_dim, target_dict_dim, init_models_path): - ''' - Generating function for NMT - - :param source_dict_dim: size of source dictionary - :type source_dict_dim: int - :param target_dict_dim: size of target dictionary - :type target_dict_dim: int - :param init_models_path: path for inital model - :type init_models_path: string - ''' - - # load data samples for generation - gen_creator = paddle.dataset.wmt14.gen(source_dict_dim) - gen_data = [] - for item in gen_creator(): - gen_data.append((item[0], )) - - beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True) - with gzip.open(init_models_path) as f: - parameters = paddle.parameters.Parameters.from_tar(f) - # prob is the prediction probabilities, and id is the prediction word. 
-    beam_result = paddle.infer(
-        output_layer=beam_gen,
-        parameters=parameters,
-        input=gen_data,
-        field=['prob', 'id'])
-
-    # get the dictionary
-    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
-
-    # the delimited element of generated sequences is -1,
-    # the first element of each generated sequence is the sequence length
-    seq_list, seq = [], []
-    for w in beam_result[1]:
-        if w != -1:
-            seq.append(w)
-        else:
-            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-            seq = []
-
-    prob = beam_result[0]
-    for i in xrange(len(gen_data)):
-        print "\n*******************************************************\n"
-        print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
-        for j in xrange(beam_size):
-            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
-
-
-def usage_helper():
-    print "Please specify training/generating phase!"
-    print "Usage: python nmt_without_attention_v2.py --train/generate"
-    exit(1)
-
-
-def main():
-    if not (len(sys.argv) == 2):
-        usage_helper()
-    if sys.argv[1] == '--train':
-        generating = False
-    elif sys.argv[1] == '--generate':
-        generating = True
-    else:
-        usage_helper()
-
-    # initialize paddle
-    paddle.init(use_gpu=False, trainer_count=1)
-    source_language_dict_dim = 30000
-    target_language_dict_dim = 30000
-
-    if generating:
-        # modify this path to speicify a trained model.
-        init_models_path = 'models/nmt_without_att_params_batch_1800.tar.gz'
-        if not os.path.exists(init_models_path):
-            print "trained model cannot be found."
-            exit(1)
-        generate(source_language_dict_dim, target_language_dict_dim,
-                 init_models_path)
-    else:
-        if not os.path.exists('./models'):
-            os.system('mkdir ./models')
-        train(source_language_dict_dim, target_language_dict_dim)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/nmt_without_attention/train.py b/nmt_without_attention/train.py
new file mode 100644
index 0000000000..9600df8e5b
--- /dev/null
+++ b/nmt_without_attention/train.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+import os
+import gzip
+import logging
+import paddle.v2 as paddle
+
+from network_conf import seq2seq_net
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+
+def train(save_dir_path, source_dict_dim, target_dict_dim):
+    '''
+    Training function for NMT
+
+    :param save_dir_path: path of the directory to save the trained models.
+    :type save_dir_path: str
+    :param source_dict_dim: size of source dictionary
+    :type source_dict_dim: int
+    :param target_dict_dim: size of target dictionary
+    :type target_dict_dim: int
+    '''
+    if not os.path.exists(save_dir_path):
+        os.mkdir(save_dir_path)
+
+    # initialize PaddlePaddle
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    cost = seq2seq_net(source_dict_dim, target_dict_dim)
+    parameters = paddle.parameters.create(cost)
+
+    # define optimization method and the trainer instance
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+
+    # define data reader
+    wmt14_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
+        batch_size=8)
+
+    # define the event_handler callback
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if not event.batch_id % 100 and event.batch_id:
+                with gzip.open(
+                        os.path.join(save_dir_path,
+                                     "nmt_without_att_%05d_batch_%05d.tar.gz"
+                                     % (event.pass_id, event.batch_id)),
+                        "w") as f:
+                    parameters.to_tar(f)
+
+            if event.batch_id and not event.batch_id % 10:
+                logger.info("Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics))
+
+    # start training
+    trainer.train(
+        reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+
+if __name__ == '__main__':
+    train(save_dir_path="models", source_dict_dim=30000, target_dict_dim=30000)