Merge pull request PaddlePaddle#45 from 0YuanZhang0/sequence_tagging

Sequence tagging
qingqing01 · Apr 19, 2020 · 33ece5d · 33ece5d
2 parents c54980b + 6462645
commit 33ece5d
Show file tree

Hide file tree

Showing 18 changed files with 613 additions and 508 deletions.
diff --git a/sequence_tagging/README.md → examples/sequence_tagging/README.md b/sequence_tagging/README.md → examples/sequence_tagging/README.md
@@ -54,7 +54,7 @@ python downloads.py dataset
 我们开源了在自建数据集上训练的词法分析模型，可供用户直接使用，可通过下述链接进行下载:
 ```bash
 # download baseline model
-python downloads.py lac
+python downloads.py model
 ```
 
 ### 模型训练
@@ -66,84 +66,54 @@ GPU上单卡训练
 export CUDA_VISIBLE_DEVICES=0
 
 python -u train.py \
-          --train_file ./data/train.tsv \
-          --test_file ./data/test.tsv \
-          --word_dict_path ./conf/word.dic \
-          --label_dict_path ./conf/tag.dic \ 
-          --word_rep_dict_path ./conf/q2b.dic \
           --device gpu \
-          --grnn_hidden_dim 128 \
-          --word_emb_dim 128 \
-          --bigru_num 2 \
-          --base_learning_rate 1e-3 \
-          --batch_size 300 \
-          --epoch 10 \
-          --save_dir   ./model \
-          --num_devices 1 \
-          -d
-
-# -d： 是否使用动态图模式进行训练，如果使用静态图训练，命令行请删除-d参数
+          --dynamic False
+
+# --device: 使用gpu设备还是cpu设备
+# --dynamic： 是否使用动态图模式进行训练，使用动态图训练时设置为True，使用静态图训练时设置为False
 ```
+
 GPU上多卡训练
+
 ```
 # setting visible devices for training
 export CUDA_VISIBLE_DEVICES=0,1,2,3
 
 python -m paddle.distributed.launch --selected_gpus=0,1,2,3  train.py \
-          --train_file ./data/train.tsv \
-          --test_file  ./data/test.tsv \
-          --word_dict_path ./conf/word.dic \
-          --label_dict_path ./conf/tag.dic \ 
-          --word_rep_dict_path ./conf/q2b.dic \
           --device gpu \
-          --grnn_hidden_dim 128 \
-          --word_emb_dim 128 \
-          --bigru_num 2 \
-          --base_learning_rate 1e-3 \
-          --batch_size 300 \
-          --epoch 10 \
-          --save_dir   ./model \
-          -d
+          --dynamic False
 
-# -d： 是否使用动态图模式进行训练，如果使用静态图训练，命令行请删除-d参数
+# --device: 使用gpu设备还是cpu设备
+# --dynamic: 是否使用动态图模式进行训练，使用动态图训练时设置为True，使用静态图训练时设置为False
 ```
+
 CPU上训练
+
 ```
 python -u train.py \
-          --train_file ./data/train.tsv \
-          --test_file ./data/test.tsv \
-          --word_dict_path ./conf/word.dic \
-          --label_dict_path ./conf/tag.dic \ 
-          --word_rep_dict_path ./conf/q2b.dic \
           --device cpu \
-          --grnn_hidden_dim 128 \
-          --word_emb_dim 128 \
-          --bigru_num 2 \
-          --base_learning_rate 1e-3 \
-          --batch_size 300 \
-          --epoch 10 \
-          --save_dir   ./model \
-          -d
+          --dynamic False
 
+# --device: 使用gpu设备还是cpu设备
+# --dynamic: 是否使用动态图模式进行训练，使用动态图训练时设置为True，使用静态图训练时设置为False
 ```
 
 ### 模型预测
 
 加载已有的模型，对未知的数据进行预测
 ```bash
 python predict.py \
-      --predict_file ./data/infer.tsv \
-      --word_dict_path ./conf/word.dic \
-      --label_dict_path ./conf/tag.dic \
-      --word_rep_dict_path ./conf/q2b.dic \
       --init_from_checkpoint  model_baseline/params \
       --output_file predict.result  \
       --mode predict \
       --device cpu  \
-      -d 
-
-# -d： 是否使用动态图模式进行训练，如果使用静态图训练，命令行请删除-d参数
+      --dynamic False
 
+# --init_from_checkpoint: 初始化模型
+# --output_file: 预测结果文件
+# --device: 使用gpu还是cpu设备
+# --mode: 开启模式, 设置为train时，进行训练，设置为predict时进行预测
+# --dynamic: 是否使用动态图模式进行预测，使用动态图时设置为True，使用静态图时设置为False
 ```
 
 ### 模型评估
@@ -152,15 +122,15 @@ python predict.py \
 ```bash
 # baseline model
 python eval.py \
-        --test_file  ./data/test.tsv \
-        --word_dict_path ./conf/word.dic  \
-        --label_dict_path ./conf/tag.dic  \
-        --word_rep_dict_path ./conf/q2b.dic \
         --init_from_checkpoint  ./model_baseline/params \
+        --mode predict \
         --device cpu  \
-        -d
+        --dynamic False
 
-# -d： 是否使用动态图模式进行训练，如果使用静态图训练，命令行请删除-d参数
+# --init_from_checkpoint: 初始化模型
+# --device: 使用gpu还是cpu设备
+# --mode: 开启模式, 设置为train时，进行训练，设置为predict时进行预测
+# --dynamic: 是否使用动态图模式进行评估，使用动态图时设置为True，使用静态图时设置为False
 ```
 
 

diff --git a/sequence_tagging/conf/q2b.dic → examples/sequence_tagging/conf/q2b.dic b/sequence_tagging/conf/q2b.dic → examples/sequence_tagging/conf/q2b.dic
diff --git a/sequence_tagging/conf/tag.dic → examples/sequence_tagging/conf/tag.dic b/sequence_tagging/conf/tag.dic → examples/sequence_tagging/conf/tag.dic
diff --git a/sequence_tagging/conf/word.dic → examples/sequence_tagging/conf/word.dic b/sequence_tagging/conf/word.dic → examples/sequence_tagging/conf/word.dic
diff --git a/sequence_tagging/downloads.py → examples/sequence_tagging/downloads.py b/sequence_tagging/downloads.py → examples/sequence_tagging/downloads.py
@@ -33,19 +33,19 @@
         'name': 'lexical_analysis-dataset-2.0.0.tar.gz',
         'md5': '71e4a9a36d0f0177929a1bccedca7dba'
     },
-    'LAC_MODEL': {
-        'name': 'lexical_analysis-2.0.0.tar.gz',
-        'md5': "fc1daef00de9564083c7dc7b600504ca"
+    'MODEL': {
+        'name': 'sequence_tagging_dy.tar.gz',
+        'md5': "1125d374c03c8218b6e47325dcf607e3"
     },
 }
 
 
 def usage():
-    desc = ("\nDownload datasets and pretrained models for LAC.\n"
+    desc = ("\nDownload datasets and pretrained models for sequence tagging.\n"
             "Usage:\n"
             "   1. python download.py all\n"
             "   2. python download.py dataset\n"
-            "   3. python download.py lac\n")
+            "   3. python download.py model\n")
     print(desc)
 
 
@@ -136,13 +136,13 @@ def download(name, dir_path):
 
     if sys.argv[1] == 'all':
         download('DATA', pwd)
-        download('LAC_MODEL', pwd)
+        download('MODEL', pwd)
 
     if sys.argv[1] == "dataset":
         download('DATA', pwd)
 
-    elif sys.argv[1] == "lac":
-        download('LAC_MODEL', pwd)
+    elif sys.argv[1] == "model":
+        download('MODEL', pwd)
 
     else:
         usage()
diff --git a/sequence_tagging/downloads.sh → examples/sequence_tagging/downloads.sh b/sequence_tagging/downloads.sh → examples/sequence_tagging/downloads.sh
@@ -5,9 +5,9 @@ if [ -d ./model_baseline/ ]
 then
     echo "./model_baseline/ directory already existed, ignore download"
 else
-    wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis-2.0.0.tar.gz
-    tar xvf lexical_analysis-2.0.0.tar.gz
-    /bin/rm lexical_analysis-2.0.0.tar.gz
+    wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/sequence_tagging_dy.tar.gz
+    tar xvf sequence_tagging_dy.tar.gz
+    /bin/rm sequence_tagging_dy.tar.gz
 fi
 
 # download dataset file to ./data/

diff --git a/examples/sequence_tagging/eval.py b/examples/sequence_tagging/eval.py
@@ -0,0 +1,99 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Evaluate a trained SequenceTagging model and report chunk-level P/R/F1
+"""
+
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+import sys
+import math
+import argparse
+import numpy as np
+
+from train import SeqTagging
+from utils.configure import PDConfig
+from utils.check import check_gpu, check_version
+from utils.metrics import chunk_count
+from reader import LacDataset, create_lexnet_data_generator, create_dataloader
+
+work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(os.path.join(work_dir, "../"))
+from hapi.model import set_device, Input
+
+import paddle.fluid as fluid
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.layers.utils import flatten
+
+
+def main(args):
+    """Evaluate a checkpointed SeqTagging model on ``args.test_file``.
+
+    Builds the test data loader, restores parameters from
+    ``args.init_from_checkpoint``, runs the model batch by batch and prints
+    the accumulated chunk-level precision, recall and F1.
+
+    Args:
+        args: configuration object. This function reads ``device``,
+            ``dynamic``, ``test_file``, ``base_learning_rate`` and
+            ``init_from_checkpoint`` from it and also hands it to the
+            dataset, data generator and model constructors.
+    """
+    place = set_device(args.device)
+    # Dygraph is enabled only on request; otherwise the static graph path
+    # is taken and the loader needs explicit feed variables.
+    if args.dynamic:
+        fluid.enable_dygraph(place)
+
+    inputs = [
+        Input([None, None], 'int64', name='words'),
+        Input([None], 'int64', name='length'),
+    ]
+
+    if args.dynamic:
+        feed_list = None
+    else:
+        feed_list = [spec.forward() for spec in inputs]
+
+    dataset = LacDataset(args)
+
+    chunk_evaluator = fluid.metrics.ChunkEvaluator()
+    chunk_evaluator.reset()
+
+    test_generator = create_lexnet_data_generator(
+        args,
+        reader=dataset,
+        file_name=args.test_file,
+        place=place,
+        mode="test")
+    test_loader = create_dataloader(
+        test_generator, place, feed_list=feed_list)
+
+    model = SeqTagging(args, dataset.vocab_size, dataset.num_labels)
+
+    # NOTE(review): this optimizer is never referenced again in this
+    # function; it is kept because the effect of constructing it cannot be
+    # judged from this file alone.
+    optim = AdamOptimizer(
+        learning_rate=args.base_learning_rate,
+        parameter_list=model.parameters())
+
+    model.mode = "test"
+    model.prepare(inputs=inputs)
+    model.load(args.init_from_checkpoint, skip_mismatch=True)
+
+    for batch in test_loader():
+        # A one-element batch wraps the actual fields; presumably this is
+        # the static-graph loader's format - TODO confirm.
+        wrapped = len(batch) == 1
+        fields = batch[0] if wrapped else batch
+        if wrapped:
+            targets = np.array(fields[2])
+        else:
+            targets = fields[2].numpy()
+
+        crf_decode, length = model.test(inputs=[fields[0], fields[1]])
+        n_infer, n_label, n_correct = chunk_count(
+            crf_decode, targets, length, dataset.id2label_dict)
+        chunk_evaluator.update(n_infer, n_label, n_correct)
+
+    precision, recall, f1 = chunk_evaluator.eval()
+    print("[test] P: %.5f, R: %.5f, F1: %.5f" % (precision, recall, f1))
+
+
+if __name__ == '__main__':
+    # Build and echo the configuration backed by sequence_tagging.yaml.
+    args = PDConfig(yaml_file="sequence_tagging.yaml")
+    args.build()
+    args.Print()
+
+    # Run the environment checks before any evaluation work starts.
+    check_gpu(args.device == "gpu")
+    check_version()
+    main(args)
diff --git a/sequence_tagging/images/gru-crf-model.png → ...sequence_tagging/images/gru-crf-model.png b/sequence_tagging/images/gru-crf-model.png → ...sequence_tagging/images/gru-crf-model.png
diff --git a/examples/sequence_tagging/predict.py b/examples/sequence_tagging/predict.py
@@ -0,0 +1,94 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Run a trained SequenceTagging model on a prediction file and write the tags
+"""
+
+from __future__ import division
+from __future__ import print_function
+
+import io
+import os
+import sys
+import math
+import argparse
+import numpy as np
+
+from train import SeqTagging
+from utils.check import check_gpu, check_version
+from utils.configure import PDConfig
+from reader import LacDataset, create_lexnet_data_generator, create_dataloader
+
+work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(os.path.join(work_dir, "../"))
+from hapi.model import set_device, Input
+
+import paddle.fluid as fluid
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.layers.utils import flatten
+
+
+def main(args):
+    """Tag every sequence in ``args.predict_file`` and write the results.
+
+    Restores a SeqTagging model from ``args.init_from_checkpoint``, runs it
+    over the prediction data and writes one line per sequence to
+    ``args.output_file``, with the predicted tags joined by ``"\\002"``.
+
+    Args:
+        args: configuration object. This function reads ``device``,
+            ``dynamic``, ``predict_file``, ``base_learning_rate``,
+            ``init_from_checkpoint`` and ``output_file`` from it and also
+            hands it to the dataset, data generator and model constructors.
+    """
+    place = set_device(args.device)
+    # Dygraph is enabled only on request; otherwise the static graph path
+    # is taken and the loader needs explicit feed variables.
+    if args.dynamic:
+        fluid.enable_dygraph(place)
+
+    inputs = [Input([None, None], 'int64', name='words'),
+              Input([None], 'int64', name='length')]
+
+    feed_list = None if args.dynamic else [x.forward() for x in inputs]
+    dataset = LacDataset(args)
+    predict_path = args.predict_file
+
+    predict_generator = create_lexnet_data_generator(
+        args, reader=dataset, file_name=predict_path, place=place, mode="predict")
+
+    predict_dataset = create_dataloader(
+        predict_generator, place, feed_list=feed_list)
+
+    vocab_size = dataset.vocab_size
+    num_labels = dataset.num_labels
+    model = SeqTagging(args, vocab_size, num_labels)
+
+    # NOTE(review): this optimizer is never referenced again in this
+    # function; it is kept because the effect of constructing it cannot be
+    # judged from this file alone.
+    optim = AdamOptimizer(
+        learning_rate=args.base_learning_rate,
+        parameter_list=model.parameters())
+
+    model.mode = "test"
+    model.prepare(inputs=inputs)
+
+    model.load(args.init_from_checkpoint, skip_mismatch=True)
+
+    # The context manager closes the file even if prediction fails midway.
+    with open(args.output_file, "wb") as f:
+        for data in predict_dataset():
+            # A one-element batch wraps the actual fields; presumably this
+            # is the static-graph loader's format - TODO confirm.
+            input_data = data[0] if len(data) == 1 else data
+            results, length = model.test(inputs=flatten(input_data))
+            for decoded, word_len in zip(results, length):
+                # Keep only the first word_len entries of each decoded row.
+                tags = [
+                    dataset.id2label_dict[str(tag_id)]
+                    for tag_id in decoded[:word_len]
+                ]
+                line = "\002".join(tags) + "\n"
+                # The file is binary: text must be encoded before writing
+                # (Python 3 rejects str here), while values that are
+                # already bytes (Python 2 str) are written unchanged.
+                if not isinstance(line, bytes):
+                    line = line.encode("utf8")
+                f.write(line)
+
+
+if __name__ == '__main__':
+    # Build and echo the configuration backed by sequence_tagging.yaml.
+    args = PDConfig(yaml_file="sequence_tagging.yaml")
+    args.build()
+    args.Print()
+
+    # Run the environment checks before any prediction work starts.
+    check_gpu(args.device == "gpu")
+    check_version()
+    main(args)
diff --git a/sequence_tagging/reader.py → examples/sequence_tagging/reader.py b/sequence_tagging/reader.py → examples/sequence_tagging/reader.py