Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#45 from 0YuanZhang0/sequence_tagging
Browse files Browse the repository at this point in the history
Sequence tagging
  • Loading branch information
xyzhou-puck authored Apr 19, 2020
2 parents c54980b + 6462645 commit 33ece5d
Show file tree
Hide file tree
Showing 18 changed files with 613 additions and 508 deletions.
84 changes: 27 additions & 57 deletions sequence_tagging/README.md → examples/sequence_tagging/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ python downloads.py dataset
我们开源了在自建数据集上训练的词法分析模型,可供用户直接使用,可通过下述链接进行下载:
```bash
# download baseline model
python downloads.py lac
python downloads.py model
```

### 模型训练
Expand All @@ -66,84 +66,54 @@ GPU上单卡训练
export CUDA_VISIBLE_DEVICES=0
python -u train.py \
--train_file ./data/train.tsv \
--test_file ./data/test.tsv \
--word_dict_path ./conf/word.dic \
--label_dict_path ./conf/tag.dic \
--word_rep_dict_path ./conf/q2b.dic \
--device gpu \
--grnn_hidden_dim 128 \
--word_emb_dim 128 \
--bigru_num 2 \
--base_learning_rate 1e-3 \
--batch_size 300 \
--epoch 10 \
--save_dir ./model \
--num_devices 1 \
-d
# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
--dynamic False
# --device: 使用gpu设备还是cpu设备
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False
```

GPU上多卡训练

```
# setting visible devices for training
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 train.py \
--train_file ./data/train.tsv \
--test_file ./data/test.tsv \
--word_dict_path ./conf/word.dic \
--label_dict_path ./conf/tag.dic \
--word_rep_dict_path ./conf/q2b.dic \
--device gpu \
--grnn_hidden_dim 128 \
--word_emb_dim 128 \
--bigru_num 2 \
--base_learning_rate 1e-3 \
--batch_size 300 \
--epoch 10 \
--save_dir ./model \
-d
--dynamic False
# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
# --device: 使用gpu设备还是cpu设备
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False
```

CPU上训练

```
python -u train.py \
--train_file ./data/train.tsv \
--test_file ./data/test.tsv \
--word_dict_path ./conf/word.dic \
--label_dict_path ./conf/tag.dic \
--word_rep_dict_path ./conf/q2b.dic \
--device cpu \
--grnn_hidden_dim 128 \
--word_emb_dim 128 \
--bigru_num 2 \
--base_learning_rate 1e-3 \
--batch_size 300 \
--epoch 10 \
--save_dir ./model \
-d
--dynamic False
# --device: 使用gpu设备还是cpu设备
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False
```

### 模型预测

加载已有的模型,对未知的数据进行预测
```bash
python predict.py \
--predict_file ./data/infer.tsv \
--word_dict_path ./conf/word.dic \
--label_dict_path ./conf/tag.dic \
--word_rep_dict_path ./conf/q2b.dic \
--init_from_checkpoint model_baseline/params \
--output_file predict.result \
--mode predict \
--device cpu \
-d

# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
--dynamic False

# --init_from_checkpoint: 初始化模型
# --output_file: 预测结果文件
# --device: 使用gpu还是cpu设备
# --mode: 开启模式, 设置为train时,进行训练,设置为predict时进行预测
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False
```

### 模型评估
Expand All @@ -152,15 +122,15 @@ python predict.py \
```bash
# baseline model
python eval.py \
--test_file ./data/test.tsv \
--word_dict_path ./conf/word.dic \
--label_dict_path ./conf/tag.dic \
--word_rep_dict_path ./conf/q2b.dic \
--init_from_checkpoint ./model_baseline/params \
--mode predict \
--device cpu \
-d
--dynamic False

# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
# --init_from_checkpoint: 初始化模型
# --device: 使用gpu还是cpu设备
# --mode: 开启模式, 设置为train时,进行训练,设置为predict时进行预测
# --dynamic: 是否使用动态图模式进行训练,如果使用静态图训练,设置为True, 动态图设置为False
```


Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,19 @@
'name': 'lexical_analysis-dataset-2.0.0.tar.gz',
'md5': '71e4a9a36d0f0177929a1bccedca7dba'
},
'LAC_MODEL': {
'name': 'lexical_analysis-2.0.0.tar.gz',
'md5': "fc1daef00de9564083c7dc7b600504ca"
'MODEL': {
'name': 'sequence_tagging_dy.tar.gz',
'md5': "1125d374c03c8218b6e47325dcf607e3"
},
}


def usage():
desc = ("\nDownload datasets and pretrained models for LAC.\n"
desc = ("\nDownload datasets and pretrained models for sequence tagging.\n"
"Usage:\n"
" 1. python download.py all\n"
" 2. python download.py dataset\n"
" 3. python download.py lac\n")
" 3. python download.py model\n")
print(desc)


Expand Down Expand Up @@ -136,13 +136,13 @@ def download(name, dir_path):

if sys.argv[1] == 'all':
download('DATA', pwd)
download('LAC_MODEL', pwd)
download('MODEL', pwd)

if sys.argv[1] == "dataset":
download('DATA', pwd)

elif sys.argv[1] == "lac":
download('LAC_MODEL', pwd)
elif sys.argv[1] == "model":
download('MODEL', pwd)

else:
usage()
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ if [ -d ./model_baseline/ ]
then
echo "./model_baseline/ directory already existed, ignore download"
else
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/lexical_analysis-2.0.0.tar.gz
tar xvf lexical_analysis-2.0.0.tar.gz
/bin/rm lexical_analysis-2.0.0.tar.gz
wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/sequence_tagging_dy.tar.gz
tar xvf sequence_tagging_dy.tar.gz
/bin/rm sequence_tagging_dy.tar.gz
fi

# download dataset file to ./data/
Expand Down
99 changes: 99 additions & 0 deletions examples/sequence_tagging/eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging network structure
"""

from __future__ import division
from __future__ import print_function

import io
import os
import sys
import math
import argparse
import numpy as np

from train import SeqTagging
from utils.configure import PDConfig
from utils.check import check_gpu, check_version
from utils.metrics import chunk_count
from reader import LacDataset, create_lexnet_data_generator, create_dataloader

work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(work_dir, "../"))
from hapi.model import set_device, Input

import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.layers.utils import flatten


def main(args):
place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None

inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length')]

feed_list = None if args.dynamic else [x.forward() for x in inputs]
dataset = LacDataset(args)
eval_path = args.test_file

chunk_evaluator = fluid.metrics.ChunkEvaluator()
chunk_evaluator.reset()

eval_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=eval_path, place=place, mode="test")

eval_dataset = create_dataloader(
eval_generator, place, feed_list=feed_list)

vocab_size = dataset.vocab_size
num_labels = dataset.num_labels
model = SeqTagging(args, vocab_size, num_labels)

optim = AdamOptimizer(
learning_rate=args.base_learning_rate,
parameter_list=model.parameters())

model.mode = "test"
model.prepare(inputs=inputs)
model.load(args.init_from_checkpoint, skip_mismatch=True)

for data in eval_dataset():
if len(data) == 1:
batch_data = data[0]
targets = np.array(batch_data[2])
else:
batch_data = data
targets = batch_data[2].numpy()
inputs_data = [batch_data[0], batch_data[1]]
crf_decode, length = model.test(inputs=inputs_data)
num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_count(crf_decode, targets, length, dataset.id2label_dict)
chunk_evaluator.update(num_infer_chunks, num_label_chunks, num_correct_chunks)

precision, recall, f1 = chunk_evaluator.eval()
print("[test] P: %.5f, R: %.5f, F1: %.5f" % (precision, recall, f1))


if __name__ == '__main__':
args = PDConfig(yaml_file="sequence_tagging.yaml")
args.build()
args.Print()

use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version()
main(args)
File renamed without changes
94 changes: 94 additions & 0 deletions examples/sequence_tagging/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
SequenceTagging network structure
"""

from __future__ import division
from __future__ import print_function

import io
import os
import sys
import math
import argparse
import numpy as np

from train import SeqTagging
from utils.check import check_gpu, check_version
from utils.configure import PDConfig
from reader import LacDataset, create_lexnet_data_generator, create_dataloader

work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.join(work_dir, "../"))
from hapi.model import set_device, Input

import paddle.fluid as fluid
from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.layers.utils import flatten


def main(args):
place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None

inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length')]

feed_list = None if args.dynamic else [x.forward() for x in inputs]
dataset = LacDataset(args)
predict_path = args.predict_file

predict_generator = create_lexnet_data_generator(
args, reader=dataset, file_name=predict_path, place=place, mode="predict")

predict_dataset = create_dataloader(
predict_generator, place, feed_list=feed_list)

vocab_size = dataset.vocab_size
num_labels = dataset.num_labels
model = SeqTagging(args, vocab_size, num_labels)

optim = AdamOptimizer(
learning_rate=args.base_learning_rate,
parameter_list=model.parameters())

model.mode = "test"
model.prepare(inputs=inputs)

model.load(args.init_from_checkpoint, skip_mismatch=True)

f = open(args.output_file, "wb")
for data in predict_dataset():
if len(data) == 1:
input_data = data[0]
else:
input_data = data
results, length = model.test(inputs=flatten(input_data))
for i in range(len(results)):
word_len = length[i]
word_ids = results[i][: word_len]
tags = [dataset.id2label_dict[str(id)] for id in word_ids]
f.write("\002".join(tags) + "\n")


if __name__ == '__main__':
args = PDConfig(yaml_file="sequence_tagging.yaml")
args.build()
args.Print()

use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version()
main(args)
File renamed without changes.
Loading

0 comments on commit 33ece5d

Please sign in to comment.