Commit

Update express_ner example (PaddlePaddle#63)
* Update express_ner example

* update run_bigru_crf

* fix msra_ner example
smallv0221 committed Mar 4, 2021
1 parent d37bdfc commit c3c3462
Showing 10 changed files with 78 additions and 72 deletions.
2 changes: 1 addition & 1 deletion docs/datasets.md
@@ -47,7 +47,7 @@ PaddleNLP提供了

| 数据集名称 | 简介 | 调用方法 |
| ---- | --------- | ------ |
- | [CSSE COVID-19](../examples/time_series) |约翰·霍普金斯大学系统科学与工程中心新冠病例数据 | [time_series](https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/examples/time_series)|
+ | [CSSE COVID-19](https://github.com/CSSEGISandData/COVID-19) |约翰·霍普金斯大学系统科学与工程中心新冠病例数据 | [time_series](../examples/time_series)|
| [UCIHousing](https://archive.ics.uci.edu/ml/datasets/Housing) | 波士顿房价预测数据集 | `paddle.text.datasets.UCIHousing`|

## 语料库
2 changes: 1 addition & 1 deletion examples/glue/run_glue.py
@@ -39,7 +39,7 @@
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)

- TASK_CLASSES = {
+ METRIC_CLASSES = {
"cola": Mcc,
"sst-2": Accuracy,
"mrpc": AccuracyAndF1,
64 changes: 31 additions & 33 deletions examples/named_entity_recognition/express_ner/run_bigru_crf.py
@@ -15,21 +15,21 @@
import paddle
import paddle.nn as nn

+ from paddlenlp.datasets import MapDataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.layers import LinearChainCrf, ViterbiDecoder, LinearChainCrfLoss
from paddlenlp.metrics import ChunkEvaluator
from paddlenlp.embeddings import TokenEmbedding


- def parse_decodes(ds, decodes, lens):
+ def parse_decodes(ds, decodes, lens, label_vocab):
decodes = [x for batch in decodes for x in batch]
lens = [x for batch in lens for x in batch]
- id_word = dict(zip(ds.word_vocab.values(), ds.word_vocab.keys()))
- id_label = dict(zip(ds.label_vocab.values(), ds.label_vocab.keys()))
+ id_label = dict(zip(label_vocab.values(), label_vocab.keys()))

outputs = []
for idx, end in enumerate(lens):
- sent = [id_word[x] for x in ds.word_ids[idx][:end]]
+ sent = ds.data[idx][0][:end]
tags = [id_label[x] for x in decodes[idx][:end]]
sent_out = []
tags_out = []
@@ -66,33 +66,20 @@ def load_dict(dict_path):
return vocab


- class ExpressDataset(paddle.io.Dataset):
- def __init__(self, data_path):
- self.word_vocab = load_dict('./conf/word.dic')
- self.label_vocab = load_dict('./conf/tag.dic')
- self.word_ids = []
- self.label_ids = []
+ def load_dataset(datafiles):
+ def read(data_path):
with open(data_path, 'r', encoding='utf-8') as fp:
next(fp)
for line in fp.readlines():
words, labels = line.strip('\n').split('\t')
words = words.split('\002')
labels = labels.split('\002')
- sub_word_ids = convert_tokens_to_ids(words, self.word_vocab,
- 'OOV')
- sub_label_ids = convert_tokens_to_ids(labels, self.label_vocab,
- 'O')
- self.word_ids.append(sub_word_ids)
- self.label_ids.append(sub_label_ids)
- self.word_num = max(self.word_vocab.values()) + 1
- self.label_num = max(self.label_vocab.values()) + 1
+ yield words, labels

- def __len__(self):
- return len(self.word_ids)
-
- def __getitem__(self, index):
- return self.word_ids[index], len(self.word_ids[index]), self.label_ids[
- index]
+ if isinstance(datafiles, str):
+ return MapDataset(list(read(datafiles)))
+ elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
+ return [MapDataset(list(read(datafile))) for datafile in datafiles]


class BiGRUWithCRF(nn.Layer):
@@ -127,14 +114,26 @@ def forward(self, x, lens):
if __name__ == '__main__':
paddle.set_device('gpu')

- train_ds = ExpressDataset('./data/train.txt')
- dev_ds = ExpressDataset('./data/dev.txt')
- test_ds = ExpressDataset('./data/test.txt')
+ train_ds, dev_ds, test_ds = load_dataset(datafiles=(
+ './data/train.txt', './data/dev.txt', './data/test.txt'))

+ label_vocab = load_dict('./conf/tag.dic')
+ word_vocab = load_dict('./conf/word.dic')
+
+ def convert_example(example):
+ tokens, labels = example
+ tokens_ids = convert_tokens_to_ids(tokens, word_vocab, 'OOV')
+ label_ids = convert_tokens_to_ids(labels, label_vocab, 'O')
+ return tokens_ids, len(tokens_ids), label_ids
+
+ train_ds.map(convert_example)
+ dev_ds.map(convert_example)
+ test_ds.map(convert_example)

batchify_fn = lambda samples, fn=Tuple(
- Pad(axis=0, pad_val=train_ds.word_vocab.get('OOV')),
+ Pad(axis=0, pad_val=word_vocab.get('OOV')),
Stack(),
- Pad(axis=0, pad_val=train_ds.label_vocab.get('O'))
+ Pad(axis=0, pad_val=label_vocab.get('O'))
): fn(samples)

train_loader = paddle.io.DataLoader(
@@ -159,14 +158,13 @@ def forward(self, x, lens):
return_list=True,
collate_fn=batchify_fn)

- network = BiGRUWithCRF(300, 300, train_ds.word_num, train_ds.label_num)
+ network = BiGRUWithCRF(300, 300, len(word_vocab), len(label_vocab))
model = paddle.Model(network)

optimizer = paddle.optimizer.Adam(
learning_rate=0.001, parameters=model.parameters())
crf_loss = LinearChainCrfLoss(network.crf)
- chunk_evaluator = ChunkEvaluator(
- label_list=train_ds.label_vocab.keys(), suffix=True)
+ chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
model.prepare(optimizer, crf_loss, chunk_evaluator)

model.fit(train_data=train_loader,
Expand All @@ -177,7 +175,7 @@ def forward(self, x, lens):

model.evaluate(eval_data=test_loader)
outputs, lens, decodes = model.predict(test_data=test_loader)
- preds = parse_decodes(test_ds, decodes, lens)
+ preds = parse_decodes(test_ds, decodes, lens, label_vocab)

file_path = "bigru_results.txt"
with open(file_path, "w", encoding="utf8") as fout:
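Note on the run_bigru_crf.py refactor: the custom ExpressDataset class is replaced by a plain reader generator wrapped in MapDataset, and token-to-id conversion moves into a convert_example function applied with map(). The following minimal sketch is not part of the commit; it only shows how the refactored pieces fit together, assuming load_dict and convert_tokens_to_ids are defined as in the script and that the ./data and ./conf files exist.

from paddlenlp.datasets import MapDataset

# Read the raw file: the first line is a header, columns are separated by
# '\t' and tokens/labels within a column by '\002'.
def read(data_path):
    with open(data_path, 'r', encoding='utf-8') as fp:
        next(fp)  # skip header line
        for line in fp:
            words, labels = line.strip('\n').split('\t')
            yield words.split('\002'), labels.split('\002')

train_ds = MapDataset(list(read('./data/train.txt')))

word_vocab = load_dict('./conf/word.dic')   # token -> id
label_vocab = load_dict('./conf/tag.dic')   # tag -> id

def convert_example(example):
    tokens, labels = example
    token_ids = convert_tokens_to_ids(tokens, word_vocab, 'OOV')
    label_ids = convert_tokens_to_ids(labels, label_vocab, 'O')
    return token_ids, len(token_ids), label_ids

# map() converts every example in place, so the DataLoader built with
# batchify_fn receives (token_ids, length, label_ids) tuples.
train_ds.map(convert_example)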
60 changes: 29 additions & 31 deletions examples/named_entity_recognition/express_ner/run_ernie.py
@@ -14,19 +14,20 @@
from functools import partial

import paddle
+ from paddlenlp.datasets import MapDataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.transformers import ErnieTokenizer, ErnieForTokenClassification
from paddlenlp.metrics import ChunkEvaluator


- def parse_decodes(ds, decodes, lens):
+ def parse_decodes(ds, decodes, lens, label_vocab):
decodes = [x for batch in decodes for x in batch]
lens = [x for batch in lens for x in batch]
- id_label = dict(zip(ds.label_vocab.values(), ds.label_vocab.keys()))
+ id_label = dict(zip(label_vocab.values(), label_vocab.keys()))

outputs = []
for idx, end in enumerate(lens):
- sent = ds.word_ids[idx][:end]
+ sent = ds.data[idx][0][:end]
tags = [id_label[x] for x in decodes[idx][1:end]]
sent_out = []
tags_out = []
@@ -60,15 +61,15 @@ def evaluate(model, metric, data_loader):
(precision, recall, f1_score))


- def predict(model, data_loader, ds):
+ def predict(model, data_loader, ds, label_vocab):
pred_list = []
len_list = []
for input_ids, seg_ids, lens, labels in data_loader:
logits = model(input_ids, seg_ids)
pred = paddle.argmax(logits, axis=-1)
pred_list.append(pred.numpy())
len_list.append(lens.numpy())
- preds = parse_decodes(ds, pred_list, len_list)
+ preds = parse_decodes(ds, pred_list, len_list, label_vocab)
return preds


@@ -90,52 +91,49 @@ def load_dict(dict_path):
return vocab


- class ExpressDataset(paddle.io.Dataset):
- def __init__(self, data_path):
- self.label_vocab = load_dict('./conf/tag.dic')
- self.word_ids = []
- self.label_ids = []
+ def load_dataset(datafiles):
+ def read(data_path):
with open(data_path, 'r', encoding='utf-8') as fp:
next(fp)
for line in fp.readlines():
words, labels = line.strip('\n').split('\t')
words = words.split('\002')
labels = labels.split('\002')
- self.word_ids.append(words)
- self.label_ids.append(labels)
- self.label_num = max(self.label_vocab.values()) + 1
+ yield words, labels

- def __len__(self):
- return len(self.word_ids)
-
- def __getitem__(self, index):
- return self.word_ids[index], self.label_ids[index]
+ if isinstance(datafiles, str):
+ return MapDataset(list(read(datafiles)))
+ elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
+ return [MapDataset(list(read(datafile))) for datafile in datafiles]


if __name__ == '__main__':
paddle.set_device('gpu')
+ train_ds, dev_ds, test_ds = load_dataset(datafiles=(
+ './data/train.txt', './data/dev.txt', './data/test.txt'))

- train_ds = ExpressDataset('./data/train.txt')
- dev_ds = ExpressDataset('./data/dev.txt')
- test_ds = ExpressDataset('./data/test.txt')

label_vocab = load_dict('./conf/tag.dic')
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

trans_func = partial(
- convert_example, tokenizer=tokenizer, label_vocab=train_ds.label_vocab)
+ convert_example, tokenizer=tokenizer, label_vocab=label_vocab)

+ train_ds.map(trans_func)
+ dev_ds.map(trans_func)
+ test_ds.map(trans_func)

ignore_label = -1
batchify_fn = lambda samples, fn=Tuple(
- Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
- Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
+ Pad(axis=0, pad_val=tokenizer.pad_token_id),
+ Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
Stack(),
Pad(axis=0, pad_val=ignore_label)
- ): fn(list(map(trans_func, samples)))
+ ): fn(samples)

train_loader = paddle.io.DataLoader(
dataset=train_ds,
batch_size=200,
- shuffle=True,
+ shuffle=False,
return_list=True,
collate_fn=batchify_fn)
dev_loader = paddle.io.DataLoader(
@@ -150,9 +148,9 @@ def __getitem__(self, index):
collate_fn=batchify_fn)

model = ErnieForTokenClassification.from_pretrained(
"ernie-1.0", num_classes=train_ds.label_num)
"ernie-1.0", num_classes=len(label_vocab))

- metric = ChunkEvaluator(label_list=train_ds.label_vocab.keys(), suffix=True)
+ metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
optimizer = paddle.optimizer.AdamW(
learning_rate=2e-5, parameters=model.parameters())
Expand All @@ -163,7 +161,7 @@ def __getitem__(self, index):
for idx, (input_ids, token_type_ids, length,
labels) in enumerate(train_loader):
logits = model(input_ids, token_type_ids).reshape(
- [-1, train_ds.label_num])
+ [-1, len(label_vocab)])
loss = paddle.mean(loss_fn(logits, labels.reshape([-1])))
loss.backward()
optimizer.step()
Expand All @@ -175,7 +173,7 @@ def __getitem__(self, index):
paddle.save(model.state_dict(),
'./ernie_result/model_%d.pdparams' % step)

- preds = predict(model, test_loader, test_ds)
+ preds = predict(model, test_loader, test_ds, label_vocab)
file_path = "ernie_results.txt"
with open(file_path, "w", encoding="utf8") as fout:
fout.write("\n".join(preds))
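Note on the run_ernie.py refactor: besides switching to load_dataset/MapDataset, the example now applies trans_func once via map() instead of inside the collate function, and pads with tokenizer.pad_token_id / tokenizer.pad_token_type_id instead of looking up tokenizer.vocab[tokenizer.pad_token]. A minimal sketch of the new collation path, not part of the commit, assuming convert_example, tokenizer and the loaded datasets from the diff above:

from functools import partial
from paddlenlp.data import Stack, Tuple, Pad

trans_func = partial(convert_example, tokenizer=tokenizer, label_vocab=label_vocab)
train_ds.map(trans_func)  # convert examples once, before batching

ignore_label = -1
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Stack(),                                           # seq_len
    Pad(axis=0, pad_val=ignore_label)                  # labels
): fn(samples)  # samples are already id-converted, so no map() here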
2 changes: 1 addition & 1 deletion examples/named_entity_recognition/msra_ner/eval.py
@@ -79,7 +79,7 @@ def do_eval(args):
ignore_label = -100
batchify_fn = lambda samples, fn=Dict({
'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
- 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
+ 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
'seq_len': Stack(),
'labels': Pad(axis=0, pad_val=ignore_label) # label
}): fn(samples)
2 changes: 1 addition & 1 deletion examples/named_entity_recognition/msra_ner/predict.py
@@ -108,7 +108,7 @@ def do_predict(args):
ignore_label = -100
batchify_fn = lambda samples, fn=Dict({
'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
- 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
+ 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
'seq_len': Stack(),
'labels': Pad(axis=0, pad_val=ignore_label) # label
}): fn(samples)
2 changes: 1 addition & 1 deletion examples/named_entity_recognition/msra_ner/train.py
@@ -116,7 +116,7 @@ def do_train(args):

batchify_fn = lambda samples, fn=Dict({
'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # input
- 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id), # segment
+ 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment
'seq_len': Stack(),
'labels': Pad(axis=0, pad_val=ignore_label) # label
}): fn(samples)
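Note on the msra_ner fix: the three scripts previously padded the token_type_ids column with tokenizer.pad_token_id (the vocabulary id of the pad token) instead of tokenizer.pad_token_type_id (the segment id used for padding). The two values often coincide at 0, but they are distinct attributes, so the segment column should use pad_token_type_id. A small illustration, not part of the commit, reusing the 'ernie-1.0' tokenizer that appears elsewhere in this change:

from paddlenlp.transformers import ErnieTokenizer

# Illustration only: compare the two padding values. They may both be 0 for
# a given tokenizer, but pad_token_type_id is the semantically correct value
# for the token_type_ids (segment) column.
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
print(tokenizer.pad_token_id)        # vocabulary id of the pad token
print(tokenizer.pad_token_type_id)   # padding value for token_type_ids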
2 changes: 1 addition & 1 deletion paddlenlp/datasets/__init__.py
@@ -26,4 +26,4 @@
from .cnndm import *
from .poetry import *
from .couplet import *
- from .experimental import load_dataset, DatasetBuilder
+ from .experimental import load_dataset, DatasetBuilder, MapDataset, IterDataset
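With MapDataset and IterDataset now re-exported from paddlenlp.datasets, the refactored examples above can import MapDataset directly. A tiny illustration, not part of the commit (the example data is hypothetical):

from paddlenlp.datasets import MapDataset

# MapDataset wraps any in-memory list of examples, as in the load_dataset
# helpers of the express_ner scripts above.
ds = MapDataset([(['word'], ['O'])])
print(len(ds), ds[0])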
7 changes: 6 additions & 1 deletion paddlenlp/datasets/experimental/__init__.py
@@ -17,4 +17,9 @@
from .ptb import *
from .squad import *
from .peoples_daily_ner import *
- from .poetry import *
+ from .poetry import *
+ from .cmrc2018 import *
+ from .drcd import *
+ from .dureader_robust import *
+ from .glue import *
+ from .wmt14ende import *
7 changes: 6 additions & 1 deletion paddlenlp/datasets/experimental/dataset.py
@@ -53,7 +53,12 @@ def import_main_class(module_path):
return module_main_cls


- def load_dataset(path, name=None, data_files=None, splits=None, lazy=None):
+ def load_dataset(path,
+ name=None,
+ data_files=None,
+ splits=None,
+ lazy=None,
+ **kwargs):
module_path = DATASETS_MODULE_PATH + path

reader_cls = import_main_class(module_path)
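The widened load_dataset signature adds **kwargs on top of the existing parameters; the extra keyword arguments are likely forwarded to the individual dataset reader, though that code is outside the shown hunk. A hedged usage sketch of the existing splits parameter, assuming the experimental API returns one dataset per requested split and using the peoples_daily_ner reader already registered above:

from paddlenlp.datasets import load_dataset

# Assumption: a tuple of split names yields one dataset per split.
train_ds, dev_ds = load_dataset('peoples_daily_ner', splits=('train', 'dev'))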
