diff --git a/docs/datasets.md b/docs/datasets.md
index dfad3ed6bf88c..93b3a96410f07 100644
--- a/docs/datasets.md
+++ b/docs/datasets.md
@@ -47,7 +47,7 @@ PaddleNLP提供了
 
 | 数据集名称 | 简介 | 调用方法 |
 | ---- | --------- | ------ |
-| [CSSE COVID-19](../examples/time_series) |约翰·霍普金斯大学系统科学与工程中心新冠病例数据 | [time_series](https://github.com/PaddlePaddle/models/tree/develop/PaddleNLP/examples/time_series)|
+| [CSSE COVID-19](https://github.com/CSSEGISandData/COVID-19) |约翰·霍普金斯大学系统科学与工程中心新冠病例数据 | [time_series](../examples/time_series)|
 | [UCIHousing](https://archive.ics.uci.edu/ml/datasets/Housing) | 波士顿房价预测数据集 | `paddle.text.datasets.UCIHousing`|
 
 ## 语料库
diff --git a/examples/glue/run_glue.py b/examples/glue/run_glue.py
index 1ef08166d4436..c5225cf716fbb 100644
--- a/examples/glue/run_glue.py
+++ b/examples/glue/run_glue.py
@@ -39,7 +39,7 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 logger = logging.getLogger(__name__)
 
-TASK_CLASSES = {
+METRIC_CLASSES = {
     "cola": Mcc,
     "sst-2": Accuracy,
     "mrpc": AccuracyAndF1,
diff --git a/examples/named_entity_recognition/express_ner/run_bigru_crf.py b/examples/named_entity_recognition/express_ner/run_bigru_crf.py
index 969399dd557fa..fb3b7d1b3dfd7 100644
--- a/examples/named_entity_recognition/express_ner/run_bigru_crf.py
+++ b/examples/named_entity_recognition/express_ner/run_bigru_crf.py
@@ -15,21 +15,21 @@
 import paddle
 import paddle.nn as nn
 
+from paddlenlp.datasets import MapDataset
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.layers import LinearChainCrf, ViterbiDecoder, LinearChainCrfLoss
 from paddlenlp.metrics import ChunkEvaluator
 from paddlenlp.embeddings import TokenEmbedding
 
 
-def parse_decodes(ds, decodes, lens):
+def parse_decodes(ds, decodes, lens, label_vocab):
     decodes = [x for batch in decodes for x in batch]
     lens = [x for batch in lens for x in batch]
-    id_word = dict(zip(ds.word_vocab.values(), ds.word_vocab.keys()))
-    id_label = dict(zip(ds.label_vocab.values(), ds.label_vocab.keys()))
+    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))
 
     outputs = []
     for idx, end in enumerate(lens):
-        sent = [id_word[x] for x in ds.word_ids[idx][:end]]
+        sent = ds.data[idx][0][:end]
         tags = [id_label[x] for x in decodes[idx][:end]]
         sent_out = []
         tags_out = []
@@ -66,33 +66,20 @@ def load_dict(dict_path):
     return vocab
 
 
-class ExpressDataset(paddle.io.Dataset):
-    def __init__(self, data_path):
-        self.word_vocab = load_dict('./conf/word.dic')
-        self.label_vocab = load_dict('./conf/tag.dic')
-        self.word_ids = []
-        self.label_ids = []
+def load_dataset(datafiles):
+    def read(data_path):
         with open(data_path, 'r', encoding='utf-8') as fp:
             next(fp)
             for line in fp.readlines():
                 words, labels = line.strip('\n').split('\t')
                 words = words.split('\002')
                 labels = labels.split('\002')
-                sub_word_ids = convert_tokens_to_ids(words, self.word_vocab,
-                                                     'OOV')
-                sub_label_ids = convert_tokens_to_ids(labels, self.label_vocab,
-                                                      'O')
-                self.word_ids.append(sub_word_ids)
-                self.label_ids.append(sub_label_ids)
-        self.word_num = max(self.word_vocab.values()) + 1
-        self.label_num = max(self.label_vocab.values()) + 1
+                yield words, labels
 
-    def __len__(self):
-        return len(self.word_ids)
-
-    def __getitem__(self, index):
-        return self.word_ids[index], len(self.word_ids[index]), self.label_ids[
-            index]
+    if isinstance(datafiles, str):
+        return MapDataset(list(read(datafiles)))
+    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
+        return [MapDataset(list(read(datafile))) for datafile in datafiles]
 
 
 class BiGRUWithCRF(nn.Layer):
@@ -127,14 +114,26 @@ def forward(self, x, lens):
 
 if __name__ == '__main__':
     paddle.set_device('gpu')
-    train_ds = ExpressDataset('./data/train.txt')
-    dev_ds = ExpressDataset('./data/dev.txt')
-    test_ds = ExpressDataset('./data/test.txt')
+    train_ds, dev_ds, test_ds = load_dataset(datafiles=(
+        './data/train.txt', './data/dev.txt', './data/test.txt'))
+
+    label_vocab = load_dict('./conf/tag.dic')
+    word_vocab = load_dict('./conf/word.dic')
+
+    def convert_example(example):
+        tokens, labels = example
+        tokens_ids = convert_tokens_to_ids(tokens, word_vocab, 'OOV')
+        label_ids = convert_tokens_to_ids(labels, label_vocab, 'O')
+        return tokens_ids, len(tokens_ids), label_ids
+
+    train_ds.map(convert_example)
+    dev_ds.map(convert_example)
+    test_ds.map(convert_example)
 
     batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=train_ds.word_vocab.get('OOV')),
+        Pad(axis=0, pad_val=word_vocab.get('OOV')),
         Stack(),
-        Pad(axis=0, pad_val=train_ds.label_vocab.get('O'))
+        Pad(axis=0, pad_val=label_vocab.get('O'))
     ): fn(samples)
 
     train_loader = paddle.io.DataLoader(
@@ -159,14 +158,13 @@ def forward(self, x, lens):
         return_list=True,
         collate_fn=batchify_fn)
 
-    network = BiGRUWithCRF(300, 300, train_ds.word_num, train_ds.label_num)
+    network = BiGRUWithCRF(300, 300, len(word_vocab), len(label_vocab))
     model = paddle.Model(network)
 
     optimizer = paddle.optimizer.Adam(
         learning_rate=0.001, parameters=model.parameters())
     crf_loss = LinearChainCrfLoss(network.crf)
-    chunk_evaluator = ChunkEvaluator(
-        label_list=train_ds.label_vocab.keys(), suffix=True)
+    chunk_evaluator = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
     model.prepare(optimizer, crf_loss, chunk_evaluator)
 
     model.fit(train_data=train_loader,
@@ -177,7 +175,7 @@ def forward(self, x, lens):
     model.evaluate(eval_data=test_loader)
 
     outputs, lens, decodes = model.predict(test_data=test_loader)
-    preds = parse_decodes(test_ds, decodes, lens)
+    preds = parse_decodes(test_ds, decodes, lens, label_vocab)
 
     file_path = "bigru_results.txt"
     with open(file_path, "w", encoding="utf8") as fout:
diff --git a/examples/named_entity_recognition/express_ner/run_ernie.py b/examples/named_entity_recognition/express_ner/run_ernie.py
index 7a6c6eef7385c..cc5849f1e2be9 100644
--- a/examples/named_entity_recognition/express_ner/run_ernie.py
+++ b/examples/named_entity_recognition/express_ner/run_ernie.py
@@ -14,19 +14,20 @@
 from functools import partial
 
 import paddle
+from paddlenlp.datasets import MapDataset
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.transformers import ErnieTokenizer, ErnieForTokenClassification
 from paddlenlp.metrics import ChunkEvaluator
 
 
-def parse_decodes(ds, decodes, lens):
+def parse_decodes(ds, decodes, lens, label_vocab):
     decodes = [x for batch in decodes for x in batch]
     lens = [x for batch in lens for x in batch]
-    id_label = dict(zip(ds.label_vocab.values(), ds.label_vocab.keys()))
+    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))
 
     outputs = []
     for idx, end in enumerate(lens):
-        sent = ds.word_ids[idx][:end]
+        sent = ds.data[idx][0][:end]
         tags = [id_label[x] for x in decodes[idx][1:end]]
         sent_out = []
         tags_out = []
@@ -60,7 +61,7 @@ def evaluate(model, metric, data_loader):
           (precision, recall, f1_score))
 
 
-def predict(model, data_loader, ds):
+def predict(model, data_loader, ds, label_vocab):
     pred_list = []
     len_list = []
     for input_ids, seg_ids, lens, labels in data_loader:
@@ -68,7 +69,7 @@ def predict(model, data_loader, ds):
         logits = model(input_ids, seg_ids)
         pred = paddle.argmax(logits, axis=-1)
         pred_list.append(pred.numpy())
         len_list.append(lens.numpy())
-    preds = parse_decodes(ds, pred_list, len_list)
+    preds = parse_decodes(ds, pred_list, len_list, label_vocab)
     return preds
 
@@ -90,52 +91,49 @@ def load_dict(dict_path):
     return vocab
 
 
-class ExpressDataset(paddle.io.Dataset):
-    def __init__(self, data_path):
-        self.label_vocab = load_dict('./conf/tag.dic')
-        self.word_ids = []
-        self.label_ids = []
+def load_dataset(datafiles):
+    def read(data_path):
         with open(data_path, 'r', encoding='utf-8') as fp:
             next(fp)
             for line in fp.readlines():
                 words, labels = line.strip('\n').split('\t')
                 words = words.split('\002')
                 labels = labels.split('\002')
-                self.word_ids.append(words)
-                self.label_ids.append(labels)
-        self.label_num = max(self.label_vocab.values()) + 1
+                yield words, labels
 
-    def __len__(self):
-        return len(self.word_ids)
-
-    def __getitem__(self, index):
-        return self.word_ids[index], self.label_ids[index]
+    if isinstance(datafiles, str):
+        return MapDataset(list(read(datafiles)))
+    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):
+        return [MapDataset(list(read(datafile))) for datafile in datafiles]
 
 
 if __name__ == '__main__':
     paddle.set_device('gpu')
+    train_ds, dev_ds, test_ds = load_dataset(datafiles=(
+        './data/train.txt', './data/dev.txt', './data/test.txt'))
 
-    train_ds = ExpressDataset('./data/train.txt')
-    dev_ds = ExpressDataset('./data/dev.txt')
-    test_ds = ExpressDataset('./data/test.txt')
-
+    label_vocab = load_dict('./conf/tag.dic')
     tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
 
     trans_func = partial(
-        convert_example, tokenizer=tokenizer, label_vocab=train_ds.label_vocab)
+        convert_example, tokenizer=tokenizer, label_vocab=label_vocab)
+
+    train_ds.map(trans_func)
+    dev_ds.map(trans_func)
+    test_ds.map(trans_func)
 
     ignore_label = -1
     batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
-        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
+        Pad(axis=0, pad_val=tokenizer.pad_token_id),
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
         Stack(),
         Pad(axis=0, pad_val=ignore_label)
-    ): fn(list(map(trans_func, samples)))
+    ): fn(samples)
 
     train_loader = paddle.io.DataLoader(
         dataset=train_ds,
         batch_size=200,
-        shuffle=True,
+        shuffle=False,
         return_list=True,
         collate_fn=batchify_fn)
     dev_loader = paddle.io.DataLoader(
@@ -150,9 +148,9 @@ def __getitem__(self, index):
         collate_fn=batchify_fn)
 
     model = ErnieForTokenClassification.from_pretrained(
-        "ernie-1.0", num_classes=train_ds.label_num)
+        "ernie-1.0", num_classes=len(label_vocab))
 
-    metric = ChunkEvaluator(label_list=train_ds.label_vocab.keys(), suffix=True)
+    metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
     loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
     optimizer = paddle.optimizer.AdamW(
         learning_rate=2e-5, parameters=model.parameters())
@@ -163,7 +161,7 @@ def __getitem__(self, index):
         for idx, (input_ids, token_type_ids, length,
                   labels) in enumerate(train_loader):
             logits = model(input_ids, token_type_ids).reshape(
-                [-1, train_ds.label_num])
+                [-1, len(label_vocab)])
             loss = paddle.mean(loss_fn(logits, labels.reshape([-1])))
             loss.backward()
             optimizer.step()
@@ -175,7 +173,7 @@ def __getitem__(self, index):
         paddle.save(model.state_dict(),
                     './ernie_result/model_%d.pdparams' % step)
 
-    preds = predict(model, test_loader, test_ds)
+    preds = predict(model, test_loader, test_ds, label_vocab)
     file_path = "ernie_results.txt"
     with open(file_path, "w", encoding="utf8") as fout:
         fout.write("\n".join(preds))
diff --git a/examples/named_entity_recognition/msra_ner/eval.py b/examples/named_entity_recognition/msra_ner/eval.py
index 95548d325998d..dd26d4cdd2f20 100644
--- a/examples/named_entity_recognition/msra_ner/eval.py
+++ b/examples/named_entity_recognition/msra_ner/eval.py
@@ -79,7 +79,7 @@ def do_eval(args):
     ignore_label = -100
     batchify_fn = lambda samples, fn=Dict({
         'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
-        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
+        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
         'seq_len': Stack(),
         'labels': Pad(axis=0, pad_val=ignore_label)  # label
     }): fn(samples)
diff --git a/examples/named_entity_recognition/msra_ner/predict.py b/examples/named_entity_recognition/msra_ner/predict.py
index 5386739b8021a..eec462ca4d570 100644
--- a/examples/named_entity_recognition/msra_ner/predict.py
+++ b/examples/named_entity_recognition/msra_ner/predict.py
@@ -108,7 +108,7 @@ def do_predict(args):
     ignore_label = -100
     batchify_fn = lambda samples, fn=Dict({
         'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
-        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
+        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
         'seq_len': Stack(),
         'labels': Pad(axis=0, pad_val=ignore_label)  # label
     }): fn(samples)
diff --git a/examples/named_entity_recognition/msra_ner/train.py b/examples/named_entity_recognition/msra_ner/train.py
index d33b2b95a6ff1..bc81c59520803 100644
--- a/examples/named_entity_recognition/msra_ner/train.py
+++ b/examples/named_entity_recognition/msra_ner/train.py
@@ -116,7 +116,7 @@ def do_train(args):
 
     batchify_fn = lambda samples, fn=Dict({
         'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
-        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
+        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
         'seq_len': Stack(),
         'labels': Pad(axis=0, pad_val=ignore_label)  # label
     }): fn(samples)
diff --git a/paddlenlp/datasets/__init__.py b/paddlenlp/datasets/__init__.py
index 4fa0069488da8..17f60c1e56a37 100644
--- a/paddlenlp/datasets/__init__.py
+++ b/paddlenlp/datasets/__init__.py
@@ -26,4 +26,4 @@
 from .cnndm import *
 from .poetry import *
 from .couplet import *
-from .experimental import load_dataset, DatasetBuilder
\ No newline at end of file
+from .experimental import load_dataset, DatasetBuilder, MapDataset, IterDataset
\ No newline at end of file
diff --git a/paddlenlp/datasets/experimental/__init__.py b/paddlenlp/datasets/experimental/__init__.py
index 78c2fb429a26b..d220878ab1dd1 100644
--- a/paddlenlp/datasets/experimental/__init__.py
+++ b/paddlenlp/datasets/experimental/__init__.py
@@ -17,4 +17,9 @@
 from .ptb import *
 from .squad import *
 from .peoples_daily_ner import *
-from .poetry import *
\ No newline at end of file
+from .poetry import *
+from .cmrc2018 import *
+from .drcd import *
+from .dureader_robust import *
+from .glue import *
+from .wmt14ende import *
\ No newline at end of file
diff --git a/paddlenlp/datasets/experimental/dataset.py b/paddlenlp/datasets/experimental/dataset.py
index 6655b0a363747..7b03b7bf2ffa8 100644
--- a/paddlenlp/datasets/experimental/dataset.py
+++ b/paddlenlp/datasets/experimental/dataset.py
@@ -53,7 +53,12 @@ def import_main_class(module_path):
     return module_main_cls
 
 
-def load_dataset(path, name=None, data_files=None, splits=None, lazy=None):
+def load_dataset(path,
+                 name=None,
+                 data_files=None,
+                 splits=None,
+                 lazy=None,
+                 **kwargs):
     module_path = DATASETS_MODULE_PATH + path
     reader_cls = import_main_class(module_path)