Merge branch 'develop' into add_bert_base_on_ipu
gglin001 committed Mar 23, 2022
2 parents d674e7e + 6d102f5 commit cf13496
Showing 20 changed files with 425 additions and 166 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -13,9 +13,9 @@

## News <img src="./docs/imgs/news_icon.png" width="40"/>

* [2022-03-21] PaddleNLP's **one-click prediction tool** [Taskflow](./docs/model_zoo/taskflow.md) has been fully upgraded! 🚀 Try its richer features and simpler usage, including the newly added Chinese word segmentation and named entity recognition modes for different scenarios (a minimal usage sketch follows this list)!
* [2021-12-28] PaddleNLP released new **semantic retrieval, question answering, opinion extraction, and sentiment analysis** [industrial application examples](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications), so you can 🚀 quickly build a complete system! A companion video course is available [here](https://aistudio.baidu.com/aistudio/course/introduce/24902).
* [2021-12-12] PaddleNLP v2.2 has been released! :tada: Try faster text processing with [FasterTokenizer](./examples/faster/faster_ernie), the faster pre-trained model [FasterERNIE](./examples/faster/faster_ernie), and faster text generation with [FasterGeneration](./examples/faster/faster_generation); also new are the 『解语』 noun-phrase tagging tool [NPTag](./examples/text_to_knowledge/nptag) and the ultra-fast compact Chinese model [PP-MiniLM](./examples/model_compression/pp-minilm)! See the [Release Note](https://github.com/PaddlePaddle/PaddleNLP/releases/tag/v2.1.0) for full upgrade details.
* [2021-12-12] PaddlePaddle's new **end-to-end question answering toolkit** 🚀 [RocketQA](https://github.com/PaddlePaddle/RocketQA) has been released! :tada:
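
A minimal sketch of the upgraded Taskflow entry point mentioned in the first news item above, assuming the `Taskflow` constructor documented in taskflow.md (the sample sentence is illustrative only):

```python
from paddlenlp import Taskflow

# Chinese word segmentation, one of the newly highlighted modes.
seg = Taskflow("word_segmentation")
print(seg("PaddleNLP是一个优秀的自然语言处理工具库"))

# Named entity recognition, the other highlighted mode; compare the
# `Taskflow("ner", home_path=...)` form shown in the taskflow.md diff below.
ner = Taskflow("ner")
print(ner("PaddleNLP是一个优秀的自然语言处理工具库"))
```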

## Introduction

2 changes: 2 additions & 0 deletions docs/model_zoo/taskflow.md
@@ -693,3 +693,5 @@ ner = Taskflow("ner", home_path="/workspace")
## References

1. [fxsjy/jieba](https://github.com/fxsjy/jieba)
2. [ZhuiyiTechnology/simbert](https://github.com/ZhuiyiTechnology/simbert)
3. [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413)
3 changes: 2 additions & 1 deletion docs/model_zoo/transformers.rst
@@ -818,14 +818,15 @@ Reference
`ymcui/Chinese-XLNet <https://github.com/ymcui/Chinese-XLNet>`_,
`huggingface/xlnet_chinese_large <https://huggingface.co/clue/xlnet_chinese_large>`_,
`Knover/luge-dialogue <https://github.com/PaddlePaddle/Knover/tree/luge-dialogue/luge-dialogue>`_,
`huawei-noah/Pretrained-Language-Model/NEZHA-PyTorch/ <https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-PyTorch>`_
`huawei-noah/Pretrained-Language-Model/NEZHA-PyTorch/ <https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-PyTorch>`_,
`ZhuiyiTechnology/simbert <https://github.com/ZhuiyiTechnology/simbert>`_
- Lan, Zhenzhong, et al. "Albert: A lite bert for self-supervised learning of language representations." arXiv preprint arXiv:1909.11942 (2019).
- Lewis, Mike, et al. "BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension." arXiv preprint arXiv:1910.13461 (2019).
- Devlin, Jacob, et al. "Bert: Pre-training of deep bidirectional transformers for language understanding." arXiv preprint arXiv:1810.04805 (2018).
- Zaheer, Manzil, et al. "Big bird: Transformers for longer sequences." arXiv preprint arXiv:2007.14062 (2020).
- Roller, Stephen, et al. "Blenderbot: Recipes for building an open-domain chatbot." arXiv preprint arXiv:2004.13637 (2020).
- Roller, Stephen, et al. "Blenderbot-Small: Recipes for building an open-domain chatbot." arXiv preprint arXiv:2004.13637 (2020).
- Zhang, Zhengyan, et al. "CPM: A Large-scale Generative Chinese Pre-trained Language Model." arXiv preprint arXiv:2012.00413 (2020).
- Jiang, Zihang, et al. "ConvBERT: Improving BERT with Span-based Dynamic Convolution." arXiv preprint arXiv:2008.02496 (2020).
- Keskar, Nitish Shirish, et al. "CTRL: A Conditional Transformer Language Model for Controllable Generation." arXiv preprint arXiv:1909.05858 (2019).
- Sanh, Victor, et al. "DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter." arXiv preprint arXiv:1910.01108 (2019).
Expand Down
16 changes: 15 additions & 1 deletion examples/dialogue/plato-xl/infer.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
import argparse
from pprint import pprint
@@ -73,6 +74,11 @@ def setup_args():
default=4,
type=int,
help="The number of candidate to procedure beam search. ")
parser.add_argument(
"--num_return_sequences",
default=1,
type=int,
help="The number of returned sequences. ")

args = parser.parse_args()

@@ -93,10 +99,17 @@ def postprocess_response(token_ids, tokenizer):


def infer(args):
if args.faster and args.use_fp16_decoding and os.getenv("PPFG_QKV_MEM_OPT",
"0") == "1":
paddle.set_default_dtype("float16")

model_name = 'plato-xl'
model = UnifiedTransformerLMHeadModel.from_pretrained(model_name)
model = UnifiedTransformerLMHeadModel.from_pretrained(
model_name, load_state_as_np=True)
tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name)

model.eval()

context = [
"Hi , Becky , what's up ?",
"Not much , except that my mother-in-law is driving me up the wall .",
@@ -136,6 +149,7 @@ def infer(args):
top_k=args.topk,
top_p=args.topp,
num_beams=args.num_beams,
num_return_sequences=args.num_return_sequences,
use_fp16_decoding=args.use_fp16_decoding,
use_faster=args.faster)

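One practical note on the new `--num_return_sequences` option: generation APIs of this kind typically stack the candidates for each input contiguously along the first axis, producing `batch_size * num_return_sequences` output sequences in total. A hedged sketch of how a caller might regroup them, with the layout treated as an assumption rather than something verified against this script:

```python
def group_candidates(ids, num_return_sequences):
    """Group flat generate() outputs back per input sample.

    Assumes `ids` holds batch_size * num_return_sequences sequences and that
    each input's candidates are stored contiguously (an assumption here).
    """
    return [
        ids[i:i + num_return_sequences]
        for i in range(0, len(ids), num_return_sequences)
    ]

# Example: with 2 inputs and num_return_sequences=3, elements 0-2 belong to
# the first input and elements 3-5 to the second.
```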
@@ -23,7 +23,7 @@
import paddle
from paddle import inference
from paddle.io import DataLoader
from paddlenlp.datasets import load_dataset
from datasets import load_dataset
from paddlenlp.data import Pad, Stack, Dict
from paddlenlp.metrics.squad import squad_evaluate, compute_prediction

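The import change above switches this script from `paddlenlp.datasets.load_dataset` to the Hugging Face `datasets.load_dataset`, and the two entry points select splits differently. A quick comparison sketch; the dataset names are illustrative and not taken from this file:

```python
# Hugging Face `datasets`: the split is chosen with `split=`.
from datasets import load_dataset
squad_dev = load_dataset("squad", split="validation")

# paddlenlp.datasets: the split is chosen with `splits=`.
from paddlenlp.datasets import load_dataset as ppnlp_load_dataset
robust_dev = ppnlp_load_dataset("dureader_robust", splits="dev")
```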
3 changes: 2 additions & 1 deletion paddlenlp/data/sampler.py
@@ -18,7 +18,6 @@
import six

import numpy as np
import paddle.distributed as dist


class SamplerHelper(object):
@@ -391,6 +390,8 @@ def __len__(self):
print(list(sampler)) # indices of dataset elements
# [0, 2]
"""
import paddle.distributed as dist

if num_replicas is None:
num_replicas = dist.get_world_size()
if rank is None:
7 changes: 6 additions & 1 deletion paddlenlp/datasets/cnn_dailymail.py
@@ -20,7 +20,12 @@

from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url, _decompress, _get_unique_endpoints
from paddle.distributed import ParallelEnv
try:
from paddle.distributed import ParallelEnv
except Exception as e:
import warnings
warnings.warn("paddle.distributed is not contains in you paddle!")

from paddlenlp.utils.env import DATA_HOME
from paddlenlp.utils.log import logger
from . import DatasetBuilder
19 changes: 13 additions & 6 deletions paddlenlp/datasets/dataset.py
@@ -26,7 +26,12 @@
import paddlenlp
import datasets

import paddle.distributed as dist
try:
import paddle.distributed as dist
except Exception as e:
import warnings
warnings.warn("paddle.distributed is not contains in you paddle!")

from paddle.io import Dataset, IterableDataset
from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url, _get_unique_endpoints
@@ -45,13 +50,15 @@

def load_from_ppnlp(path, **kwargs):
ppnlp_path = paddlenlp.datasets.__path__[0]
path = os.path.split(path)[-1]
path = os.path.join(ppnlp_path, path + '.py')
return origin_load_dataset(path, **kwargs)
new_path = os.path.split(path)[-1]
new_path = os.path.join(ppnlp_path, 'hf_datasets', new_path + '.py')
if os.path.exists(new_path):
return origin_load_dataset(new_path, **kwargs)
else:
return origin_load_dataset(path, **kwargs)


if os.environ.get('ISINTRANET', '0') == '1':
datasets.load_dataset = load_from_ppnlp
datasets.load_dataset = load_from_ppnlp


class DatasetTuple:
187 changes: 68 additions & 119 deletions paddlenlp/datasets/dureader_robust.py
@@ -1,6 +1,4 @@
# coding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,123 +12,74 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3

import collections
import json
import os

import datasets
from datasets.tasks import QuestionAnsweringExtractive

logger = datasets.logging.get_logger(__name__)

_DESCRIPTION = """\
DureaderRobust is a chinese reading comprehension \
dataset, designed to evaluate the MRC models from \
three aspects: over-sensitivity, over-stability \
and generalization.
"""

_URL = "https://bj.bcebos.com/paddlenlp/datasets/dureader_robust-data.tar.gz"


class DureaderRobustConfig(datasets.BuilderConfig):
"""BuilderConfig for DureaderRobust."""

def __init__(self, **kwargs):
"""BuilderConfig for DureaderRobust.
Args:
**kwargs: keyword arguments forwarded to super.
"""
super(DureaderRobustConfig, self).__init__(**kwargs)


class DureaderRobust(datasets.GeneratorBasedBuilder):
BUILDER_CONFIGS = [
DureaderRobustConfig(
name="plain_text",
version=datasets.Version("1.0.0", ""),
description="Plain text", ),
]

def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features({
"id": datasets.Value("string"),
"title": datasets.Value("string"),
"context": datasets.Value("string"),
"question": datasets.Value("string"),
"answers": datasets.features.Sequence({
"text": datasets.Value("string"),
"answer_start": datasets.Value("int32"),
}),
}),
# No default supervised_keys (as we have to pass both question
# and context as input).
supervised_keys=None,
homepage="https://arxiv.org/abs/2004.11142",
task_templates=[
QuestionAnsweringExtractive(
question_column="question",
context_column="context",
answers_column="answers")
], )

def _split_generators(self, dl_manager):
dl_dir = dl_manager.download_and_extract(_URL)

return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": os.path.join(dl_dir, 'dureader_robust-data',
'train.json')
}),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"filepath": os.path.join(dl_dir, 'dureader_robust-data',
'dev.json')
}),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": os.path.join(dl_dir, 'dureader_robust-data',
'test.json')
}),
]

def _generate_examples(self, filepath):
"""This function returns the examples in the raw (text) form."""
logger.info("generating examples from = %s", filepath)
key = 0
with open(filepath, encoding="utf-8") as f:
durobust = json.load(f)
for article in durobust["data"]:
title = article.get("title", "")
for paragraph in article["paragraphs"]:
context = paragraph[
"context"] # do not strip leading blank spaces GH-2585
for qa in paragraph["qas"]:
answer_starts = [
answer["answer_start"]
for answer in qa.get("answers", '')
]
answers = [
answer["text"] for answer in qa.get("answers", '')
]
# Features currently used are "context", "question", and "answers".
# Others are extracted here for the ease of future expansions.
yield key, {
"title": title,
"context": context,
"question": qa["question"],
"id": qa["id"],
"answers": {
"answer_start": answer_starts,
"text": answers,
},
}
key += 1
from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddlenlp.utils.env import DATA_HOME
from . import DatasetBuilder

__all__ = ['DuReaderRobust']


class DuReaderRobust(DatasetBuilder):
'''
The machine reading comprehension dataset (i.e. DuReader robust) is designed
to measure the robustness of a reading comprehension model, including the
over-sensitivity, over-stability and generalization ability of the model.
'''

URL = 'https://bj.bcebos.com/paddlenlp/datasets/dureader_robust-data.tar.gz'
MD5 = '82f3d191a115ec17808856866787606e'
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
SPLITS = {
'train': META_INFO(
os.path.join('dureader_robust-data', 'train.json'),
'800a3dcb742f9fdf9b11e0a83433d4be'),
'dev': META_INFO(
os.path.join('dureader_robust-data', 'dev.json'),
'ae73cec081eaa28a735204c4898a2222'),
'test': META_INFO(
os.path.join('dureader_robust-data', 'test.json'),
'e0e8aa5c7b6d11b6fc3935e29fc7746f')
}

def _get_data(self, mode, **kwargs):
default_root = os.path.join(DATA_HOME, self.__class__.__name__)
filename, data_hash = self.SPLITS[mode]
fullname = os.path.join(default_root, filename)
if not os.path.exists(fullname) or (data_hash and
not md5file(fullname) == data_hash):
get_path_from_url(self.URL, default_root, self.MD5)

return fullname

def _read(self, filename, *args):
with open(filename, "r", encoding="utf8") as f:
input_data = json.load(f)["data"]
for entry in input_data:
title = entry.get("title", "").strip()
for paragraph in entry["paragraphs"]:
context = paragraph["context"].strip()
for qa in paragraph["qas"]:
qas_id = qa["id"]
question = qa["question"].strip()
answer_starts = [
answer["answer_start"]
for answer in qa.get("answers", [])
]
answers = [
answer["text"].strip()
for answer in qa.get("answers", [])
]

yield {
'id': qas_id,
'title': title,
'context': context,
'question': question,
'answers': answers,
'answer_starts': answer_starts
}
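
As a usage sketch for the builder added above, assuming the standard `paddlenlp.datasets.load_dataset` entry point; the split names come from the `SPLITS` table and the field names from the dict yielded by `_read`:

```python
from paddlenlp.datasets import load_dataset

# Splits mirror the SPLITS table of the DuReaderRobust builder above.
train_ds, dev_ds = load_dataset("dureader_robust", splits=("train", "dev"))

example = train_ds[0]
print(example["question"], example["answers"], example["answer_starts"])
```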