From e76fa2f20eed707ae14928ea0d849d6ea059ca01 Mon Sep 17 00:00:00 2001 From: smallv0221 <33639025+smallv0221@users.noreply.github.com> Date: Wed, 3 Mar 2021 22:00:56 +0800 Subject: [PATCH] Add new glue dataset and example. update dataset. (#59) * Add new glue dataset and example. update dataset. * update load_dataet() args name --- examples/glue/run_glue.py | 132 +++---- .../DuReader-robust/run_du.py | 5 +- .../SQuAD/run_squad.py | 4 +- paddlenlp/datasets/experimental/cmrc2018.py | 3 +- paddlenlp/datasets/experimental/dataset.py | 114 ++++--- paddlenlp/datasets/experimental/drcd.py | 3 +- .../datasets/experimental/dureader_robust.py | 5 +- paddlenlp/datasets/experimental/glue.py | 322 ++++++++++++++++++ paddlenlp/datasets/experimental/ptb.py | 3 +- paddlenlp/datasets/experimental/squad.py | 3 +- paddlenlp/datasets/experimental/wmt14ende.py | 2 +- 11 files changed, 438 insertions(+), 158 deletions(-) create mode 100644 paddlenlp/datasets/experimental/glue.py diff --git a/examples/glue/run_glue.py b/examples/glue/run_glue.py index b2baee5f747c1..1ef08166d4436 100644 --- a/examples/glue/run_glue.py +++ b/examples/glue/run_glue.py @@ -26,8 +26,8 @@ from paddle.io import DataLoader from paddle.metric import Metric, Accuracy, Precision, Recall -from paddlenlp.datasets import GlueCoLA, GlueSST2, GlueMRPC, GlueSTSB, GlueQQP, GlueMNLI, GlueQNLI, GlueRTE -from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.datasets import load_dataset +from paddlenlp.data import Stack, Tuple, Pad, Dict from paddlenlp.data.sampler import SamplerHelper from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer @@ -40,14 +40,14 @@ logger = logging.getLogger(__name__) TASK_CLASSES = { - "cola": (GlueCoLA, Mcc), - "sst-2": (GlueSST2, Accuracy), - "mrpc": (GlueMRPC, AccuracyAndF1), - "sts-b": (GlueSTSB, PearsonAndSpearman), - "qqp": (GlueQQP, AccuracyAndF1), - "mnli": (GlueMNLI, Accuracy), - "qnli": (GlueQNLI, Accuracy), - "rte": (GlueRTE, Accuracy), + "cola": Mcc, + "sst-2": Accuracy, + "mrpc": AccuracyAndF1, + "sts-b": PearsonAndSpearman, + "qqp": AccuracyAndF1, + "mnli": Accuracy, + "qnli": Accuracy, + "rte": Accuracy, } MODEL_CLASSES = { @@ -211,66 +211,25 @@ def convert_example(example, max_seq_length=512, is_test=False): """convert a glue example into necessary features""" - - def _truncate_seqs(seqs, max_seq_length): - if len(seqs) == 1: # single sentence - # Account for [CLS] and [SEP] with "- 2" - seqs[0] = seqs[0][0:(max_seq_length - 2)] - else: # Sentence pair - # Account for [CLS], [SEP], [SEP] with "- 3" - tokens_a, tokens_b = seqs - max_seq_length -= 3 - while True: # Truncate with longest_first strategy - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_seq_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - return seqs - - def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1): - concat = sum((seq + sep for sep, seq in zip(separators, seqs)), []) - segment_ids = sum( - ([i] * (len(seq) + len(sep)) - for i, (sep, seq) in enumerate(zip(separators, seqs))), []) - if isinstance(seq_mask, int): - seq_mask = [[seq_mask] * len(seq) for seq in seqs] - if isinstance(separator_mask, int): - separator_mask = [[separator_mask] * len(sep) for sep in separators] - p_mask = sum((s_mask + mask - for sep, seq, s_mask, mask in zip( - separators, seqs, seq_mask, separator_mask)), []) - return concat, segment_ids, p_mask - if not is_test: 
# `label_list == None` is for regression task label_dtype = "int64" if label_list else "float32" # Get the label - label = example[-1] - example = example[:-1] - # Create label maps if classification task - if label_list: - label_map = {} - for (i, l) in enumerate(label_list): - label_map[l] = i - label = label_map[label] + label = example['labels'] label = np.array([label], dtype=label_dtype) - - # Tokenize raw text - if len(example) == 1: - example = tokenizer(example[0], max_seq_len=max_seq_length) + # Convert raw text to feature + if len(example) == 2: + example = tokenizer(example['sentence'], max_seq_len=max_seq_length) else: example = tokenizer( - example[0], text_pair=example[1], max_seq_len=max_seq_length) + example['sentence1'], + text_pair=example['sentence2'], + max_seq_len=max_seq_length) if not is_test: - return example['input_ids'], example['token_type_ids'], len(example[ - 'input_ids']), label + return example['input_ids'], example['token_type_ids'], label else: - return example['input_ids'], example['token_type_ids'], len(example[ - 'input_ids']) + return example['input_ids'], example['token_type_ids'] def do_train(args): @@ -281,69 +240,67 @@ def do_train(args): set_seed(args) args.task_name = args.task_name.lower() - dataset_class, metric_class = TASK_CLASSES[args.task_name] + metric_class = TASK_CLASSES[args.task_name] args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - train_dataset = dataset_class.get_datasets(["train"]) + train_ds = load_dataset('glue', args.task_name, splits="train") tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) trans_func = partial( convert_example, tokenizer=tokenizer, - label_list=train_dataset.get_labels(), + label_list=train_ds.label_list, max_seq_length=args.max_seq_length) - train_dataset = train_dataset.apply(trans_func, lazy=True) + train_ds = train_ds.map(trans_func, lazy=True) train_batch_sampler = paddle.io.DistributedBatchSampler( - train_dataset, batch_size=args.batch_size, shuffle=True) + train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment - Stack(), # length - Stack(dtype="int64" if train_dataset.get_labels() else "float32") # label - ): [data for i, data in enumerate(fn(samples)) if i != 2] + Stack(dtype="int64" if train_ds.label_list else "float32") # label + ): fn(samples) train_data_loader = DataLoader( - dataset=train_dataset, + dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) if args.task_name == "mnli": - dev_dataset_matched, dev_dataset_mismatched = dataset_class.get_datasets( - ["dev_matched", "dev_mismatched"]) - dev_dataset_matched = dev_dataset_matched.apply(trans_func, lazy=True) - dev_dataset_mismatched = dev_dataset_mismatched.apply( - trans_func, lazy=True) + dev_ds_matched, dev_ds_mismatched = load_dataset( + 'glue', args.task_name, splits=["dev_matched", "dev_mismatched"]) + + dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True) + dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True) dev_batch_sampler_matched = paddle.io.BatchSampler( - dev_dataset_matched, batch_size=args.batch_size, shuffle=False) + dev_ds_matched, batch_size=args.batch_size, shuffle=False) dev_data_loader_matched = DataLoader( - dataset=dev_dataset_matched, + dataset=dev_ds_matched, batch_sampler=dev_batch_sampler_matched, 
collate_fn=batchify_fn, num_workers=0, return_list=True) dev_batch_sampler_mismatched = paddle.io.BatchSampler( - dev_dataset_mismatched, batch_size=args.batch_size, shuffle=False) + dev_ds_mismatched, batch_size=args.batch_size, shuffle=False) dev_data_loader_mismatched = DataLoader( - dataset=dev_dataset_mismatched, + dataset=dev_ds_mismatched, batch_sampler=dev_batch_sampler_mismatched, collate_fn=batchify_fn, num_workers=0, return_list=True) else: - dev_dataset = dataset_class.get_datasets(["dev"]) - dev_dataset = dev_dataset.apply(trans_func, lazy=True) + dev_ds = load_dataset('glue', args.task_name, splits='dev') + dev_ds = dev_ds.map(trans_func, lazy=True) dev_batch_sampler = paddle.io.BatchSampler( - dev_dataset, batch_size=args.batch_size, shuffle=False) + dev_ds, batch_size=args.batch_size, shuffle=False) dev_data_loader = DataLoader( - dataset=dev_dataset, + dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) - num_classes = 1 if train_dataset.get_labels() == None else len( - train_dataset.get_labels()) + num_classes = 1 if train_ds.label_list == None else len(train_ds.label_list) model = model_class.from_pretrained( args.model_name_or_path, num_classes=num_classes) if paddle.distributed.get_world_size() > 1: @@ -368,8 +325,8 @@ def do_train(args): if not any(nd in n for nd in ["bias", "norm"]) ]) - loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_dataset.get_labels( - ) else paddle.nn.loss.MSELoss() + loss_fct = paddle.nn.loss.CrossEntropyLoss( + ) if train_ds.label_list else paddle.nn.loss.MSELoss() metric = metric_class() @@ -378,6 +335,7 @@ def do_train(args): for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 + input_ids, segment_ids, labels = batch logits = model(input_ids, segment_ids) loss = loss_fct(logits, labels) @@ -392,7 +350,7 @@ def do_train(args): paddle.distributed.get_rank(), loss, optimizer.get_lr(), args.logging_steps / (time.time() - tic_train))) tic_train = time.time() - if global_step % args.save_steps == 0: + if global_step % args.save_steps == 0 or global_step == num_training_steps: tic_eval = time.time() if args.task_name == "mnli": evaluate(model, loss_fct, metric, dev_data_loader_matched) diff --git a/examples/machine_reading_comprehension/DuReader-robust/run_du.py b/examples/machine_reading_comprehension/DuReader-robust/run_du.py index d139de74cf8b4..6c70f82428a13 100644 --- a/examples/machine_reading_comprehension/DuReader-robust/run_du.py +++ b/examples/machine_reading_comprehension/DuReader-robust/run_du.py @@ -123,7 +123,6 @@ def prepare_train_features(examples): questions, contexts, stride=args.doc_stride, - pad_to_max_seq_len=True, max_seq_len=args.max_seq_length) # Let's label those examples! @@ -154,9 +153,11 @@ def prepare_train_features(examples): token_start_index += 1 # End token index of the current span in the text. - token_end_index = len(input_ids) - 2 + token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != 1: token_end_index -= 1 + # Minus one more to reach actual text + token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 
if not (offsets[token_start_index][0] <= start_char and diff --git a/examples/machine_reading_comprehension/SQuAD/run_squad.py b/examples/machine_reading_comprehension/SQuAD/run_squad.py index 579d6687234fb..800ff8a5ee89d 100644 --- a/examples/machine_reading_comprehension/SQuAD/run_squad.py +++ b/examples/machine_reading_comprehension/SQuAD/run_squad.py @@ -158,9 +158,11 @@ def prepare_train_features(examples): token_start_index += 1 # End token index of the current span in the text. - token_end_index = len(input_ids) - 2 + token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != 1: token_end_index -= 1 + # Minus one more to reach actual text + token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and diff --git a/paddlenlp/datasets/experimental/cmrc2018.py b/paddlenlp/datasets/experimental/cmrc2018.py index d7312c5c6f76e..cc58a5caedb90 100644 --- a/paddlenlp/datasets/experimental/cmrc2018.py +++ b/paddlenlp/datasets/experimental/cmrc2018.py @@ -31,11 +31,10 @@ def _get_data(self, mode, **kwargs): if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): get_path_from_url(URL, default_root) - fullname = os.path.join(default_root, filename) return fullname - def _read(self, filename): + def _read(self, filename, *args): with open(filename, "r", encoding="utf8") as f: input_data = json.load(f)["data"] for entry in input_data: diff --git a/paddlenlp/datasets/experimental/dataset.py b/paddlenlp/datasets/experimental/dataset.py index 7f8616e679f95..6655b0a363747 100644 --- a/paddlenlp/datasets/experimental/dataset.py +++ b/paddlenlp/datasets/experimental/dataset.py @@ -53,11 +53,14 @@ def import_main_class(module_path): return module_main_cls -def load_dataset(name, data_files=None, splits=None, lazy=None): - module_path = DATASETS_MODULE_PATH + name +def load_dataset(path, name=None, data_files=None, splits=None, lazy=None): + module_path = DATASETS_MODULE_PATH + path reader_cls = import_main_class(module_path) - reader_instance = reader_cls(lazy) + if not name: + reader_instance = reader_cls(lazy=lazy) + else: + reader_instance = reader_cls(lazy=lazy, name=name) datasets = reader_instance.read_datasets( data_files=data_files, splits=splits) @@ -317,25 +320,28 @@ class DatasetBuilder: """ lazy = False - def __init__(self, lazy=None, max_examples: Optional[int]=None): + def __init__(self, lazy=None, name=None): if lazy is not None: self.lazy = lazy - self.max_examples = max_examples + self.name = name def read_datasets(self, splits=None, data_files=None): datasets = [] assert splits or data_files, "`data_files` and `splits` can not both be None." if data_files: - assert isinstance(data_files, str) or ( - isinstance(data_files, list) and isinstance(data_files[0], str) - ) or ( - isinstance(data_files, tuple) and isinstance(data_files[0], str) - ), "`data_files` should be a string or list of string or a tuple of string." + assert isinstance(data_files, str) or isinstance( + data_files, dict + ), "`data_files` should be a string or a dictionary whose key is split name ande value is a path of data file." 
if isinstance(data_files, str): - datasets.append(self.read(data_files)) + split = 'train' + datasets.append(self.read(filename=data_files, split=split)) else: - datasets += [self.read(data_file) for data_file in data_files] + datasets += [ + self.read( + filename=filename, split=split) + for split, filename in data_files.items() + ] if splits: assert isinstance(splits, str) or ( @@ -344,16 +350,16 @@ def read_datasets(self, splits=None, data_files=None): isinstance(splits, tuple) and isinstance(splits[0], str) ), "`splits` should be a string or list of string or a tuple of string." if isinstance(splits, str): - root = self._get_data(splits) - datasets.append(self.read(root)) + filename = self._get_data(splits) + datasets.append(self.read(filename=filename, split=splits)) else: for split in splits: - root = self._get_data(split) - datasets.append(self.read(root)) + filename = self._get_data(split) + datasets.append(self.read(filename=filename, split=split)) return datasets if len(datasets) > 1 else datasets[0] - def read(self, root): + def read(self, filename, split='train'): """ Returns an dataset containing all the examples that can be read from the file path. If `self.lazy` is `False`, this eagerly reads all instances from `self._read()` @@ -367,35 +373,34 @@ def read(self, root): if self.lazy: label_list = self.get_labels() - if label_list is not None: - label_dict = {} - for i, label in enumerate(label_list): - label_dict[label] = i - - def generate_examples(): - for example in self._read(root): - if 'labels' not in example.keys(): - raise ValueError( - "Keyword 'labels' should be in example if get_label() is specified." - ) - else: + def generate_examples(): + generator = self._read( + filename, split + ) if self._read.__code__.co_argcount > 2 else self._read( + filename) + for example in generator: + if label_list is not None and 'labels' in example.keys(): + label_dict = {} + for i, label in enumerate(label_list): + label_dict[label] = i + if isinstance(example['labels'], list) or isinstance( + examples[idx]['labels'], tuple): for label_idx in range(len(example['labels'])): example['labels'][label_idx] = label_dict[ example['labels'][label_idx]] + else: + example['labels'] = label_dict[example['labels']] - yield example - - return IterDataset(generate_examples, label_list=label_list) - else: - - def generate_examples(): - for example in self._read(root): + yield example + else: yield example - return IterDataset(generate_examples) - + return IterDataset(generate_examples, label_list=label_list) else: - examples = self._read(root) + examples = self._read( + filename, + split) if self._read.__code__.co_argcount > 2 else self._read( + filename) # Then some validation. if not isinstance(examples, list): @@ -404,30 +409,27 @@ def generate_examples(): if not examples: raise ValueError( "No instances were read from the given filepath {}. " - "Is the path correct?".format(root)) + "Is the path correct?".format(filename)) label_list = self.get_labels() # Convert class label to label ids. - if label_list is not None: - if 'labels' not in examples[0].keys(): - raise ValueError( - "Key 'labels' should be in example if get_label() is specified." 
- ) - + if label_list is not None and 'labels' in examples[0].keys(): label_dict = {} for i, label in enumerate(label_list): label_dict[label] = i - for idx in range(len(examples)): - for label_idx in range(len(examples[idx]['labels'])): - examples[idx]['labels'][label_idx] = label_dict[ - examples[idx]['labels'][label_idx]] - - return MapDataset( - examples, - label_list=label_list) if label_list else MapDataset(examples) - - def _read(self, file_path: str): + if isinstance(examples[idx]['labels'], list) or isinstance( + examples[idx]['labels'], tuple): + for label_idx in range(len(examples[idx]['labels'])): + examples[idx]['labels'][label_idx] = label_dict[ + examples[idx]['labels'][label_idx]] + else: + examples[idx]['labels'] = label_dict[examples[idx][ + 'labels']] + + return MapDataset(examples, label_list=label_list) + + def _read(self, filename: str, *args): """ Reads examples from the given file_path and returns them as an `Iterable` (which could be a list or could be a generator). diff --git a/paddlenlp/datasets/experimental/drcd.py b/paddlenlp/datasets/experimental/drcd.py index cacb8cc6a78ab..d67a9597aa4d0 100644 --- a/paddlenlp/datasets/experimental/drcd.py +++ b/paddlenlp/datasets/experimental/drcd.py @@ -31,11 +31,10 @@ def _get_data(self, mode, **kwargs): if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): get_path_from_url(URL, default_root) - fullname = os.path.join(default_root, filename) return fullname - def _read(self, filename): + def _read(self, filename, *args): with open(filename, "r", encoding="utf8") as f: input_data = json.load(f)["data"] for entry in input_data: diff --git a/paddlenlp/datasets/experimental/dureader_robust.py b/paddlenlp/datasets/experimental/dureader_robust.py index a22b0a4a4bb61..e1b699d2c4ee8 100644 --- a/paddlenlp/datasets/experimental/dureader_robust.py +++ b/paddlenlp/datasets/experimental/dureader_robust.py @@ -32,12 +32,11 @@ def _get_data(self, mode, **kwargs): fullname = os.path.join(default_root, filename) if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): - get_path_from_url(self.URL, default_root) - fullname = os.path.join(default_root, filename) + get_path_from_url(self.URL, default_root, self.MD5) return fullname - def _read(self, filename): + def _read(self, filename, *args): with open(filename, "r", encoding="utf8") as f: input_data = json.load(f)["data"] for entry in input_data: diff --git a/paddlenlp/datasets/experimental/glue.py b/paddlenlp/datasets/experimental/glue.py new file mode 100644 index 0000000000000..328ba3ccdd425 --- /dev/null +++ b/paddlenlp/datasets/experimental/glue.py @@ -0,0 +1,322 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url +from paddlenlp.utils.env import DATA_HOME +from . 
import DatasetBuilder + + +class Glue(DatasetBuilder): + BUILDER_CONFIGS = { + 'cola': { + 'url': "https://dataset.bj.bcebos.com/glue/CoLA.zip", + 'md5': 'b178a7c2f397b0433c39c7caf50a3543', + 'splits': { + 'train': [ + os.path.join('CoLA', 'train.tsv'), + 'c79d4693b8681800338aa044bf9e797b', (3, 1), 0 + ], + 'dev': [ + os.path.join('CoLA', 'dev.tsv'), + 'c5475ccefc9e7ca0917294b8bbda783c', (3, 1), 0 + ], + 'test': [ + os.path.join('CoLA', 'test.tsv'), + 'd8721b7dedda0dcca73cebb2a9f4259f', (1, ), 1 + ] + }, + 'labels': ["0", "1"] + }, + 'sst-2': { + 'url': "https://dataset.bj.bcebos.com/glue/SST.zip", + 'md5': '9f81648d4199384278b86e315dac217c', + 'splits': { + 'train': [ + os.path.join('SST-2', 'train.tsv'), + 'da409a0a939379ed32a470bc0f7fe99a', (0, 1), 1 + ], + 'dev': [ + os.path.join('SST-2', 'dev.tsv'), + '268856b487b2a31a28c0a93daaff7288', (0, 1), 1 + ], + 'test': [ + os.path.join('SST-2', 'test.tsv'), + '3230e4efec76488b87877a56ae49675a', (1, ), 1 + ] + }, + 'labels': ["0", "1"] + }, + 'sts-b': { + 'url': 'https://dataset.bj.bcebos.com/glue/STS.zip', + 'md5': 'd573676be38f1a075a5702b90ceab3de', + 'splits': { + 'train': [ + os.path.join('STS-B', 'train.tsv'), + '4f7a86dde15fe4832c18e5b970998672', (7, 8, 9), 1 + ], + 'dev': [ + os.path.join('STS-B', 'dev.tsv'), + '5f4d6b0d2a5f268b1b56db773ab2f1fe', (7, 8, 9), 1 + ], + 'test': [ + os.path.join('STS-B', 'test.tsv'), + '339b5817e414d19d9bb5f593dd94249c', (7, 8), 1 + ] + }, + 'labels': None + }, + 'qqp': { + 'url': 'https://dataset.bj.bcebos.com/glue/QQP.zip', + 'md5': '884bf26e39c783d757acc510a2a516ef', + 'splits': { + 'train': [ + os.path.join('QQP', 'train.tsv'), + 'e003db73d277d38bbd83a2ef15beb442', (3, 4, 5), 1 + ], + 'dev': [ + os.path.join('QQP', 'dev.tsv'), + 'cff6a448d1580132367c22fc449ec214', (3, 4, 5), 1 + ], + 'test': [ + os.path.join('QQP', 'test.tsv'), + '73de726db186b1b08f071364b2bb96d0', (1, 2), 1 + ] + }, + 'labels': ["0", "1"] + }, + 'mnli': { + 'url': 'https://dataset.bj.bcebos.com/glue/MNLI.zip', + 'md5': 'e343b4bdf53f927436d0792203b9b9ff', + 'splits': { + 'train': [ + os.path.join('MNLI', 'train.tsv'), + '220192295e23b6705f3545168272c740', (8, 9, 11), 1 + ], + 'dev_matched': [ + os.path.join('MNLI', 'dev_matched.tsv'), + 'c3fa2817007f4cdf1a03663611a8ad23', (8, 9, 15), 1 + ], + 'dev_mismatched': [ + os.path.join('MNLI', 'dev_mismatched.tsv'), + 'b219e6fe74e4aa779e2f417ffe713053', (8, 9, 15), 1 + ], + 'test_matched': [ + os.path.join('MNLI', 'test_matched.tsv'), + '33ea0389aedda8a43dabc9b3579684d9', (8, 9), 1 + ], + 'test_mismatched': [ + os.path.join('MNLI', 'test_mismatched.tsv'), + '7d2f60a73d54f30d8a65e474b615aeb6', (8, 9), 1 + ] + }, + 'labels': ["contradiction", "entailment", "neutral"] + }, + 'qnli': { + 'url': 'https://dataset.bj.bcebos.com/glue/QNLI.zip', + 'md5': 'b4efd6554440de1712e9b54e14760e82', + 'splits': { + 'train': [ + os.path.join('QNLI', 'train.tsv'), + '5e6063f407b08d1f7c7074d049ace94a', (1, 2, 3), 1 + ], + 'dev': [ + os.path.join('QNLI', 'dev.tsv'), + '1e81e211959605f144ba6c0ad7dc948b', (1, 2, 3), 1 + ], + 'test': [ + os.path.join('QNLI', 'test.tsv'), + 'f2a29f83f3fe1a9c049777822b7fa8b0', (1, 2), 1 + ] + }, + 'labels': ["entailment", "not_entailment"] + }, + 'rte': { + 'url': 'https://dataset.bj.bcebos.com/glue/RTE.zip', + 'md5': 'bef554d0cafd4ab6743488101c638539', + 'splits': { + 'train': [ + os.path.join('RTE', 'train.tsv'), + 'd2844f558d111a16503144bb37a8165f', (1, 2, 3), 1 + ], + 'dev': [ + os.path.join('RTE', 'dev.tsv'), + '973cb4178d4534cf745a01c309d4a66c', (1, 2, 3), 1 + ], + 'test': [ + 
os.path.join('RTE', 'test.tsv'), + '6041008f3f3e48704f57ce1b88ad2e74', (1, 2), 1 + ] + }, + 'labels': ["entailment", "not_entailment"] + }, + 'wnli': { + 'url': 'https://dataset.bj.bcebos.com/glue/WNLI.zip', + 'md5': 'a1b4bd2861017d302d29e42139657a42', + 'splits': { + 'train': [ + os.path.join('WNLI', 'train.tsv'), + '5cdc5a87b7be0c87a6363fa6a5481fc1', (1, 2, 3), 1 + ], + 'dev': [ + os.path.join('WNLI', 'dev.tsv'), + 'a79a6dd5d71287bcad6824c892e517ee', (1, 2, 3), 1 + ], + 'test': [ + os.path.join('WNLI', 'test.tsv'), + 'a18789ba4f60f6fdc8cb4237e4ba24b5', (1, 2), 1 + ] + }, + 'labels': ["0", "1"] + }, + 'mrpc': { + 'url': { + 'train_data': + 'https://dataset.bj.bcebos.com/glue/mrpc/msr_paraphrase_train.txt', + 'dev_id': 'https://dataset.bj.bcebos.com/glue/mrpc/dev_ids.tsv', + 'test_data': + 'https://dataset.bj.bcebos.com/glue/mrpc/msr_paraphrase_test.txt' + }, + 'md5': { + 'train_data': '793daf7b6224281e75fe61c1f80afe35', + 'dev_id': '7ab59a1b04bd7cb773f98a0717106c9b', + 'test_data': 'e437fdddb92535b820fe8852e2df8a49' + }, + 'splits': { + 'train': [ + os.path.join('MRPC', 'train.tsv'), + 'dc2dac669a113866a6480a0b10cd50bf', (3, 4, 0), 1 + ], + 'dev': [ + os.path.join('MRPC', 'dev.tsv'), + '185958e46ba556b38c6a7cc63f3a2135', (3, 4, 0), 1 + ], + 'test': [ + os.path.join('MRPC', 'test.tsv'), + '4825dab4b4832f81455719660b608de5', (3, 4), 1 + ] + }, + 'labels': ["0", "1"] + } + } + + def _get_data(self, mode, **kwargs): + builder_config = self.BUILDER_CONFIGS[self.name] + if self.name != 'mrpc': + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, _, _ = builder_config['splits'][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or ( + data_hash and not md5file(fullname) == data_hash): + get_path_from_url(builder_config['url'], default_root, + builder_config['md5']) + + else: + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, _, _ = builder_config['splits'][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or ( + data_hash and not md5file(fullname) == data_hash): + if mode in ('train', 'dev'): + dev_id_path = get_path_from_url( + builder_config['url']['dev_id'], + os.path.join(default_root, 'MRPC'), + builder_config['md5']['dev_id']) + train_data_path = get_path_from_url( + builder_config['url']['train_data'], + os.path.join(default_root, 'MRPC'), + builder_config['md5']['train_data']) + # read dev data ids + dev_ids = [] + print(dev_id_path) + with open(dev_id_path, encoding='utf-8') as ids_fh: + for row in ids_fh: + dev_ids.append(row.strip().split('\t')) + + # generate train and dev set + train_path = os.path.join(default_root, 'MRPC', 'train.tsv') + dev_path = os.path.join(default_root, 'MRPC', 'dev.tsv') + with open(train_data_path, encoding='utf-8') as data_fh: + with open( + train_path, 'w', encoding='utf-8') as train_fh: + with open(dev_path, 'w', encoding='utf8') as dev_fh: + header = data_fh.readline() + train_fh.write(header) + dev_fh.write(header) + for row in data_fh: + label, id1, id2, s1, s2 = row.strip().split( + '\t') + example = '%s\t%s\t%s\t%s\t%s\n' % ( + label, id1, id2, s1, s2) + if [id1, id2] in dev_ids: + dev_fh.write(example) + else: + train_fh.write(example) + + else: + test_data_path = get_path_from_url( + builder_config['url']['test_data'], + os.path.join(default_root, 'MRPC'), + builder_config['md5']['test_data']) + test_path = os.path.join(default_root, 'MRPC', 'test.tsv') + with open(test_data_path, 
encoding='utf-8') as data_fh: + with open(test_path, 'w', encoding='utf-8') as test_fh: + header = data_fh.readline() + test_fh.write( + 'index\t#1 ID\t#2 ID\t#1 String\t#2 String\n') + for idx, row in enumerate(data_fh): + label, id1, id2, s1, s2 = row.strip().split( + '\t') + test_fh.write('%d\t%s\t%s\t%s\t%s\n' % + (idx, id1, id2, s1, s2)) + + return fullname + + def _read(self, filename, split): + _, _, field_indices, num_discard_samples = self.BUILDER_CONFIGS[ + self.name]['splits'][split] + with open(filename, 'r', encoding='utf-8') as f: + for idx, line in enumerate(f): + if idx < num_discard_samples: + continue + line_stripped = line.strip().split('\t') + if not line_stripped: + break + example = [line_stripped[indice] for indice in field_indices] + if self.name in ['cola', 'sst-2']: + yield { + 'sentence': example[0] + } if 'test' in split else { + 'sentence': example[0], + 'labels': example[-1] + } + else: + yield { + 'sentence1': example[0], + 'sentence2': example[1] + } if 'test' in split else { + 'sentence1': example[0], + 'sentence2': example[1], + 'labels': example[-1] + } + + def get_labels(self): + """ + Return labels of the Glue task. + """ + return self.BUILDER_CONFIGS[self.name]['labels'] diff --git a/paddlenlp/datasets/experimental/ptb.py b/paddlenlp/datasets/experimental/ptb.py index adceb57510443..28541579d68f2 100644 --- a/paddlenlp/datasets/experimental/ptb.py +++ b/paddlenlp/datasets/experimental/ptb.py @@ -35,11 +35,10 @@ def _get_data(self, mode, **kwargs): not md5file(fullname) == data_hash): get_path_from_url(self.URL, default_root, self.MD5) - fullname = os.path.join(default_root, filename) return fullname - def _read(self, filename): + def _read(self, filename, *args): with open(filename, 'r', encoding='utf-8') as f: for line in f: line_stripped = line.strip() diff --git a/paddlenlp/datasets/experimental/squad.py b/paddlenlp/datasets/experimental/squad.py index 1949f75b33b8b..50d47adccc252 100644 --- a/paddlenlp/datasets/experimental/squad.py +++ b/paddlenlp/datasets/experimental/squad.py @@ -34,11 +34,10 @@ def _get_data(self, mode, **kwargs): if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): get_path_from_url(URL, default_root) - fullname = os.path.join(default_root, filename) return fullname - def _read(self, filename): + def _read(self, filename, *args): with open(filename, "r", encoding="utf8") as f: input_data = json.load(f)["data"] for entry in input_data: diff --git a/paddlenlp/datasets/experimental/wmt14ende.py b/paddlenlp/datasets/experimental/wmt14ende.py index 74551fecdd87d..5b1effec7946f 100644 --- a/paddlenlp/datasets/experimental/wmt14ende.py +++ b/paddlenlp/datasets/experimental/wmt14ende.py @@ -82,7 +82,7 @@ def _get_data(self, mode, **kwargs): return src_fullname, tgt_fullname - def _read(self, filename): + def _read(self, filename, *args): src_filename, tgt_filename = filename with open(src_filename, 'r', encoding='utf-8') as src_f: with open(tgt_filename, 'r', encoding='utf-8') as tgt_f:
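
The run_glue.py changes above replace the per-task dataset classes (GlueSST2, GlueMRPC, ...) with the generic load_dataset() entry point backed by the new glue.py builder. A minimal sketch of the intended call pattern follows, assuming this patch is applied; the 'bert-base-uncased' checkpoint and the max_seq_length value are illustrative choices, not prescribed by the patch, while load_dataset(), splits, map(), label_list and the 'sentence'/'labels' fields all come from the code above.

    from paddlenlp.datasets import load_dataset
    from paddlenlp.transformers import BertTokenizer

    # `path` selects the reader module, `name` the GLUE task; passing a list
    # of splits returns one dataset per split (see read_datasets() above).
    train_ds, dev_ds = load_dataset('glue', name='sst-2', splits=['train', 'dev'])

    # Checkpoint name is illustrative only.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def trans_func(example, max_seq_length=128):
        # SST-2 examples are {'sentence': ..., 'labels': ...}; labels are
        # already mapped to ids by the builder because get_labels() is defined.
        feature = tokenizer(example['sentence'], max_seq_len=max_seq_length)
        return feature['input_ids'], feature['token_type_ids'], example['labels']

    # map() replaces the old apply(); lazy=True defers conversion to iteration time.
    train_ds = train_ds.map(trans_func, lazy=True)
    dev_ds = dev_ds.map(trans_func, lazy=True)

    # label_list replaces get_labels() when sizing the classification head;
    # it is None for the regression task (sts-b).
    num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)

With splits given as a list, load_dataset() returns one dataset per split, and the label_list attribute on the returned dataset takes over the role of the old get_labels() call both for choosing the loss (CrossEntropyLoss vs. MSELoss) and for sizing the model's output layer, as shown in the do_train() changes above.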