diff --git a/examples/language_model/mpnet/README.md b/examples/language_model/mpnet/README.md index 464a7fbecefc..f959121c6963 100644 --- a/examples/language_model/mpnet/README.md +++ b/examples/language_model/mpnet/README.md @@ -17,15 +17,14 @@ BERT采用掩码语言建模(MLM)进行预训练,是最成功的预训练 ##### (1)模型微调: ```shell unset CUDA_VISIBLE_DEVICES -cd glue python -m paddle.distributed.launch --gpus "0" run_glue.py \ --model_type mpnet \ --model_name_or_path mpnet-base \ --task_name qqp \ --max_seq_length 128 \ - --batch_size 32 \ + --per_device_train_batch_size 32 \ --learning_rate 1e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --weight_decay 0.1 \ --warmup_steps 5666 \ --max_steps 113272 \ @@ -33,6 +32,8 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ --save_steps 2000 \ --seed 42 \ --output_dir qqp/ \ + --do_train \ + --do_eval \ --device gpu ``` 其中参数释义如下: @@ -40,14 +41,17 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ - `model_name_or_path` 模型名称或者路径,其中mpnet模型当前仅支持mpnet-base几种规格。 - `task_name` 表示 Fine-tuning 的任务,当前支持CoLA、SST-2、MRPC、STS-B、QQP、MNLI、QNLI、RTE和WNLI。 - `max_seq_length` 表示最大句子长度,超过该长度将被截断。 -- `batch_size` 表示每次迭代**每张卡**上的样本数目。 +- `per_device_train_batch_size` 表示每次迭代**每张卡**上的样本数目。 - `learning_rate` 表示基础学习率大小,将于learning rate scheduler产生的值相乘作为当前学习率。 -- `scheduler_type` scheduler类型,可选linear和cosine。 +- `lr_scheduler_type` scheduler类型,可选linear和cosine。 +- `weight_decay` 权重衰减比例。 - `warmup_steps` warmup步数。 - `max_steps` 表示最大训练步数。 - `logging_steps` 表示日志打印间隔。 - `save_steps` 表示模型保存及评估间隔。 - `output_dir` 表示模型保存路径。 +- `do_train` 表示是否需要训练。 +- `do_eval` 表示是否需要评测。 - `device` 表示使用的设备类型。默认为GPU,可以配置为CPU、GPU、XPU。若希望使用多GPU训练,将其设置为GPU,同时环境变量CUDA_VISIBLE_DEVICES配置要使用的GPU id。 ##### (2)模型预测: @@ -81,24 +85,23 @@ python run_predict.py --task_name qqp --ckpt_path qqp/best-qqp_ft_model_106000. ```bash unset CUDA_VISIBLE_DEVICES -cd squad python -m paddle.distributed.launch --gpus "0" run_squad.py \ --model_type mpnet \ --model_name_or_path mpnet-base \ --max_seq_length 512 \ - --batch_size 16 \ + --per_device_train_batch_size 16 \ --learning_rate 2e-5 \ --num_train_epochs 4 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --logging_steps 25 \ --save_steps 25 \ - --warmup_proportion 0.1 \ + --warmup_ratio 0.1 \ --weight_decay 0.1 \ --output_dir squad1.1/ \ --device gpu \ --do_train \ - --seed 42 \ - --do_predict + --do_eval \ + --seed 42 ``` 训练过程中模型会自动对结果进行评估,其中最好的结果如下所示: @@ -124,19 +127,19 @@ python -m paddle.distributed.launch --gpus "0" run_squad.py \ --model_type mpnet \ --model_name_or_path mpnet-base \ --max_seq_length 512 \ - --batch_size 16 \ + --per_device_train_batch_size 16 \ --learning_rate 2e-5 \ --num_train_epochs 4 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --logging_steps 200 \ --save_steps 200 \ - --warmup_proportion 0.1 \ + --warmup_ratio 0.1 \ --weight_decay 0.1 \ --output_dir squad2/ \ --device gpu \ --do_train \ + --do_eval \ --seed 42 \ - --do_predict \ --version_2_with_negative ``` diff --git a/examples/language_model/mpnet/glue/run_glue.py b/examples/language_model/mpnet/glue/run_glue.py index bde5fb98b8a4..742c43b7dba7 100644 --- a/examples/language_model/mpnet/glue/run_glue.py +++ b/examples/language_model/mpnet/glue/run_glue.py @@ -12,45 +12,196 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import argparse -import json -import logging import math -import os -import random import time from collections import OrderedDict +from dataclasses import dataclass, field from functools import partial +from typing import Dict, List, Optional import numpy as np import paddle -from paddle.io import DataLoader +from paddle.io import DataLoader, Dataset from paddle.metric import Accuracy -from paddlenlp.data import Pad, Stack, Tuple +from paddlenlp.data import Pad, Stack from paddlenlp.datasets import load_dataset from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman +from paddlenlp.trainer import PdArgumentParser, Trainer, TrainingArguments, set_seed +from paddlenlp.trainer.trainer_utils import speed_metrics from paddlenlp.transformers import ( BertForSequenceClassification, BertTokenizer, - CosineDecayWithWarmup, ElectraForSequenceClassification, ElectraTokenizer, ErnieForSequenceClassification, ErnieTokenizer, - LinearDecayWithWarmup, MPNetForSequenceClassification, MPNetTokenizer, ) -FORMAT = "%(asctime)s-%(levelname)s: %(message)s" -logging.basicConfig(level=logging.INFO, format=FORMAT) -logger = logging.getLogger(__name__) +class MPNetTrainer(Trainer): + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an `datasets.Dataset`, columns not + accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is "eval" (default) + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. 
+ """ + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + trans_func = partial( + convert_example, + tokenizer=self.tokenizer, + label_list=self.args.label_list, + max_seq_length=self.args.max_seq_length, + ) + if self.args.task_name == "mnli": + dev_ds_matched, dev_ds_mismatched = load_dataset( + "glue", self.args.task_name, splits=["dev_matched", "dev_mismatched"] + ) + + dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True) + dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True) + dev_batch_sampler_matched = paddle.io.BatchSampler( + dev_ds_matched, batch_size=self.args.per_device_eval_batch_size * 2, shuffle=False + ) + dev_data_loader_matched = DataLoader( + dataset=dev_ds_matched, + batch_sampler=dev_batch_sampler_matched, + collate_fn=self.data_collator, + num_workers=2, + return_list=True, + ) + dev_batch_sampler_mismatched = paddle.io.BatchSampler( + dev_ds_mismatched, batch_size=self.args.per_device_eval_batch_size * 2, shuffle=False + ) + dev_data_loader_mismatched = DataLoader( + dataset=dev_ds_mismatched, + batch_sampler=dev_batch_sampler_mismatched, + collate_fn=self.data_collator, + num_workers=2, + return_list=True, + ) + else: + dev_ds = load_dataset("glue", self.args.task_name, splits="dev") + dev_ds = dev_ds.map(trans_func, lazy=True) + dev_batch_sampler = paddle.io.BatchSampler( + dev_ds, batch_size=self.args.per_device_eval_batch_size * 2, shuffle=False + ) + dev_data_loader = DataLoader( + dataset=dev_ds, + batch_sampler=dev_batch_sampler, + collate_fn=self.data_collator, + num_workers=2, + return_list=True, + ) + + start_time = time.time() + + if self.args.task_name == "mnli": + output = self.evaluation_loop( + dev_data_loader_matched, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + + total_batch_size = self.args.eval_batch_size * self.args.dataset_world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + + self.log(output.metrics) + + output = self.evaluation_loop( + dev_data_loader_mismatched, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + + total_batch_size = self.args.eval_batch_size * self.args.dataset_world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + + self.log(output.metrics) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) + + self._memory_tracker.stop_and_update_metrics(output.metrics) + + return output.metrics + else: + output = self.evaluation_loop( + dev_data_loader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + + total_batch_size = 
self.args.eval_batch_size * self.args.dataset_world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) -def save_json(data, file_path): - with open(str(file_path), "w", encoding="utf8") as f: - json.dump(data, f, ensure_ascii=False) + self.log(output.metrics) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) + + self._memory_tracker.stop_and_update_metrics(output.metrics) + + return output.metrics METRIC_CLASSES = { @@ -73,176 +224,83 @@ def save_json(data, file_path): } -def parse_args(): - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--task_name", +@dataclass +class ModelArguments: + max_seq_length: Optional[int] = field( + default=128, + metadata={ + "help": ( + "The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded." + ) + }, + ) + task_name: Optional[str] = field( default=None, - type=str, - required=True, - help="The name of the task to train selected in the list: " + ", ".join(METRIC_CLASSES.keys()), + metadata={"help": ("The name of the task to train selected in the list: " + ", ".join(METRIC_CLASSES.keys()))}, ) - parser.add_argument( - "--model_type", + model_type: Optional[str] = field( default="convbert", - type=str, - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), + metadata={"help": ("Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))}, ) - parser.add_argument( - "--model_name_or_path", + model_name_or_path: Optional[str] = field( default="convbert-base", - type=str, - help="Path to pre-trained model or shortcut name selected in the list: " - + ", ".join( - sum( - [list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], - [], + metadata={ + "help": ( + "Path to pre-trained model or shortcut name selected in the list: " + + ", ".join( + sum( + [list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], + [], + ) + ), ) - ), - ) - parser.add_argument( - "--output_dir", - default=None, - type=str, - help="The output directory where the model predictions and checkpoints will be written.", - ) - parser.add_argument( - "--scheduler_type", - default="linear", - type=str, - help="scheduler_type.", - ) - parser.add_argument( - "--max_seq_length", - default=128, - type=int, - help="The maximum total input sequence length after tokenization. 
Sequences longer " - "than this will be truncated, sequences shorter will be padded.", - ) - parser.add_argument( - "--learning_rate", - default=1e-4, - type=float, - help="The initial learning rate for Adam.", + }, ) - parser.add_argument("--layer_lr_decay", default=1.0, type=float, help="layer_lr_decay") - parser.add_argument( - "--num_train_epochs", - default=3, - type=int, - help="Total number of training epochs to perform.", + layer_lr_decay: Optional[float] = field( + default=1.0, + metadata={"help": ("layer_lr_decay")}, ) - parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.") - parser.add_argument( - "--save_steps", - type=int, - default=100, - help="Save checkpoint every X updates steps.", - ) - parser.add_argument( - "--batch_size", - default=32, - type=int, - help="Batch size per GPU/CPU for training.", - ) - parser.add_argument( - "--weight_decay", - default=0.01, - type=float, - help="Weight decay if we apply some.", - ) - parser.add_argument( - "--warmup_steps", - default=0, - type=int, - help="Linear warmup over warmup_steps. If > 0: Override warmup_proportion", - ) - parser.add_argument( - "--warmup_proportion", - default=0.1, - type=float, - help="Linear warmup proportion over total steps.", - ) - parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") - parser.add_argument( - "--max_steps", - default=-1, - type=int, - help="If > 0: set total number of training steps to perform. Override num_train_epochs.", - ) - parser.add_argument("--seed", default=42, type=int, help="random seed for initialization") - parser.add_argument( - "--device", - default="gpu", - type=str, - choices=["cpu", "gpu"], - help="The device to select to train the model, is must be cpu/gpu.", - ) - args = parser.parse_args() - return args -def _get_layer_lr_radios(layer_decay=0.8, n_layers=12): - """Have lower learning rates for layers closer to the input.""" - key_to_depths = OrderedDict( - { - "mpnet.embeddings.": 0, - "mpnet.encoder.relative_attention_bias.": 0, - "mpnet.pooler.": n_layers + 2, - "mpnet.classifier.": n_layers + 2, - } +@dataclass +class DataArguments: + data_path: Optional[str] = field( + default="./data", + metadata={"help": "The path of datasets to be loaded."}, ) - for layer in range(n_layers): - key_to_depths[f"mpnet.encoder.layer.{str(layer)}."] = layer + 1 - return {key: (layer_decay ** (n_layers + 2 - depth)) for key, depth in key_to_depths.items()} -def set_seed(args): - # Use the same data seed(for data shuffle) for all procs to guarantee data - # consistency after sharding. - random.seed(args.seed) - np.random.seed(args.seed) - # Maybe different op seeds(for dropout) for different procs is better. 
By: - # `paddle.seed(args.seed + paddle.distributed.get_rank())` - paddle.seed(args.seed) - - -@paddle.no_grad() -def evaluate(model, loss_fct, metric, data_loader): - model.eval() - metric.reset() - for batch in data_loader: - input_ids, segment_ids, labels = batch - logits = model(input_ids) - loss = loss_fct(logits, labels) - correct = metric.compute(logits, labels) - metric.update(correct) +def compute_metrics(eval_preds, metric): + labels = paddle.to_tensor(eval_preds.label_ids, dtype="int64") + preds = paddle.to_tensor(eval_preds.predictions) + preds = paddle.nn.functional.softmax(preds, axis=-1) + labels = paddle.argmax(labels, axis=-1) + correct = metric.compute(preds, labels) + metric.update(correct) res = metric.accumulate() if isinstance(metric, AccuracyAndF1): - print( - "eval loss: %f, acc: %s, precision: %s, recall: %s, f1: %s, acc and f1: %s, " - % ( - loss.numpy(), - res[0], - res[1], - res[2], - res[3], - res[4], - ) - ) + return { + "acc": res[0], + "precision": res[1], + "recall": res[2], + "f1": res[3], + "acc and f1": res[4], + } elif isinstance(metric, Mcc): - print("eval loss: %f, mcc: %s, " % (loss.numpy(), res[0])) + return { + "mcc": res[0], + } elif isinstance(metric, PearsonAndSpearman): - print( - "eval loss: %f, pearson: %s, spearman: %s, pearson and spearman: %s, " - % (loss.numpy(), res[0], res[1], res[2]), - end="", - ) + return { + "pearson": res[0], + "spearman": res[1], + "pearson and spearman": res[2], + } else: - print("eval loss: %f, acc: %s, " % (loss.numpy(), res)) - model.train() + return { + "acc": res, + } def convert_example(example, tokenizer, label_list, max_seq_length=512, is_test=False): @@ -255,12 +313,13 @@ def convert_example(example, tokenizer, label_list, max_seq_length=512, is_test= label = np.array([label], dtype=label_dtype) # Convert raw text to feature if (int(is_test) + len(example)) == 2: - example = tokenizer(example["sentence"], max_seq_len=max_seq_length) + example = tokenizer(example["sentence"], max_length=max_seq_length, return_token_type_ids=True) else: example = tokenizer( example["sentence1"], text_pair=example["sentence2"], - max_seq_len=max_seq_length, + max_length=max_seq_length, + return_token_type_ids=True, ) if not is_test: @@ -269,88 +328,89 @@ def convert_example(example, tokenizer, label_list, max_seq_length=512, is_test= return example["input_ids"], example["token_type_ids"] -def do_train(args): - paddle.set_device(args.device) +@dataclass +class DataCollator: + def __init__(self, tokenizer, train_ds): + self.tokenizer = (tokenizer,) + self.train_ds = (train_ds,) + + def __call__(self, features): + input_ids = [] + labels = [] + batch = {} + + for feature in features: + input_idx, _, label = feature + input_ids.append(input_idx) + labels.append(label) + + if not isinstance(self.tokenizer, MPNetTokenizer): + self.tokenizer = self.tokenizer[0] + self.train_ds = self.train_ds[0] + input_ids = (Pad(axis=0, pad_val=self.tokenizer.pad_token_id)(input_ids),) # input_ids + labels = (Stack(dtype="int64" if self.train_ds.label_list else "float32")(labels),) # labels + + batch["input_ids"] = input_ids[0] + batch["labels"] = labels[0] + + return batch + + +def _get_layer_lr_radios(layer_decay=0.8, n_layers=12): + """Have lower learning rates for layers closer to the input.""" + key_to_depths = OrderedDict( + { + "mpnet.embeddings.": 0, + "mpnet.encoder.relative_attention_bias.": 0, + "mpnet.pooler.": n_layers + 2, + "mpnet.classifier.": n_layers + 2, + } + ) + for layer in range(n_layers): + 
key_to_depths[f"mpnet.encoder.layer.{str(layer)}."] = layer + 1 + return {key: (layer_decay ** (n_layers + 2 - depth)) for key, depth in key_to_depths.items()} + + +def do_train(): + parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + if training_args.output_dir is None: + training_args.output_dir = model_args.task_name.lower() + if model_args.task_name is not None: + training_args.task_name = model_args.task_name + if model_args.max_seq_length is not None: + training_args.max_seq_length = model_args.max_seq_length + + paddle.set_device(training_args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() - set_seed(args) + set_seed(training_args.seed) - args.task_name = args.task_name.lower() - metric_class = METRIC_CLASSES[args.task_name] - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + model_args.task_name = model_args.task_name.lower() + metric_class = METRIC_CLASSES[model_args.task_name] + model_args.model_type = model_args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[model_args.model_type] - train_ds = load_dataset("glue", args.task_name, splits="train") - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + train_ds = load_dataset("glue", model_args.task_name, splits="train") + tokenizer = tokenizer_class.from_pretrained(model_args.model_name_or_path) + training_args.label_list = train_ds.label_list trans_func = partial( convert_example, tokenizer=tokenizer, - label_list=train_ds.label_list, - max_seq_length=args.max_seq_length, + label_list=training_args.label_list, + max_seq_length=model_args.max_seq_length, ) train_ds = train_ds.map(trans_func, lazy=True) - train_batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=args.batch_size, shuffle=True) - batchify_fn = lambda samples, fn=Tuple( - Pad(axis=0, pad_val=tokenizer.pad_token_id), # input - Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment - Stack(dtype="int64" if train_ds.label_list else "float32"), # label - ): fn(samples) - train_data_loader = DataLoader( - dataset=train_ds, - batch_sampler=train_batch_sampler, - collate_fn=batchify_fn, - num_workers=0, - return_list=True, - ) - if args.task_name == "mnli": - dev_ds_matched, dev_ds_mismatched = load_dataset( - "glue", args.task_name, splits=["dev_matched", "dev_mismatched"] - ) - - dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True) - dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True) - dev_batch_sampler_matched = paddle.io.BatchSampler( - dev_ds_matched, batch_size=args.batch_size * 2, shuffle=False - ) - dev_data_loader_matched = DataLoader( - dataset=dev_ds_matched, - batch_sampler=dev_batch_sampler_matched, - collate_fn=batchify_fn, - num_workers=2, - return_list=True, - ) - dev_batch_sampler_mismatched = paddle.io.BatchSampler( - dev_ds_mismatched, batch_size=args.batch_size * 2, shuffle=False - ) - dev_data_loader_mismatched = DataLoader( - dataset=dev_ds_mismatched, - batch_sampler=dev_batch_sampler_mismatched, - collate_fn=batchify_fn, - num_workers=2, - return_list=True, - ) - else: - dev_ds = load_dataset("glue", args.task_name, splits="dev") - dev_ds = dev_ds.map(trans_func, lazy=True) - dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size * 2, shuffle=False) - dev_data_loader = DataLoader( - dataset=dev_ds, - batch_sampler=dev_batch_sampler, - 
collate_fn=batchify_fn, - num_workers=2, - return_list=True, - ) + batchify_fn = DataCollator(tokenizer, train_ds) - num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list) - model = model_class.from_pretrained(args.model_name_or_path, num_classes=num_classes) - if paddle.distributed.get_world_size() > 1: - model = paddle.DataParallel(model) + num_classes = 1 if training_args.label_list is None else len(training_args.label_list) + model = model_class.from_pretrained(model_args.model_name_or_path, num_classes=num_classes) - # layer_lr for base - if args.layer_lr_decay != 1.0: - layer_lr_radios_map = _get_layer_lr_radios(args.layer_lr_decay, n_layers=12) + if model_args.layer_lr_decay != 1.0: + layer_lr_radios_map = _get_layer_lr_radios(model_args.layer_lr_decay, n_layers=12) for name, parameter in model.named_parameters(): layer_lr_radio = 1.0 for k, radio in layer_lr_radios_map.items(): @@ -359,107 +419,36 @@ def do_train(args): break parameter.optimize_attr["learning_rate"] *= layer_lr_radio - num_training_steps = args.max_steps if args.max_steps > 0 else (len(train_data_loader) * args.num_train_epochs) - args.num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) - - warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion - - if args.scheduler_type == "linear": - lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) - elif args.scheduler_type == "cosine": - lr_scheduler = CosineDecayWithWarmup(args.learning_rate, num_training_steps, warmup) - # Generate parameter names needed to perform weight decay. - # All bias and LayerNorm parameters are excluded. - decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])] - - optimizer = paddle.optimizer.AdamW( - learning_rate=lr_scheduler, - beta1=0.9, - beta2=0.98, - epsilon=args.adam_epsilon, - parameters=model.parameters(), - weight_decay=args.weight_decay, - apply_decay_param_fun=lambda x: x in decay_params, - ) - - loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_ds.label_list else paddle.nn.loss.MSELoss() + loss_fct = paddle.nn.loss.CrossEntropyLoss() if training_args.label_list else paddle.nn.loss.MSELoss() metric = metric_class() + compute_metrics_func = partial( + compute_metrics, + metric=metric, + ) - global_step = 0 - tic_train = time.time() - - print("num_training_steps", num_training_steps) - os.mkdir(args.output_dir) - save_json(args.__dict__, os.path.join(args.output_dir, "args.json")) - - for epoch in range(args.num_train_epochs): - for step, batch in enumerate(train_data_loader): - global_step += 1 - - input_ids, segment_ids, labels = batch - logits = model(input_ids) - loss = loss_fct(logits, labels) - loss.backward() - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - if global_step % args.logging_steps == 0 or global_step == num_training_steps: - print( - "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s" - % ( - global_step, - num_training_steps, - epoch, - step, - paddle.distributed.get_rank(), - loss, - optimizer.get_lr(), - args.logging_steps / (time.time() - tic_train), - ) - ) - tic_train = time.time() - if global_step % args.save_steps == 0 or global_step == num_training_steps: - tic_eval = time.time() - if args.task_name == "mnli": - evaluate(model, loss_fct, metric, dev_data_loader_matched) - evaluate(model, loss_fct, metric, dev_data_loader_mismatched) - print("eval done total : %s s" % (time.time() - 
tic_eval)) - else: - evaluate(model, loss_fct, metric, dev_data_loader) - print("eval done total : %s s" % (time.time() - tic_eval)) - print("=" * 100) - if paddle.distributed.get_rank() == 0: - output_dir = os.path.join( - args.output_dir, - "%s_ft_model_%d.pdparams" % (args.task_name, global_step), - ) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - # Need better way to get inner model of DataParallel - model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model - model_to_save.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - - if global_step >= num_training_steps: - return + trainer = MPNetTrainer( + model=model, + args=training_args, + train_dataset=train_ds if training_args.do_train else None, + tokenizer=tokenizer, + data_collator=batchify_fn, + criterion=loss_fct, + compute_metrics=compute_metrics_func, + ) + if training_args.do_train: + train_results = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + metrics = train_results.metrics + trainer.save_model() + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() -def print_arguments(args): - """print arguments""" - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") + if training_args.do_eval: + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) if __name__ == "__main__": - args = parse_args() - if args.output_dir is None: - args.output_dir = args.task_name.lower() - print_arguments(args) - n_gpu = len(os.getenv("CUDA_VISIBLE_DEVICES", "").split(",")) - if args.device in "gpu" and n_gpu > 1: - paddle.distributed.spawn(do_train, args=(args,), nprocs=n_gpu) - else: - do_train(args) + do_train() diff --git a/examples/language_model/mpnet/glue/train.sh b/examples/language_model/mpnet/glue/train.sh index c17ed9313180..e068d234c8c1 100644 --- a/examples/language_model/mpnet/glue/train.sh +++ b/examples/language_model/mpnet/glue/train.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # ["cola","sst-2","mrpc","sts-b","qqp","mnli", "rte", "qnli"] unset CUDA_VISIBLE_DEVICES # QQP @@ -7,17 +21,19 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ --model_name_or_path mpnet-base \ --task_name qqp \ --max_seq_length 128 \ - --batch_size 32 \ + --per_device_train_batch_size 32 \ --learning_rate 1e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --layer_lr_decay 1.0 \ --weight_decay 0.1 \ --warmup_steps 5666 \ --max_steps 113272 \ - --logging_steps 500 \ - --save_steps 2000 \ + --logging_steps 1 \ + --save_steps 3 \ --seed 42 \ - --output_dir qqp/ \ + --output_dir qqp \ + --do_train \ + --do_eval \ --device gpu # COLA @@ -26,17 +42,19 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ --model_name_or_path mpnet-base \ --task_name cola \ --max_seq_length 128 \ - --batch_size 16 \ + --per_device_train_batch_size 16 \ --learning_rate 1e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --layer_lr_decay 1.0 \ --weight_decay 0.1 \ - --warmup_proportion 0.06 \ + --warmup_ratio 0.06 \ --num_train_epochs 10 \ --logging_steps 200 \ --save_steps 200 \ --seed 42 \ --output_dir cola \ + --do_train \ + --do_eval \ --device gpu # QNLI @@ -45,17 +63,19 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ --model_name_or_path mpnet-base \ --task_name qnli \ --max_seq_length 128 \ - --batch_size 32 \ + --per_device_train_batch_size 32 \ --learning_rate 1e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --layer_lr_decay 1.0 \ --weight_decay 0.1 \ - --warmup_proportion 0.06 \ + --warmup_ratio 0.06 \ --num_train_epochs 10 \ --logging_steps 1000 \ --save_steps 1000 \ --seed 42 \ --output_dir qnli \ + --do_train \ + --do_eval \ --device gpu # SST2 @@ -64,17 +84,19 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ --model_name_or_path mpnet-base \ --task_name sst-2 \ --max_seq_length 128 \ - --batch_size 32 \ + --per_device_train_batch_size 32 \ --learning_rate 1e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --layer_lr_decay 1.0 \ --weight_decay 0.1 \ - --warmup_proportion 0.06 \ + --warmup_ratio 0.06 \ --num_train_epochs 10 \ --logging_steps 400 \ --save_steps 400 \ --seed 42 \ --output_dir sst-2 \ + --do_train \ + --do_eval \ --device gpu @@ -86,17 +108,19 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ --model_name_or_path mpnet-base \ --task_name mnli \ --max_seq_length 128 \ - --batch_size 32 \ + --per_device_train_batch_size 32 \ --learning_rate 1e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --layer_lr_decay 1.0 \ --weight_decay 0.1 \ - --warmup_proportion 0.06 \ + --warmup_ratio 0.06 \ --num_train_epochs 10 \ --logging_steps 1000 \ --save_steps 1000 \ --seed 42 \ --output_dir mnli \ + --do_train \ + --do_eval \ --device gpu ######################################################## @@ -107,17 +131,19 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ --model_name_or_path $MNLI_BEST_CKPT \ --task_name rte \ --max_seq_length 128 \ - --batch_size 16 \ + --per_device_train_batch_size 16 \ --learning_rate 2e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --layer_lr_decay 1.0 \ --weight_decay 0.1 \ - --warmup_proportion 0.06 \ + --warmup_ratio 0.06 \ --num_train_epochs 13 \ --logging_steps 100 \ --save_steps 100 \ --seed 42 \ --output_dir rte \ + --do_train \ + --do_eval \ --device gpu ############################################################ @@ -127,17 +153,19 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py 
\ --model_name_or_path $MNLI_BEST_CKPT \ --task_name mrpc \ --max_seq_length 128 \ - --batch_size 16 \ + --per_device_train_batch_size 16 \ --learning_rate 1e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --layer_lr_decay 1.0 \ --weight_decay 0.1 \ - --warmup_proportion 0.06 \ + --warmup_ratio 0.06 \ --num_train_epochs 10 \ --logging_steps 100 \ --save_steps 100 \ --seed 42 \ --output_dir mrpc \ + --do_train \ + --do_eval \ --device gpu ############################################################ @@ -147,17 +175,19 @@ python -m paddle.distributed.launch --gpus "0" run_glue.py \ --model_name_or_path $MNLI_BEST_CKPT \ --task_name rte \ --max_seq_length 128 \ - --batch_size 16 \ + --per_device_train_batch_size 16 \ --learning_rate 2e-5 \ - --scheduler_type linear \ + --lr_scheduler_type linear \ --layer_lr_decay 1.0 \ --weight_decay 0.1 \ - --warmup_proportion 0.06 \ + --warmup_ratio 0.06 \ --num_train_epochs 10 \ --logging_steps 100 \ --save_steps 100 \ --seed 42 \ --output_dir rte \ + --do_train \ + --do_eval \ --device gpu ############################################################ diff --git a/examples/language_model/mpnet/squad/args.py b/examples/language_model/mpnet/squad/args.py deleted file mode 100644 index 1f025641e6f2..000000000000 --- a/examples/language_model/mpnet/squad/args.py +++ /dev/null @@ -1,94 +0,0 @@ -import argparse - - -def parse_args(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--train_file", type=str, required=False, default=None, help="Train data path.") - parser.add_argument("--predict_file", type=str, required=False, default=None, help="Predict data path.") - parser.add_argument("--model_type", default="convbert", type=str, help="Type of pre-trained model.") - parser.add_argument( - "--model_name_or_path", - default="convbert-base", - type=str, - help="Path to pre-trained model or shortcut name of model.", - ) - parser.add_argument( - "--output_dir", - default="outputs", - type=str, - help="The output directory where the model predictions and checkpoints will be written. " - "Default as `outputs`", - ) - parser.add_argument( - "--max_seq_length", - default=512, - type=int, - help="The maximum total input sequence length after tokenization. Sequences longer " - "than this will be truncated, sequences shorter will be padded.", - ) - parser.add_argument("--batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") - parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") - parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.") - parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") - parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") - parser.add_argument("--num_train_epochs", default=2, type=int, help="Total number of training epochs to perform.") - parser.add_argument( - "--max_steps", - default=-1, - type=int, - help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", - ) - parser.add_argument( - "--warmup_proportion", - default=0.1, - type=float, - help="Proportion of training steps to perform linear learning rate warmup for.", - ) - parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") - parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") - parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") - parser.add_argument( - "--device", choices=["cpu", "gpu"], default="gpu", help="Select which device to train model, defaults to gpu." - ) - parser.add_argument( - "--doc_stride", - type=int, - default=128, - help="When splitting up a long document into chunks, how much stride to take between chunks.", - ) - parser.add_argument( - "--n_best_size", - type=int, - default=20, - help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", - ) - parser.add_argument( - "--null_score_diff_threshold", - type=float, - default=0.0, - help="If null_score - best_non_null is greater than the threshold predict null.", - ) - parser.add_argument("--max_query_length", type=int, default=64, help="Max query length.") - parser.add_argument("--max_answer_length", type=int, default=30, help="Max answer length.") - parser.add_argument( - "--do_lower_case", - action="store_false", - help="Whether to lower case the input text. Should be True for uncased models and False for cased models.", - ) - parser.add_argument("--verbose", action="store_true", help="Whether to output verbose log.") - parser.add_argument( - "--version_2_with_negative", - action="store_true", - help="If true, the SQuAD examples contain some that do not have an answer. If using squad v2.0, it should be set true.", - ) - parser.add_argument("--do_train", action="store_true", help="Whether to train the model.") - parser.add_argument("--do_predict", action="store_true", help="Whether to predict.") - parser.add_argument( - "--scheduler_type", - default="linear", - type=str, - help="scheduler_type.", - ) - parser.add_argument("--layer_lr_decay", default=1.0, type=float, help="layer_lr_decay") - args = parser.parse_args() - return args diff --git a/examples/language_model/mpnet/squad/run_squad.py b/examples/language_model/mpnet/squad/run_squad.py index 78fc94c6fd6a..502bbea9e1bd 100644 --- a/examples/language_model/mpnet/squad/run_squad.py +++ b/examples/language_model/mpnet/squad/run_squad.py @@ -13,31 +13,269 @@ # See the License for the specific language governing permissions and # limitations under the License. 
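
Editor's note: the deleted `args.py` above held the argparse definitions for the SQuAD script; the rewritten `run_squad.py` that follows replaces them with dataclass-based arguments parsed by `PdArgumentParser`, so generic training knobs come from `TrainingArguments` while task-specific options live in `ModelArguments`/`DataArguments`. A minimal sketch of that pattern is below; the field names shown are illustrative, not the PR's exact set.

```python
from dataclasses import dataclass, field
from typing import Optional

from paddlenlp.trainer import PdArgumentParser, TrainingArguments


@dataclass
class ModelArguments:
    # Illustrative subset of the options that previously lived in args.py.
    model_name_or_path: Optional[str] = field(
        default="mpnet-base", metadata={"help": "Pretrained model name or local path."}
    )
    max_seq_length: Optional[int] = field(
        default=512, metadata={"help": "Maximum input length after tokenization."}
    )


def main():
    parser = PdArgumentParser((ModelArguments, TrainingArguments))
    model_args, training_args = parser.parse_args_into_dataclasses()
    # Generic knobs (--learning_rate, --per_device_train_batch_size, --warmup_ratio,
    # --lr_scheduler_type, --do_train, --do_eval, ...) are fields of TrainingArguments.
    print(model_args.model_name_or_path, training_args.output_dir)


if __name__ == "__main__":
    main()
```

Because the old argparse spellings such as `--batch_size` and `--warmup_proportion` are not fields of `TrainingArguments`, the README and `train.sh` commands are renamed accordingly (`--per_device_train_batch_size`, `--warmup_ratio`, `--lr_scheduler_type`).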
-import os -import random -import time import json -import math - +from collections import OrderedDict +from dataclasses import dataclass, field from functools import partial +from typing import List, Optional + import numpy as np import paddle -from paddle.io import DataLoader -from args import parse_args -from paddlenlp.data import Pad, Stack, Dict +from datasets import load_dataset +from paddle.io import DataLoader, Dataset + +from paddlenlp.data import Pad, Stack +from paddlenlp.metrics.squad import compute_prediction, squad_evaluate +from paddlenlp.trainer import PdArgumentParser, Trainer, TrainingArguments, set_seed +from paddlenlp.trainer.trainer_utils import ( + EvalLoopOutput, + EvalPrediction, + IterableDatasetShard, + find_batch_size, + has_length, +) +from paddlenlp.trainer.utils.helper import ( + nested_concat, + nested_numpify, + nested_truncate, +) from paddlenlp.transformers import ( BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, - MPNetForQuestionAnswering, ErnieTokenizer, + MPNetForQuestionAnswering, MPNetTokenizer, ) +from paddlenlp.utils.batch_sampler import ( + DistributedBatchSampler as NlpDistributedBatchSampler, +) +from paddlenlp.utils.log import logger + + +def is_datasets_available(): + import importlib + + return importlib.util.find_spec("datasets") is not None + + +class MPNetTrainer(Trainer): + def set_eval_collator(self, collator): + self.eval_collate_fn = collator + + def set_eval_raw_dataset(self, raw_dataset): + self.eval_raw_dataset = raw_dataset + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation [`~paddle.io.DataLoader`]. + Subclass and override this method if you want to inject some custom behavior. + Args: + eval_dataset (`paddle.io.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is an `datasets.Dataset`, columns not accepted by + the `model.forward()` method are automatically removed. It must implement `__len__`. + """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + + if self._is_iterable_dataset(eval_dataset): + if self.args.world_size > 1: + eval_dataset = IterableDatasetShard( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + drop_last=self.args.dataloader_drop_last, + num_processes=self.args.world_size, + process_index=self.args.process_index, + ) + + return DataLoader( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + collate_fn=self.eval_collate_fn, + num_workers=self.args.dataloader_num_workers, + ) + + eval_sampler = self._get_eval_sampler(eval_dataset) + + return DataLoader( + eval_dataset, + batch_sampler=eval_sampler, + collate_fn=self.eval_collate_fn, + num_workers=self.args.dataloader_num_workers, + ) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_eval_iters: Optional[int] = -1, + ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. + Works both with or without labels. 
+ """ + args = self.args + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only + + model = self.model + + if isinstance(dataloader, paddle.io.DataLoader): + batch_size = dataloader.batch_sampler.batch_size + elif isinstance(dataloader, paddle.fluid.dataloader.dataloader_iter._DataLoaderIterBase): + # support for inner dataloader + batch_size = dataloader._batch_sampler.batch_size + # alias for inner dataloader + dataloader.dataset = dataloader._dataset + else: + raise ValueError("Only support for paddle.io.DataLoader") + + num_samples = None + if max_eval_iters > 0: + # on eval limit steps + num_samples = batch_size * self.args.world_size * max_eval_iters + if isinstance(dataloader, paddle.fluid.dataloader.dataloader_iter._DataLoaderIterBase) and isinstance( + dataloader._batch_sampler, NlpDistributedBatchSampler + ): + consumed_samples = ( + ((self.state.global_step) // args.eval_steps) + * max_eval_iters + * args.per_device_eval_batch_size + * args.world_size + ) + dataloader._batch_sampler.set_epoch(consumed_samples=consumed_samples) + + logger.info(f"***** Running {description} *****") + if has_length(dataloader): + logger.info(f" Num examples = {self.num_examples(dataloader)}") + if max_eval_iters > 0: + logger.info(f" Total prediction steps = {max_eval_iters}") + else: + logger.info(f" Total prediction steps = {len(dataloader)}") + else: + logger.info(" Num examples: Unknown") + if max_eval_iters > 0: + logger.info(f" Total prediction steps = {max_eval_iters}") + + logger.info(f" Pre device batch size = {batch_size}") + logger.info(f" Total Batch size = {batch_size * self.args.world_size}") + + model.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = dataloader.dataset + + if args.past_index >= 0: + self._past = None + + # Initialize containers + # losses/preds/labels on GPU (accumulated for eval_accumulation_steps) + losses_host = None + preds_host = None + labels_host = None + # losses/preds/labels on CPU (final containers) + all_losses = None + all_preds = None + all_labels = None + # Will be useful when we have an iterable dataset so don't know its length. + + observed_num_examples = 0 + # Main evaluation loop + losses = [] + for step, inputs in enumerate(dataloader): + # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + # For batch samplers, batch_size is not known by the dataloader in advance. 
+ if batch_size is None: + batch_size = observed_batch_size + + # Prediction step + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + + # Update containers on host + if loss is not None: + # losses = self._nested_gather(loss.repeat(batch_size)) + # losses = self._nested_gather(loss) + losses = self._nested_gather(paddle.tile(loss, repeat_times=[batch_size, 1])) + losses_host = losses if losses_host is None else paddle.concat((losses_host, losses), axis=0) + + if labels is not None: + labels = self._pad_across_processes(labels) + labels = self._nested_gather(labels) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + if logits is not None: + logits = self._pad_across_processes(logits) + logits = self._nested_gather(logits) + if self.preprocess_logits_for_metrics is not None: + logits = self.preprocess_logits_for_metrics(logits, labels) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) + if max_eval_iters > 0 and step >= max_eval_iters - 1: + break + + # Gather all remaining tensors and put them back on the CPU + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + # Number of samples + if num_samples is not None: + pass + elif has_length(eval_dataset): + num_samples = len(eval_dataset) + # The instance check is weird and does not actually check for the type, but whether the dataset has the right + # methods. Therefore we need to make sure it also has the attribute. + elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"): + num_samples = eval_dataset.num_examples + else: + if has_length(dataloader): + num_samples = self.num_examples(dataloader) + else: # both len(dataloader.dataset) and len(dataloader) fail + num_samples = observed_num_examples + + # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of + # samplers has been rounded to a multiple of batch_size, so we truncate. 
+ if all_losses is not None: + all_losses = all_losses[:num_samples] + if all_preds is not None: + all_preds = nested_truncate(all_preds, num_samples) + if all_labels is not None: + all_labels = nested_truncate(all_labels, num_samples) + + model.train() + + if self.compute_metrics is not None and all_preds is not None: + metrics = self.compute_metrics( + EvalPrediction(predictions=all_preds, label_ids=all_labels), + data_loader=dataloader, + raw_dataset=self.eval_raw_dataset, + ) + else: + metrics = {} + + if all_losses is not None: + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) -from paddlenlp.transformers import LinearDecayWithWarmup, CosineDecayWithWarmup -from paddlenlp.metrics.squad import squad_evaluate, compute_prediction -from datasets import load_dataset -from collections import OrderedDict MODEL_CLASSES = { "bert": (BertForQuestionAnswering, BertTokenizer), @@ -46,6 +284,96 @@ } +@dataclass +class ModelArguments: + max_seq_length: Optional[int] = field( + default=128, + metadata={ + "help": ( + "The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, sequences shorter will be padded." + ) + }, + ) + model_type: Optional[str] = field( + default="convbert", + metadata={"help": ("Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))}, + ) + model_name_or_path: Optional[str] = field( + default="convbert-base", + metadata={ + "help": ( + "Path to pre-trained model or shortcut name selected in the list: " + + ", ".join( + sum( + [list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values()], + [], + ) + ), + ) + }, + ) + layer_lr_decay: Optional[float] = field( + default=1.0, + metadata={"help": ("layer_lr_decay")}, + ) + doc_stride: Optional[int] = field( + default=128, + metadata={"help": ("When splitting up a long document into chunks, how much stride to take between chunks.")}, + ) + n_best_size: Optional[int] = field( + default=20, + metadata={ + "help": ("The total number of n-best predictions to generate in the nbest_predictions.json output file.") + }, + ) + null_score_diff_threshold: Optional[float] = field( + default=0.0, + metadata={"help": ("If null_score - best_non_null is greater than the threshold predict null.")}, + ) + max_query_length: Optional[int] = field( + default=64, + metadata={"help": ("Max query length.")}, + ) + max_answer_length: Optional[int] = field( + default=30, + metadata={"help": ("Max answer length.")}, + ) + do_lower_case: Optional[bool] = field( + default=False, + metadata={ + "help": ( + "Whether to lower case the input text. Should be True for uncased models and False for cased models." 
+ ) + }, + ) + verbose: Optional[bool] = field( + default=False, + metadata={"help": ("Whether to output verbose log.")}, + ) + version_2_with_negative: Optional[bool] = field( + default=False, + metadata={ + "help": ( + "If true, the SQuAD examples contain some that do not have an answer.", + "If using squad v2.0, it should be set true.", + ) + }, + ) + + +@dataclass +class DataArguments: + train_file: Optional[str] = field( + default=None, + metadata={"help": "Train data path."}, + ) + predict_file: Optional[str] = field( + default=None, + metadata={"help": "Predict data path."}, + ) + + def _get_layer_lr_radios(layer_decay=0.8, n_layers=12): """Have lower learning rates for layers closer to the input.""" key_to_depths = OrderedDict( @@ -71,7 +399,7 @@ def prepare_train_features(examples, tokenizer, args): # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( - questions, contexts, max_seq_len=args.max_seq_length, stride=args.doc_stride, return_attention_mask=True + questions, contexts, max_length=args.max_seq_length, stride=args.doc_stride, return_attention_mask=True ) # Since one example might give us several features if it has a long context, we need a map from a feature to @@ -145,7 +473,7 @@ def prepare_validation_features(examples, tokenizer, args): questions = examples["question"] tokenized_examples = tokenizer( - questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length, return_attention_mask=True + questions, contexts, stride=args.doc_stride, max_length=args.max_seq_length, return_attention_mask=True ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. 
@@ -177,10 +505,65 @@ def prepare_validation_features(examples, tokenizer, args): return tokenized_examples -def set_seed(args): - random.seed(args.seed) - np.random.seed(args.seed) - paddle.seed(args.seed) +@dataclass +class TrainDataCollator: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def __call__(self, features): + input_ids = [] + start_positions = [] + end_positions = [] + batch = {} + + for feature in features: + input_ids.append(feature["input_ids"]) + start_positions.append(feature["start_positions"]) + end_positions.append(feature["end_positions"]) + + input_ids = (Pad(axis=0, pad_val=self.tokenizer.pad_token_id)(input_ids),) # input_ids + start_positions = (Stack(dtype="int64")(start_positions),) # start_positions + end_positions = (Stack(dtype="int64")(end_positions),) # end_positions + + batch["input_ids"] = input_ids[0] + batch["start_positions"] = start_positions[0] + batch["end_positions"] = end_positions[0] + + return batch + + +@dataclass +class EvalDataCollator: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + def __call__(self, features): + input_ids = [] + batch = {} + + for feature in features: + input_ids.append(feature["input_ids"]) + + input_ids = (Pad(axis=0, pad_val=self.tokenizer.pad_token_id)(input_ids),) # input_ids + + batch["input_ids"] = input_ids[0] + + return batch + + +def compute_metrics(eval_preds, data_loader=None, raw_dataset=None, model_args=None): + start_logits, end_logits = eval_preds.predictions + all_predictions, all_nbest_json, scores_diff_json = compute_prediction( + raw_dataset, + data_loader.dataset, + (start_logits, end_logits), + model_args.version_2_with_negative, + model_args.n_best_size, + model_args.max_answer_length, + model_args.null_score_diff_threshold, + ) + squad_evaluate(examples=[raw_data for raw_data in raw_dataset], preds=all_predictions, na_probs=scores_diff_json) + return {} @paddle.no_grad() @@ -234,55 +617,26 @@ def forward(self, y, label): return loss -def run(args): - paddle.set_device(args.device) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.init_parallel_env() - rank = paddle.distributed.get_rank() - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) - - set_seed(args) - if rank == 0: - if os.path.exists(args.model_name_or_path): - print("init checkpoint from %s" % args.model_name_or_path) - - model = model_class.from_pretrained(args.model_name_or_path) +def do_train(): + parser = PdArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + paddle.set_device(training_args.device) if paddle.distributed.get_world_size() > 1: - model = paddle.DataParallel(model) + paddle.distributed.init_parallel_env() - if args.do_predict: - if args.version_2_with_negative: - dev_examples = load_dataset("squad_v2", split="validation") - else: - dev_examples = load_dataset("squad", split="validation") - column_names = dev_examples.column_names - dev_ds = dev_examples.map( - partial(prepare_validation_features, tokenizer=tokenizer, args=args), - batched=True, - remove_columns=column_names, - num_proc=4, - ) - dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=args.batch_size, shuffle=False) + set_seed(training_args.seed) - dev_batchify_fn = lambda samples, fn=Dict({"input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id)}): fn( - samples - ) + 
model_args.model_type = model_args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[model_args.model_type] - dev_data_loader = DataLoader( - dataset=dev_ds, - batch_sampler=dev_batch_sampler, - collate_fn=dev_batchify_fn, - num_workers=4, - return_list=True, - ) + tokenizer = tokenizer_class.from_pretrained(model_args.model_name_or_path) + model = model_class.from_pretrained(model_args.model_name_or_path) - if args.do_train: + if training_args.do_train: # layer_lr for base - if args.layer_lr_decay != 1.0: - layer_lr_radios_map = _get_layer_lr_radios(args.layer_lr_decay, n_layers=12) + if model_args.layer_lr_decay != 1.0: + layer_lr_radios_map = _get_layer_lr_radios(model_args.layer_lr_decay, n_layers=12) for name, parameter in model.named_parameters(): layer_lr_radio = 1.0 for k, radio in layer_lr_radios_map.items(): @@ -291,106 +645,65 @@ def run(args): break parameter.optimize_attr["learning_rate"] *= layer_lr_radio - if args.version_2_with_negative: + if model_args.version_2_with_negative: train_examples = load_dataset("squad_v2", split="train") else: train_examples = load_dataset("squad", split="train") column_names = train_examples.column_names train_ds = train_examples.map( - partial(prepare_train_features, tokenizer=tokenizer, args=args), + partial(prepare_train_features, tokenizer=tokenizer, args=model_args), batched=True, remove_columns=column_names, num_proc=4, ) - train_batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=args.batch_size, shuffle=True) - train_batchify_fn = lambda samples, fn=Dict( - { - "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), - "start_positions": Stack(dtype="int64"), - "end_positions": Stack(dtype="int64"), - } - ): fn(samples) - - train_data_loader = DataLoader( - dataset=train_ds, - batch_sampler=train_batch_sampler, - collate_fn=train_batchify_fn, - num_workers=4, - return_list=True, - ) - num_training_steps = args.max_steps if args.max_steps > 0 else len(train_data_loader) * args.num_train_epochs - num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) - - if args.scheduler_type == "linear": - lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) - elif args.scheduler_type == "cosine": - lr_scheduler = CosineDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) - - # Generate parameter names needed to perform weight decay. - # All bias and LayerNorm parameters are excluded. 
- decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])] - - optimizer = paddle.optimizer.AdamW( - learning_rate=lr_scheduler, - beta1=0.9, - beta2=0.98, - epsilon=args.adam_epsilon, - parameters=model.parameters(), - weight_decay=args.weight_decay, - apply_decay_param_fun=lambda x: x in decay_params, + if training_args.do_eval: + if model_args.version_2_with_negative: + dev_examples = load_dataset("squad_v2", split="validation") + else: + dev_examples = load_dataset("squad", split="validation") + column_names = dev_examples.column_names + dev_ds = dev_examples.map( + partial(prepare_validation_features, tokenizer=tokenizer, args=model_args), + batched=True, + remove_columns=column_names, + num_proc=4, ) - criterion = CrossEntropyLossForSQuAD() - - global_step = 0 - tic_train = time.time() - for epoch in range(num_train_epochs): - for step, batch in enumerate(train_data_loader): - global_step += 1 - input_ids, start_positions, end_positions = batch - - logits = model(input_ids=input_ids) - loss = criterion(logits, (start_positions, end_positions)) - - if global_step % args.logging_steps == 0: - print( - "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" - % (global_step, epoch + 1, step + 1, loss, args.logging_steps / (time.time() - tic_train)) - ) - tic_train = time.time() - loss.backward() - optimizer.step() - lr_scheduler.step() - optimizer.clear_grad() - - if global_step % args.save_steps == 0 or global_step == num_training_steps: - if rank == 0: - output_dir = os.path.join(args.output_dir, "model_%d" % global_step) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - # need better way to get inner model of DataParallel - model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model - model_to_save.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - print("Saving checkpoint to:", output_dir) - - if args.do_predict and rank == 0: - evaluate(model, dev_data_loader, dev_examples, args, global_step, False) - print("=" * 50) - - if global_step == num_training_steps: - return - - -def print_arguments(args): - """print arguments""" - print("----------- Configuration Arguments -----------") - for arg, value in sorted(vars(args).items()): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") + + batchify_fn_train = TrainDataCollator(tokenizer) + batchify_fn_eval = EvalDataCollator(tokenizer) + criterion = CrossEntropyLossForSQuAD() + + compute_metrics_func = partial( + compute_metrics, + model_args=model_args, + ) + + trainer = MPNetTrainer( + model=model, + args=training_args, + train_dataset=train_ds if training_args.do_train else None, + eval_dataset=dev_ds if training_args.do_eval else None, + tokenizer=tokenizer, + data_collator=batchify_fn_train, + criterion=criterion, + compute_metrics=compute_metrics_func, + ) + trainer.set_eval_collator(batchify_fn_eval) + trainer.set_eval_raw_dataset(dev_examples) + + if training_args.do_train: + train_results = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + metrics = train_results.metrics + trainer.save_model() + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + if training_args.do_eval: + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) if __name__ == "__main__": - args = parse_args() - print_arguments(args) - run(args) + do_train() diff --git 
a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index e6abdcd4e1a7..4d655242d73c 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -127,6 +127,7 @@ from .mobilebert.configuration import * from .mobilebert.modeling import * from .mobilebert.tokenizer import * +from .mpnet.configuration import * from .mpnet.modeling import * from .mpnet.tokenizer import * from .mt5.configuration import * diff --git a/paddlenlp/transformers/mpnet/configuration.py b/paddlenlp/transformers/mpnet/configuration.py new file mode 100644 index 000000000000..e443a4568fec --- /dev/null +++ b/paddlenlp/transformers/mpnet/configuration.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MPNet model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "MPNET_PRETRAINED_INIT_CONFIGURATION", + "MPNetConfig", +] + +MPNET_PRETRAINED_INIT_CONFIGURATION = {} + + +class MPNetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MPNetModel`]. It is used to + instantiate a MPNet model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MPNet. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30527): + Vocabulary size of the MPNet model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MPNetModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum sequence length that this model might ever be used with. 
Typically set this to something large + just in case. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + relative_attention_num_buckets (`int`, *optional*, defaults to 32): + The number of buckets to use for each attention layer. + + Examples: + + ```python + >>> from paddlenlp.transformers import MPNetModel, MPNetConfig + + >>> # Initializing a MPNet mpnet-base style configuration + >>> configuration = MPNetConfig() + + >>> # Initializing a model from the MPNet mpnet-base style configuration + >>> model = MPNetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "mpnet" + attribute_map = { + "num_classes": "num_labels", + } + + def __init__( + self, + vocab_size: int = 30527, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 514, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-5, + relative_attention_num_buckets: int = 32, + pad_token_id: int = 1, + bos_token_id: int = 0, + eos_token_id: int = 2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.relative_attention_num_buckets = relative_attention_num_buckets diff --git a/paddlenlp/transformers/mpnet/modeling.py b/paddlenlp/transformers/mpnet/modeling.py index cb28c0d2bd53..048e1f1415e7 100644 --- a/paddlenlp/transformers/mpnet/modeling.py +++ b/paddlenlp/transformers/mpnet/modeling.py @@ -23,6 +23,7 @@ from .. import PretrainedModel, register_base_model from ..activations import ACT2FN +from .configuration import MPNET_PRETRAINED_INIT_CONFIGURATION, MPNetConfig __all__ = [ "MPNetModel", @@ -50,21 +51,15 @@ class MPNetEmbeddings(nn.Layer): Include embeddings from word and position embeddings. 
""" - def __init__( - self, - vocab_size, - hidden_size=768, - hidden_dropout_prob=0.1, - max_position_embeddings=514, - layer_norm_eps=1e-5, - pad_token_id=1, - ): + def __init__(self, config: MPNetConfig): super(MPNetEmbeddings, self).__init__() - self.padding_idx = pad_token_id - self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=self.padding_idx) - self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size, padding_idx=self.padding_idx) - self.layer_norm = nn.LayerNorm(hidden_size, epsilon=layer_norm_eps) - self.dropout = nn.Dropout(hidden_dropout_prob) + self.padding_idx = config.pad_token_id + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, position_ids=None): @@ -82,32 +77,25 @@ def forward(self, input_ids, position_ids=None): class MPNetAttention(nn.Layer): - def __init__( - self, - hidden_size=768, - num_attention_heads=12, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - layer_norm_eps=1e-5, - ): + def __init__(self, config: MPNetConfig): super(MPNetAttention, self).__init__() - if hidden_size % num_attention_heads != 0: + if config.hidden_size % config.num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (hidden_size, num_attention_heads) + "heads (%d)" % (config.hidden_size, config.num_attention_heads) ) - self.num_attention_heads = num_attention_heads - self.attention_head_size = hidden_size // num_attention_heads + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.hidden_size // config.num_attention_heads self.all_head_size = self.num_attention_heads * self.attention_head_size self.scale = self.attention_head_size**-0.5 - self.q = nn.Linear(hidden_size, self.all_head_size) - self.k = nn.Linear(hidden_size, self.all_head_size) - self.v = nn.Linear(hidden_size, self.all_head_size) - self.o = nn.Linear(hidden_size, hidden_size) + self.q = nn.Linear(config.hidden_size, self.all_head_size) + self.k = nn.Linear(config.hidden_size, self.all_head_size) + self.v = nn.Linear(config.hidden_size, self.all_head_size) + self.o = nn.Linear(config.hidden_size, config.hidden_size) - self.attention_dropout = nn.Dropout(attention_probs_dropout_prob) - self.layer_norm = nn.LayerNorm(hidden_size, epsilon=layer_norm_eps) - self.output_dropout = nn.Dropout(hidden_dropout_prob) + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) def transpose_for_scores(self, x): new_x_shape = x.shape[:-1] + [ @@ -152,29 +140,14 @@ def forward(self, hidden_states, attention_mask=None, position_bias=None): class MPNetLayer(nn.Layer): - def __init__( - self, - hidden_size, - num_attention_heads, - intermediate_size, - hidden_act, - hidden_dropout_prob, - attention_probs_dropout_prob, - layer_norm_eps, - ): + def __init__(self, config: MPNetConfig): super(MPNetLayer, self).__init__() - self.attention = MPNetAttention( - hidden_size, - num_attention_heads, - hidden_dropout_prob, - attention_probs_dropout_prob, - layer_norm_eps, - ) - 
self.ffn = nn.Linear(hidden_size, intermediate_size) - self.ffn_output = nn.Linear(intermediate_size, hidden_size) - self.activation = ACT2FN[hidden_act] - self.layer_norm = nn.LayerNorm(hidden_size, epsilon=layer_norm_eps) - self.dropout = nn.Dropout(hidden_dropout_prob) + self.attention = MPNetAttention(config) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, attention_mask=None, position_bias=None): attention_output, layer_att = self.attention( @@ -192,30 +165,11 @@ def forward(self, hidden_states, attention_mask=None, position_bias=None): class MPNetEncoder(nn.Layer): - def __init__( - self, - hidden_size, - num_hidden_layers, - num_attention_heads, - intermediate_size, - hidden_act, - hidden_dropout_prob, - attention_probs_dropout_prob, - relative_attention_num_buckets, - layer_norm_eps, - ): + def __init__(self, config: MPNetConfig): super(MPNetEncoder, self).__init__() - layer = MPNetLayer( - hidden_size, - num_attention_heads, - intermediate_size, - hidden_act, - hidden_dropout_prob, - attention_probs_dropout_prob, - layer_norm_eps, - ) - self.layer = nn.LayerList([copy.deepcopy(layer) for _ in range(num_hidden_layers)]) - self.relative_attention_bias = nn.Embedding(relative_attention_num_buckets, num_attention_heads) + layer = MPNetLayer(config) + self.layer = nn.LayerList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.relative_attention_bias = nn.Embedding(config.relative_attention_num_buckets, config.num_attention_heads) def forward(self, hidden_states, attention_mask=None): position_bias = self.compute_position_bias(hidden_states) @@ -274,9 +228,9 @@ class MPNetPooler(nn.Layer): Pool the result of MPNetEncoder. """ - def __init__(self, hidden_size): + def __init__(self, config: MPNetConfig): super(MPNetPooler, self).__init__() - self.dense = nn.Linear(hidden_size, hidden_size) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() def forward(self, hidden_states): @@ -297,29 +251,14 @@ class MPNetPretrainedModel(PretrainedModel): See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
""" - pretrained_init_configuration = { - "mpnet-base": { - "vocab_size": 30527, - "hidden_size": 768, - "num_hidden_layers": 12, - "num_attention_heads": 12, - "intermediate_size": 3072, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 514, - "relative_attention_num_buckets": 32, - "layer_norm_eps": 1e-05, - "initializer_range": 0.02, - "pad_token_id": 1, - } - } + base_model_prefix = "mpnet" pretrained_resource_files_map = { "model_state": { "mpnet-base": "https://bj.bcebos.com/paddlenlp/models/transformers/mpnet/mpnet-base/model_state.pdparams", } } - base_model_prefix = "mpnet" + pretrained_init_configuration = MPNET_PRETRAINED_INIT_CONFIGURATION + config_class = MPNetConfig def init_weights(self, layer): """Initialization hook""" @@ -330,9 +269,7 @@ def init_weights(self, layer): layer.weight.set_value( paddle.tensor.normal( mean=0.0, - std=self.initializer_range - if hasattr(self, "initializer_range") - else self.mpnet.config["initializer_range"], + std=self.config.initializer_range, shape=layer.weight.shape, ) ) @@ -351,95 +288,16 @@ class MPNetModel(MPNetPretrainedModel): and refer to the Paddle documentation for all matter related to general usage and behavior. Args: - vocab_size (int): - Vocabulary size of `inputs_ids` in `MPNetModel`. Also is the vocab size of token embedding matrix. - Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `MPNetModel`. - hidden_size (int, optional): - Dimensionality of the embedding layer, encoder layer and pooler layer. Defaults to `768`. - num_hidden_layers (int, optional): - Number of hidden layers in the Transformer encoder. Defaults to `12`. - num_attention_heads (int, optional): - Number of attention heads for each attention layer in the Transformer encoder. - Defaults to `12`. - intermediate_size (int, optional): - Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors - to ff layers are firstly projected from `hidden_size` to `intermediate_size`, - and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. - Defaults to `3072`. - hidden_act (str, optional): - The non-linear activation function in the feed-forward layer. - ``"gelu"``, ``"relu"`` and any other paddle supported activation functions - are supported. Defaults to `"gelu"`. - hidden_dropout_prob (float, optional): - The dropout probability for all fully connected layers in the embeddings and encoder. - Defaults to `0.1`. - attention_probs_dropout_prob (float, optional): - The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. - Defaults to `0.1`. - max_position_embeddings (int, optional): - The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input - sequence. Defaults to `514`. - initializer_range (float, optional): - The standard deviation of the normal initializer. - Defaults to 0.02. - - .. note:: - A normal_initializer initializes weight matrices as normal distributions. - See :meth:`MPNetPretrainedModel.init_weights()` for how weights are initialized in `MPNetModel`. - - relative_attention_num_buckets (int, optional): - The number of buckets to use for each attention layer. - Defaults to `32`. - - layer_norm_eps (float, optional): - The epsilon used by the layer normalization layers. - Defaults to `1e-5`. 
- - pad_token_id (int, optional): - The index of padding token in the token vocabulary. - Defaults to `1`. - + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. """ - def __init__( - self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=514, - initializer_range=0.02, - relative_attention_num_buckets=32, - layer_norm_eps=1e-5, - pad_token_id=1, - ): - super(MPNetModel, self).__init__() - self.initializer_range = initializer_range - self.embeddings = MPNetEmbeddings( - vocab_size, - hidden_size, - hidden_dropout_prob, - max_position_embeddings, - layer_norm_eps, - pad_token_id, - ) - self.encoder = MPNetEncoder( - hidden_size, - num_hidden_layers, - num_attention_heads, - intermediate_size, - hidden_act, - hidden_dropout_prob, - attention_probs_dropout_prob, - relative_attention_num_buckets, - layer_norm_eps, - ) - - self.pooler = MPNetPooler(hidden_size) + def __init__(self, config: MPNetConfig): + super(MPNetModel, self).__init__(config) + self.initializer_range = config.initializer_range + self.embeddings = MPNetEmbeddings(config) + self.encoder = MPNetEncoder(config) + self.pooler = MPNetPooler(config) self.apply(self.init_weights) def forward(self, input_ids, position_ids=None, attention_mask=None): @@ -511,6 +369,12 @@ def forward(self, input_ids, position_ids=None, attention_mask=None): return sequence_output, pooled_output + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + class MPNetLMHead(nn.Layer): """ @@ -519,19 +383,24 @@ class MPNetLMHead(nn.Layer): def __init__( self, - hidden_size, - vocab_size, - hidden_act="gelu", + config: MPNetConfig, embedding_weights=None, - layer_norm_eps=1e-5, ): super(MPNetLMHead, self).__init__() - self.dense = nn.Linear(hidden_size, hidden_size) - self.activation = ACT2FN[hidden_act] - self.layer_norm = nn.LayerNorm(hidden_size, epsilon=layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) - self.decoder_weight = embedding_weights - self.decoder_bias = self.create_parameter(shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) + self.decoder_weight = ( + self.create_parameter( + shape=[config.vocab_size, config.hidden_size], dtype=self.dense.weight.dtype, is_bias=False + ) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -548,21 +417,15 @@ class MPNetForMaskedLM(MPNetPretrainedModel): MPNet Model with a `language modeling` head on top. Args: - MPNet (:class:`MPNetModel`): - An instance of :class:`MPNetModel`. + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. 
""" - def __init__(self, mpnet): - super(MPNetForMaskedLM, self).__init__() - self.mpnet = mpnet - self.lm_head = MPNetLMHead( - self.mpnet.config["hidden_size"], - self.mpnet.config["vocab_size"], - self.mpnet.config["hidden_act"], - self.mpnet.embeddings.word_embeddings.weight, - self.mpnet.config["layer_norm_eps"], - ) + def __init__(self, config: MPNetConfig): + super(MPNetForMaskedLM, self).__init__(config) + self.mpnet = MPNetModel(config) + self.lm_head = MPNetLMHead(config, embedding_weights=self.mpnet.embeddings.word_embeddings.weight) self.apply(self.init_weights) @@ -625,22 +488,18 @@ class MPNetForSequenceClassification(MPNetPretrainedModel): designed for sequence classification/regression tasks like GLUE tasks. Args: - mpnet (:class:`MPNetModel`): - An instance of MPNetModel. - num_classes (int, optional): - The number of classes. Defaults to `2`. - dropout (float, optional): - The dropout probability for output of MPNet. - If None, use the same value as `hidden_dropout_prob` of `MPNetModel` - instance `mpnet`. Defaults to None. + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. """ - def __init__(self, mpnet, num_classes=2, dropout=None): - super(MPNetForSequenceClassification, self).__init__() - self.num_classes = num_classes - self.mpnet = mpnet - self.dropout = nn.Dropout(dropout if dropout is not None else self.mpnet.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.mpnet.config["hidden_size"], num_classes) + def __init__(self, config: MPNetConfig): + super(MPNetForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.mpnet = MPNetModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.apply(self.init_weights) def forward(self, input_ids, position_ids=None, attention_mask=None): @@ -689,22 +548,20 @@ class MPNetForMultipleChoice(MPNetPretrainedModel): designed for multiple choice tasks like RocStories/SWAG tasks. Args: - mpnet (:class:`MPNetModel`): - An instance of MPNetModel. + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. num_choices (int, optional): The number of choices. Defaults to `2`. - dropout (float, optional): - The dropout probability for output of MPNet. - If None, use the same value as `hidden_dropout_prob` of `MPNetModel` - instance `mpnet`. Defaults to None. """ - def __init__(self, mpnet, num_choices=2, dropout=None): - super(MPNetForMultipleChoice, self).__init__() + def __init__(self, config: MPNetConfig, num_choices=2): + super(MPNetForMultipleChoice, self).__init__(config) self.num_choices = num_choices - self.mpnet = mpnet - self.dropout = nn.Dropout(dropout if dropout is not None else self.mpnet.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.mpnet.config["hidden_size"], 1) + self.mpnet = MPNetModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_weights) def forward(self, input_ids, position_ids=None, attention_mask=None): @@ -761,22 +618,18 @@ class MPNetForTokenClassification(MPNetPretrainedModel): designed for token classification tasks like NER tasks. Args: - mpnet (:class:`MPNetModel`): - An instance of MPNetModel. 
- num_classes (int, optional): - The number of classes. Defaults to `2`. - dropout (float, optional): - The dropout probability for output of MPNet. - If None, use the same value as `hidden_dropout_prob` of `MPNetModel` - instance `mpnet`. Defaults to None. + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. """ - def __init__(self, mpnet, num_classes=2, dropout=None): - super(MPNetForTokenClassification, self).__init__() - self.num_classes = num_classes - self.mpnet = mpnet - self.dropout = nn.Dropout(dropout if dropout is not None else self.mpnet.config["hidden_dropout_prob"]) - self.classifier = nn.Linear(self.mpnet.config["hidden_size"], num_classes) + def __init__(self, config: MPNetConfig): + super(MPNetForTokenClassification, self).__init__(config) + self.mpnet = MPNetModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) self.apply(self.init_weights) def forward(self, input_ids, position_ids=None, attention_mask=None): @@ -823,18 +676,14 @@ class MPNetForQuestionAnswering(MPNetPretrainedModel): and `span_end_logits`, designed for question-answering tasks like SQuAD. Args: - mpnet (:class:`MPNetModel`): - An instance of MPNetModel. - num_classes (int, optional): - The number of classes. Defaults to `2`. + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. """ - def __init__(self, mpnet, num_classes=2): - super(MPNetForQuestionAnswering, self).__init__() - self.mpnet = mpnet - self.num_classes = num_classes - self.qa_outputs = nn.Linear(self.mpnet.config["hidden_size"], num_classes) - + def __init__(self, config: MPNetConfig): + super(MPNetForQuestionAnswering, self).__init__(config) + self.mpnet = MPNetModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) self.apply(self.init_weights) def forward(self, input_ids, position_ids=None, attention_mask=None): diff --git a/paddlenlp/transformers/mpnet/tokenizer.py b/paddlenlp/transformers/mpnet/tokenizer.py index 07f4e90826db..f91bb00dde87 100644 --- a/paddlenlp/transformers/mpnet/tokenizer.py +++ b/paddlenlp/transformers/mpnet/tokenizer.py @@ -14,8 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..bert.tokenizer import BertTokenizer from .. 
import AddedToken +from ..bert.tokenizer import BertTokenizer __all__ = ["MPNetTokenizer"] @@ -59,6 +59,7 @@ def __init__( pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, + **kwargs, ) bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token @@ -84,32 +85,40 @@ def __call__( self, text, text_pair=None, - max_seq_len=None, + max_length=None, stride=0, + padding=False, is_split_into_words=False, pad_to_max_seq_len=False, - truncation_strategy="longest_first", + truncation=False, return_position_ids=False, return_token_type_ids=False, return_attention_mask=False, return_length=False, return_overflowing_tokens=False, return_special_tokens_mask=False, + add_special_tokens=True, + pad_to_multiple_of=None, + return_offsets_mapping=False, ): return super().__call__( text, text_pair=text_pair, - max_seq_len=max_seq_len, + max_length=max_length, stride=stride, + padding=padding, is_split_into_words=is_split_into_words, pad_to_max_seq_len=pad_to_max_seq_len, - truncation_strategy=truncation_strategy, + truncation=truncation, return_position_ids=return_position_ids, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, return_length=return_length, return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask, + add_special_tokens=add_special_tokens, + pad_to_multiple_of=pad_to_multiple_of, + return_offsets_mapping=return_offsets_mapping, ) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): @@ -184,3 +193,9 @@ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + [(0, 0)] + offset_mapping_1 + [(0, 0)] diff --git a/tests/transformers/mpnet/__init__.py b/tests/transformers/mpnet/__init__.py new file mode 100644 index 000000000000..595add0aed9e --- /dev/null +++ b/tests/transformers/mpnet/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/transformers/mpnet/test_modeling.py b/tests/transformers/mpnet/test_modeling.py new file mode 100644 index 000000000000..a3689d1cefc0 --- /dev/null +++ b/tests/transformers/mpnet/test_modeling.py @@ -0,0 +1,232 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle + +from paddlenlp.transformers import ( + MPNetConfig, + MPNetForMaskedLM, + MPNetForMultipleChoice, + MPNetForQuestionAnswering, + MPNetForSequenceClassification, + MPNetForTokenClassification, + MPNetModel, +) + +from ...transformers.test_configuration_common import ConfigTester +from ...transformers.test_modeling_common import ( + ModelTesterMixin, + ids_tensor, + random_attention_mask, +) + + +class MPNetModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + vocab_size=99, + hidden_size=64, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=64, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + sequence_labels = None + token_labels = None + choice_labels = None + + if self.parent.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return MPNetConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + ) + + def create_and_check_mpnet_model( + self, config, input_ids, input_mask, sequence_labels, token_labels, 
choice_labels + ): + model = MPNetModel(config=config) + model.eval() + result = model(input_ids, input_mask) + result = model(input_ids) + self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertEqual(result[1].shape, [self.batch_size, self.hidden_size]) + + def create_and_check_mpnet_for_question_answering( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MPNetForQuestionAnswering(config=config) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + ) + start_logits, end_logits = result[0], result[1] + self.parent.assertEqual(start_logits.shape, [self.batch_size, self.seq_length]) + self.parent.assertEqual(end_logits.shape, [self.batch_size, self.seq_length]) + + def create_and_check_mpnet_for_sequence_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MPNetForSequenceClassification(config) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + ) + if paddle.is_tensor(result): + result = [result] + + self.parent.assertEqual(result[0].shape, [self.batch_size, self.num_labels]) + + def create_and_check_mpnet_for_multiple_choice( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = MPNetForMultipleChoice(config=config, num_choices=self.num_choices) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand((-1, self.num_choices, -1)) + multiple_choice_input_mask = input_mask.unsqueeze(1).expand((-1, self.num_choices, -1)) + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + ) + if paddle.is_tensor(result): + result = [result] + + self.parent.assertEqual(result[0].shape, [self.batch_size, self.num_choices]) + + def create_and_check_mpnet_for_token_classification( + self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = MPNetForTokenClassification(config=config) + model.eval() + result = model(input_ids, attention_mask=input_mask) + if paddle.is_tensor(result): + result = [result] + + self.parent.assertEqual(result[0].shape, [self.batch_size, self.seq_length, self.num_labels]) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +class MPNetModelTest(ModelTesterMixin, unittest.TestCase): + base_model_class = MPNetModel + all_model_classes = ( + MPNetForMaskedLM, + MPNetForMultipleChoice, + MPNetForQuestionAnswering, + MPNetForSequenceClassification, + MPNetForTokenClassification, + MPNetModel, + ) + use_test_model_name_list = False + return_dict = False + use_labels = False + + def setUp(self): + self.model_tester = MPNetModelTester(self) + self.config_tester = ConfigTester(self, config_class=MPNetConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_mpnet_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_model(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_mpnet_for_sequence_classification(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_multiple_choice(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_token_classification(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mpnet_for_question_answering(*config_and_inputs) diff --git a/tests/transformers/mpnet/test_tokenizer.py b/tests/transformers/mpnet/test_tokenizer.py new file mode 100644 index 000000000000..53eb952523b2 --- /dev/null +++ b/tests/transformers/mpnet/test_tokenizer.py @@ -0,0 +1,74 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from paddlenlp.transformers import MPNetTokenizer + +from ...transformers.test_tokenizer_common import TokenizerTesterMixin + + +class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = MPNetTokenizer + space_between_special_tokens = True + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, MPNetTokenizer.resource_files_names["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) + + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("mpnet-base") + + text = tokenizer.encode("sequence builders", add_special_tokens=False)["input_ids"] + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)["input_ids"] + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [0] + text + [2] + assert encoded_pair == [0] + text + [2] + [2] + text_2 + [2] diff --git a/tests/transformers/test_tokenizer_common.py b/tests/transformers/test_tokenizer_common.py index 328adb9e7209..082f612904a2 100644 --- a/tests/transformers/test_tokenizer_common.py +++ 
b/tests/transformers/test_tokenizer_common.py @@ -2171,7 +2171,7 @@ def test_consecutive_unk_string(self): string = tokenizer.convert_tokens_to_string(tokens) encoding = tokenizer( text=string, - runcation=True, + truncation=True, return_offsets_mapping=True, ) self.assertEqual(len(encoding["input_ids"]), 4)
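
The hunks above migrate MPNet to the config-driven `MPNetConfig` API (the backbone, heads, and embeddings are now all built from a single config object) and switch the tokenizer to the `max_length`/`truncation` keyword style. As a minimal usage sketch of how the new pieces fit together — not part of the patch itself; the configuration values are illustrative, and it assumes this patch is applied and the `mpnet-base` vocabulary can be downloaded:

```python
import paddle

from paddlenlp.transformers import MPNetConfig, MPNetModel, MPNetTokenizer

# A deliberately small configuration for illustration; omitted fields fall back
# to the mpnet-base defaults documented in configuration.py.
config = MPNetConfig(
    num_hidden_layers=2,
    hidden_size=64,
    num_attention_heads=4,
    intermediate_size=128,
)
model = MPNetModel(config)  # embeddings, encoder and pooler are all built from `config`
model.eval()

tokenizer = MPNetTokenizer.from_pretrained("mpnet-base")
# The refactored __call__ accepts max_length / truncation keyword arguments.
encoded = tokenizer(
    "MPNet combines masked and permuted language modeling.",
    max_length=32,
    truncation=True,
)

input_ids = paddle.to_tensor([encoded["input_ids"]])
sequence_output, pooled_output = model(input_ids)
print(sequence_output.shape, pooled_output.shape)
```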