Upgrade from_pretrained (PaddlePaddle#223)
* update base, derived args in from_pretrained

* remove bigbird-base-uncased-finetune

* add dependency description

* add some explanation of why we need to set dropout to 0.0 in the finetune task

* add new comment for from_pretrained
joey12300 authored Apr 7, 2021
1 parent c0eebb3 commit 7230488
Showing 5 changed files with 26 additions and 35 deletions.
12 changes: 9 additions & 3 deletions examples/language_model/bigbird/README.md
@@ -15,6 +15,12 @@
```
## Quick Start

### Environment Dependencies

- sentencepiece

Installation command: `pip install sentencepiece`

### Data Preparation
According to the paper, Big Bird's pretraining data is currently constructed mainly from four corpora: Books, CC-News, Stories, and Wikipedia. Users can download and clean the corresponding data according to their own needs. A sample dataset is provided in the data directory.

@@ -61,7 +67,7 @@ python -m paddle.distributed.launch --gpus "0" --log_dir log run_pretrain.py --

```shell
export CUDA_VISIBLE_DEVICES=0
python run_classifier.py --model_name_or_path bigbird-base-uncased-finetune \
python run_classifier.py --model_name_or_path bigbird-base-uncased \
--output_dir "output" \
--batch_size 2 \
--learning_rate 1e-5 \
@@ -72,7 +78,7 @@ python run_classifier.py --model_name_or_path bigbird-base-uncased-finetune \

The parameters are described as follows:

- `model_name_or_path` indicates the pretrained model used for fine-tuning and the tokenizer used during pretraining. Currently supported pretrained models: "bigbird-base-uncased", "bigbird-base-uncased-finetune". If the model contents are saved locally, a directory path can also be provided here, e.g. "./checkpoint/model_xx/".
- `model_name_or_path` indicates the pretrained model used for fine-tuning and the tokenizer used during pretraining. Currently supported pretrained models: "bigbird-base-uncased". If the model contents are saved locally, a directory path can also be provided here, e.g. "./checkpoint/model_xx/" (a loading sketch follows this parameter list).
- `output_dir` specifies the output directory.
- `batch_size` the batch size used for training.
- `learning_rate` the learning rate used for training.
@@ -82,7 +88,7 @@ python run_classifier.py --model_name_or_path bigbird-base-uncased-finetune \
- `max_encoder_length` the maximum number of tokens for the MLM task.
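Where the model contents are saved locally, a minimal loading sketch might look like the following (assuming the directory was produced by `save_pretrained` during an earlier run; `./checkpoint/model_xx/` is just the illustrative placeholder from the parameter description above):

```python
from paddlenlp.transformers import BigBirdForSequenceClassification, BigBirdTokenizer

# Load the model and tokenizer from a local directory instead of a built-in model name.
# The directory is expected to contain the files written by save_pretrained().
model = BigBirdForSequenceClassification.from_pretrained("./checkpoint/model_xx/")
tokenizer = BigBirdTokenizer.from_pretrained("./checkpoint/model_xx/")
```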


After fine-tuning `bigbird-base-uncased-finetune` on the IMDB evaluation task, the results on the validation set are as follows:
After fine-tuning `bigbird-base-uncased` on the IMDB evaluation task, the results on the validation set are as follows:

| Task | Metric | Result |
|:-----:|:----------------------------:|:-----------------:|
4 changes: 3 additions & 1 deletion examples/language_model/bigbird/run_classifier.py
@@ -110,8 +110,10 @@ def main():
paddle.set_device(args.device)
set_seed(args)
# Define the model and metric
# In finetune task, bigbird performs better when setting dropout to zero.
model = BigBirdForSequenceClassification.from_pretrained(
args.model_name_or_path)
args.model_name_or_path, attn_dropout=0.0, hidden_dropout_prob=0.0)

criterion = nn.CrossEntropyLoss()
metric = paddle.metric.Accuracy()

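The keyword arguments above replace the removed `bigbird-base-uncased-finetune` configuration. A minimal standalone sketch of the equivalent call (assuming paddlenlp is installed and the pretrained weights can be downloaded):

```python
from paddlenlp.transformers import BigBirdForSequenceClassification

# attn_dropout and hidden_dropout_prob match BigBirdModel.__init__, so from_pretrained
# forwards them to the base model and they override the dropout values stored in the
# "bigbird-base-uncased" configuration; a dedicated "-finetune" config is no longer needed.
model = BigBirdForSequenceClassification.from_pretrained(
    "bigbird-base-uncased", attn_dropout=0.0, hidden_dropout_prob=0.0)
```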
23 changes: 0 additions & 23 deletions paddlenlp/transformers/bigbird/modeling.py
Expand Up @@ -231,35 +231,12 @@ class BigBirdPretrainedModel(PretrainedModel):
"num_labels": 2,
"initializer_range": 0.02,
},
"bigbird-base-uncased-finetune": {
"num_layers": 12,
"vocab_size": 50358,
"nhead": 12,
"attn_dropout": 0.0,
"dim_feedforward": 3072,
"activation": "gelu",
"normalize_before": False,
"block_size": 16,
"window_size": 3,
"num_global_blocks": 2,
"num_rand_blocks": 3,
"seed": None,
"pad_token_id": 0,
"hidden_size": 768,
"hidden_dropout_prob": 0.0,
"max_position_embeddings": 4096,
"type_vocab_size": 2,
"num_labels": 2,
"initializer_range": 0.02,
},
}
resource_files_names = {"model_state": "model_state.pdparams"}
pretrained_resource_files_map = {
"model_state": {
"bigbird-base-uncased":
"https://paddlenlp.bj.bcebos.com/models/transformers/bigbird/bigbird-base-uncased.pdparams",
"bigbird-base-uncased-finetune":
"https://paddlenlp.bj.bcebos.com/models/transformers/bigbird/bigbird-base-uncased.pdparams",
}
}
base_model_prefix = "bigbird"
5 changes: 0 additions & 5 deletions paddlenlp/transformers/bigbird/tokenizer.py
@@ -50,17 +50,12 @@ class BigBirdTokenizer(PretrainedTokenizer):
"sentencepiece_model_file": {
"bigbird-base-uncased":
"https://paddlenlp.bj.bcebos.com/models/transformers/bigbird/sentencepiece_gpt2.model",
"bigbird-base-uncased-finetune":
"https://paddlenlp.bj.bcebos.com/models/transformers/bigbird/sentencepiece_gpt2.model"
},
}
pretrained_init_configuration = {
"bigbird-base-uncased": {
"do_lower_case": True
},
"bigbird-base-uncased-finetune": {
"do_lower_case": True
}
}

def __init__(self,
17 changes: 14 additions & 3 deletions paddlenlp/transformers/model_utils.py
@@ -18,6 +18,7 @@
import os
import six
import logging
import inspect

import paddle
from paddle.nn import Layer
@@ -119,7 +120,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
this as position argument values for model initialization.
**kwargs (dict): keyword arguments for `__init__`. If provide, use
this to update pre-defined keyword argument values for model
initialization.
initialization. If the key is in base model `__init__`, update
keyword argument of base model; else update keyword argument of
derived model.
Returns:
PretrainedModel: An instance of PretrainedModel.
"""
@@ -178,7 +181,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
# class name corresponds to this configuration
init_class = init_kwargs.pop("init_class",
cls.base_model_class.__name__)

# Check if the loaded config matches the current model class's __init__
# arguments. If not match, the loaded config is for the base model class.
if init_class == cls.base_model_class.__name__:
@@ -209,6 +211,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
base_arg_index = arg_name
base_arg = arg
break

base_args = base_arg.pop("init_args", ())
base_kwargs = base_arg
if cls == cls.base_model_class:
@@ -218,13 +221,21 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
model = cls(*base_args, **base_kwargs)
else:
# Update with newly provided args and kwargs for derived model
base_parameters_dict = inspect.signature(
cls.base_model_class.__init__).parameters
for k, v in kwargs.items():
if k in base_parameters_dict:
base_kwargs[k] = v
base_model = cls.base_model_class(*base_args, **base_kwargs)
if base_arg_index is not None:
derived_args[base_arg_index] = base_model
else:
derived_args = (base_model, ) # assume at the first position
derived_args = derived_args if not args else args
derived_kwargs.update(kwargs)
derived_parameters_dict = inspect.signature(cls.__init__).parameters
for k, v in kwargs.items():
if k in derived_parameters_dict:
derived_kwargs[k] = v
model = cls(*derived_args, **derived_kwargs)

# Maybe need more ways to load resources.
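The argument routing added above can be illustrated without Paddle. The following self-contained sketch (with hypothetical `Base` and `Derived` classes that are not part of the repository) partitions keyword arguments between a base and a derived constructor via `inspect.signature`, mirroring what `from_pretrained` now does:

```python
import inspect

class Base:
    def __init__(self, hidden_size=768, dropout=0.1):
        self.hidden_size, self.dropout = hidden_size, dropout

class Derived:
    def __init__(self, base, num_labels=2):
        self.base, self.num_labels = base, num_labels

def build(base_kwargs, derived_kwargs, **overrides):
    # Keys found in Base.__init__ update the base kwargs; keys found in
    # Derived.__init__ update the derived kwargs, mirroring from_pretrained.
    base_params = inspect.signature(Base.__init__).parameters
    derived_params = inspect.signature(Derived.__init__).parameters
    for k, v in overrides.items():
        if k in base_params:
            base_kwargs[k] = v
        if k in derived_params:
            derived_kwargs[k] = v
    return Derived(Base(**base_kwargs), **derived_kwargs)

model = build({}, {}, dropout=0.0, num_labels=4)
print(model.base.dropout, model.num_labels)  # prints: 0.0 4
```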
