diff --git a/docs/model_zoo/transformers.rst b/docs/model_zoo/transformers.rst
index bec3747dbc5c..0b834b2fbfe2 100644
--- a/docs/model_zoo/transformers.rst
+++ b/docs/model_zoo/transformers.rst
@@ -9,7 +9,7 @@ PaddleNLP为用户提供了常用的 ``BERT``、``ERNIE``、``ALBERT``、``RoBER
 Transformer预训练模型汇总
 ------------------------------------

-下表汇总了介绍了目前PaddleNLP支持的各类预训练模型以及对应预训练权重。我们目前提供了 **21** 种网络结构, **104** 种预训练的参数权重供用户使用,
+下表汇总了介绍了目前PaddleNLP支持的各类预训练模型以及对应预训练权重。我们目前提供了 **21** 种网络结构, **107** 种预训练的参数权重供用户使用,
 其中包含了 **52** 种中文语言模型的预训练权重。

 +--------------------+----------------------------------------------------------------------------------+--------------+-----------------------------------------+
@@ -276,10 +276,22 @@ Transformer预训练模型汇总
 | | | | The model distilled from |
 | | | | the GPT model ``gpt-cpm-large-cn`` |
 | +----------------------------------------------------------------------------------+--------------+-----------------------------------------+
+| |``gpt2-en`` | English | 12-layer, 768-hidden, |
+| | | | 12-heads, 117M parameters. |
+| | | | Trained on English text. |
+| +----------------------------------------------------------------------------------+--------------+-----------------------------------------+
 | |``gpt2-medium-en`` | English | 24-layer, 1024-hidden, |
 | | | | 16-heads, 345M parameters. |
 | | | | Trained on English text. |
 | +----------------------------------------------------------------------------------+--------------+-----------------------------------------+
+| |``gpt2-large-en`` | English | 36-layer, 1280-hidden, |
+| | | | 20-heads, 774M parameters. |
+| | | | Trained on English text. |
+| +----------------------------------------------------------------------------------+--------------+-----------------------------------------+
+| |``gpt2-xl-en`` | English | 48-layer, 1600-hidden, |
+| | | | 25-heads, 1558M parameters. |
+| | | | Trained on English text. |
+| +----------------------------------------------------------------------------------+--------------+-----------------------------------------+
 | |``junnyu/distilgpt2`` | English | 6-layer, 768-hidden, |
 | | | | 12-heads, 81M parameters. |
 | | | | Trained on English text. |
@@ -289,11 +301,11 @@ Transformer预训练模型汇总
 | | | | Trained on English text. |
 | +----------------------------------------------------------------------------------+--------------+-----------------------------------------+
 | |``junnyu/microsoft-DialoGPT-medium`` | English | 24-layer, 1024-hidden, |
-| | | | 16-heads, 354M parameters. |
+| | | | 16-heads, 354M parameters. |
 | | | | Trained on English text. |
 | +----------------------------------------------------------------------------------+--------------+-----------------------------------------+
 | |``junnyu/microsoft-DialoGPT-large`` | English | 36-layer, 1280-hidden, |
-| | | | 20-heads, 774M parameters. |
+| | | | 20-heads, 774M parameters. |
 | | | | Trained on English text. |
 | +----------------------------------------------------------------------------------+--------------+-----------------------------------------+
 | |``junnyu/uer-gpt2-chinese-poem`` | Chinese | 12-layer, 768-hidden, |
@@ -619,4 +631,4 @@ Reference
 - Jiao, Xiaoqi, et al. "Tinybert: Distilling bert for natural language understanding." arXiv preprint arXiv:1909.10351 (2019).
 - Bao, Siqi, et al. "Plato-2: Towards building an open-domain chatbot via curriculum learning." arXiv preprint arXiv:2006.16779 (2020).
 - Yang, Zhilin, et al. "Xlnet: Generalized autoregressive pretraining for language understanding." arXiv preprint arXiv:1906.08237 (2019).
-- Cui, Yiming, et al. "Pre-training with whole word masking for chinese bert." arXiv preprint arXiv:1906.08101 (2019).
\ No newline at end of file
+- Cui, Yiming, et al. "Pre-training with whole word masking for chinese bert." arXiv preprint arXiv:1906.08101 (2019).
diff --git a/examples/language_model/gpt/README.md b/examples/language_model/gpt/README.md
index 0f7ed6fdc851..21ce5b1cce11 100644
--- a/examples/language_model/gpt/README.md
+++ b/examples/language_model/gpt/README.md
@@ -339,7 +339,7 @@ F1 | 0.549810 |

 ## 其他

-本项目提供了Huggingface的权重转化示例`converter.py`,`python xxx-gpt.bin`即可完成转换。用户可以参考转化脚本,转换自己需要的模型权重。
+本项目提供了Huggingface的权重转化示例`converter.py`,`python converter.py xxx-gpt.bin`即可完成转换。用户可以参考转化脚本,转换自己需要的模型权重。

 ## 参考文献
 - [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
diff --git a/paddlenlp/transformers/gpt/modeling.py b/paddlenlp/transformers/gpt/modeling.py
index 50d4e89b3e27..96a9d6f9c6de 100644
--- a/paddlenlp/transformers/gpt/modeling.py
+++ b/paddlenlp/transformers/gpt/modeling.py
@@ -503,6 +503,36 @@ class GPTPretrainedModel(PretrainedModel):
             "eos_token_id": 50256,
             "eol_token_id": 198,
         },
+        "gpt2-xl-en": { # 1558M
+            "vocab_size": 50257,
+            "hidden_size": 1600,
+            "num_hidden_layers": 48,
+            "num_attention_heads": 25,
+            "intermediate_size": 6400,
+            "hidden_act": "gelu",
+            "hidden_dropout_prob": 0.1,
+            "attention_probs_dropout_prob": 0.1,
+            "max_position_embeddings": 1024,
+            "type_vocab_size": 1, # no use
+            "initializer_range": 0.02,
+            "eos_token_id": 50256,
+            "eol_token_id": 198,
+        },
+        "gpt2-large-en": { # 774M
+            "vocab_size": 50257,
+            "hidden_size": 1280,
+            "num_hidden_layers": 36,
+            "num_attention_heads": 20,
+            "intermediate_size": 5120,
+            "hidden_act": "gelu",
+            "hidden_dropout_prob": 0.1,
+            "attention_probs_dropout_prob": 0.1,
+            "max_position_embeddings": 1024,
+            "type_vocab_size": 1, # no use
+            "initializer_range": 0.02,
+            "eos_token_id": 50256,
+            "eol_token_id": 198,
+        },
         "gpt2-medium-en": { #345M
             "vocab_size": 50304,
             "hidden_size": 1024,
@@ -519,7 +549,7 @@ class GPTPretrainedModel(PretrainedModel):
             "eol_token_id": 198,
         },
         "gpt2-en": { #117M
-            "vocab_size": 50304,
+            "vocab_size": 50257,
             "hidden_size": 768,
             "num_hidden_layers": 12,
             "num_attention_heads": 12,
@@ -556,8 +586,14 @@ class GPTPretrainedModel(PretrainedModel):
             "https://paddlenlp.bj.bcebos.com/models/transformers/gpt/gpt-cpm-large-cn.pdparams",
             "gpt-cpm-small-cn-distill":
             "https://paddlenlp.bj.bcebos.com/models/transformers/gpt/gpt-cpm-small-cn-distill.pdparams",
+            "gpt2-en":
+            "https://paddlenlp.bj.bcebos.com/models/transformers/gpt/gpt2-en.pdparams",
             "gpt2-medium-en":
             "https://paddlenlp.bj.bcebos.com/models/transformers/gpt/gpt2-medium-en.pdparams",
+            "gpt2-large-en":
+            "https://paddlenlp.bj.bcebos.com/models/transformers/gpt/gpt2-large-en.pdparams",
+            "gpt2-xl-en":
+            "https://paddlenlp.bj.bcebos.com/models/transformers/gpt/gpt2-xl-en.pdparams",
         }
     }
     base_model_prefix = "gpt"
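The README change above documents the Huggingface weight-conversion entry point (`python converter.py xxx-gpt.bin`), and the new `gpt2-en` / `gpt2-large-en` / `gpt2-xl-en` entries in `pretrained_resource_files_map` point at `.pdparams` files produced by such a conversion. As a rough, non-authoritative sketch of the general pattern (this is not the project's `converter.py`; the name mapping below is an illustrative placeholder), a converter typically loads the torch state dict, renames each parameter to its Paddle counterpart, transposes the Conv1D-style GPT-2 weights, and saves the result with `paddle.save`:

```python
# Illustrative sketch only -- not the project's converter.py. The NAME_MAP
# entries are placeholders; a real conversion must follow PaddleNLP's GPT
# parameter names exactly.
import sys

import paddle
import torch

# Hypothetical examples of HF-name -> Paddle-name entries.
NAME_MAP = {
    "wte.weight": "gpt.embeddings.word_embeddings.weight",
    "wpe.weight": "gpt.embeddings.position_embeddings.weight",
}
# HF GPT-2 uses Conv1D layers, whose weight matrices are stored transposed
# relative to an ordinary Linear layer.
TRANSPOSED_SUFFIXES = (".c_attn.weight", ".c_proj.weight", ".c_fc.weight")


def convert(torch_path, paddle_path):
    # Load the torch checkpoint on CPU and build a dict of numpy arrays
    # that paddle.save can serialize as a .pdparams file.
    state = torch.load(torch_path, map_location="cpu")
    converted = {}
    for name, tensor in state.items():
        array = tensor.cpu().numpy()
        if name.endswith(TRANSPOSED_SUFFIXES):
            array = array.transpose()
        converted[NAME_MAP.get(name, name)] = array
    paddle.save(converted, paddle_path)


if __name__ == "__main__":
    src = sys.argv[1]  # e.g. xxx-gpt.bin
    convert(src, src.rsplit(".", 1)[0] + ".pdparams")
```

Depending on how the Paddle model lays out its attention projections, the fused `c_attn` weight may also need to be split into separate q/k/v matrices; `converter.py` in this repository is the authoritative reference for those details.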
"gpt2-small-en": gpt_vocab_link, @@ -345,6 +347,8 @@ class GPTTokenizer(PretrainedTokenizer): "merges_file": { "gpt3-13B-en": gpt_merges_link, "gpt3-1.3B-en": gpt_merges_link, + "gpt2-xl-en": gpt_merges_link, + "gpt2-large-en": gpt_merges_link, "gpt2-medium-en": gpt_merges_link, "gpt2-en": gpt_merges_link, "gpt2-small-en": gpt_merges_link, @@ -353,6 +357,8 @@ class GPTTokenizer(PretrainedTokenizer): pretrained_init_configuration = { "gpt3-13B-en": {}, "gpt3-1.3B-en": {}, + "gpt2-xl-en": {}, + "gpt2-large-en": {}, "gpt2-medium-en": {}, "gpt2-en": {}, "gpt2-small-en": {},