Update general distill in ppminilm #1520

Merged
2 changes: 1 addition & 1 deletion examples/model_compression/pp-minilm/README.md
@@ -81,7 +81,7 @@ The PP-MiniLM compression scheme is built around task-agnostic knowledge distillation (Task-a
│ └── run_clue.sh # fine-tuning launch script for CLUE
│ └── run_one_search.sh # fine-tuning script for a single dataset
│ └── run_all_search.sh # fine-tuning script for the CLUE datasets
│ └── export_model.sh # script for exporting the fine-tuned deployment model
│ └── export_model.py # script for exporting the fine-tuned deployment model
├── pruning # pruning and distillation directory
│ └── prune.py # pruning and distillation script
│ └── prune.sh # pruning and distillation launch script
@@ -29,9 +29,9 @@ cd ..

The arguments of `general_distill.py` are described below:

- `model_type` indicates the student model type; currently only 'ernie' and 'roberta' are supported.
- `model_type` indicates the student model type; currently only 'ppminilm' and 'roberta' are supported.
- `num_relation_heads` the number of relation heads, typically 64 for a large-size teacher model and 48 for a base-size teacher model.
- `teacher_model_type` indicates the teacher model type; currently only 'ernie' and 'roberta' are supported.
- `teacher_model_type` indicates the teacher model type; currently only 'roberta' is supported.
- `teacher_layer_index` the layer of the teacher model used during distillation.
- `student_layer_index` the layer of the student model used during distillation.
- `teacher_model_name_or_path` the name of the teacher model, e.g. `'roberta-wwm-ext-large'`.
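
For orientation, the renamed flag values resolve to the following PaddleNLP classes inside `general_distill.py`. This is an illustrative sketch (not part of the diff) that mirrors the `MODEL_CLASSES` lookup shown in the next hunk; the variable names and the teacher checkpoint are taken from the examples above:

```python
# Illustrative sketch: what --model_type ppminilm / --teacher_model_type roberta
# select after this PR (mirrors MODEL_CLASSES in general_distill.py below).
from paddlenlp.transformers import (PPMiniLMForSequenceClassification,
                                    PPMiniLMTokenizer, RobertaModel, RobertaTokenizer)

MODEL_CLASSES = {
    "roberta": (RobertaModel, RobertaTokenizer),
    "ppminilm": (PPMiniLMForSequenceClassification, PPMiniLMTokenizer),
}

student_class, _ = MODEL_CLASSES["ppminilm"]          # student is built from scratch
teacher_class, teacher_tokenizer_class = MODEL_CLASSES["roberta"]

# The teacher and its tokenizer are loaded from a pretrained checkpoint,
# e.g. 'roberta-wwm-ext-large' as noted above.
teacher = teacher_class.from_pretrained('roberta-wwm-ext-large')
tokenizer = teacher_tokenizer_class.from_pretrained('roberta-wwm-ext-large')
```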
@@ -32,12 +32,12 @@
from paddlenlp.utils.tools import TimeCostAverage
from paddlenlp.transformers import LinearDecayWithWarmup
from paddlenlp.transformers import RobertaModel, RobertaTokenizer
from paddlenlp.transformers import ErnieModel, ErnieForSequenceClassification, ErnieTokenizer
from paddlenlp.transformers import PPMiniLMModel, PPMiniLMForSequenceClassification, PPMiniLMTokenizer
from paddlenlp.transformers.distill_utils import to_distill, calc_multi_relation_loss

MODEL_CLASSES = {
    "roberta": (RobertaModel, RobertaTokenizer),
    "ernie": (ErnieForSequenceClassification, ErnieTokenizer)
    "ppminilm": (PPMiniLMForSequenceClassification, PPMiniLMTokenizer)
}


@@ -47,14 +47,14 @@ def parse_args():
    # Required parameters
    parser.add_argument(
        "--model_type",
        default="ernie",
        default="ppminilm",
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()), )
    parser.add_argument(
        "--teacher_model_type",
        default="ernie",
        default="roberta",
        type=str,
        required=True,
        help="Model type selected in the list: " +
@@ -276,28 +276,28 @@ def do_train(args):
    # For student
    model_class, _ = MODEL_CLASSES[args.model_type]
    if args.num_layers == 6:
        ernie = ErnieModel(
        ppminilm = PPMiniLMModel(
            vocab_size=tokenizer.vocab_size,
            num_hidden_layers=6,
            hidden_act='relu',
            intermediate_size=3072,
            hidden_size=768)  # layer: 6
    elif args.num_layers == 4:
        ernie = ErnieModel(
        ppminilm = PPMiniLMModel(
            vocab_size=tokenizer.vocab_size,
            num_hidden_layers=4,
            hidden_act='relu',
            intermediate_size=1024,
            hidden_size=256,
            num_attention_heads=16)  # layer: 4
    else:
        ernie = ErnieModel(
        ppminilm = PPMiniLMModel(
            vocab_size=tokenizer.vocab_size,
            num_hidden_layers=2,
            hidden_act='relu',
            hidden_size=128,
            intermediate_size=512)  # layer: 2
    student = model_class(ernie)
    student = model_class(ppminilm)

    teacher = teacher_model_class.from_pretrained(
        args.teacher_model_name_or_path)
@@ -47,7 +47,7 @@ cp ../../../../paddlenlp/transformers/distill_utils.py ${output_dir}/


python3 -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" general_distill.py \
    --model_type ernie \
    --model_type ppminilm \
    --num_relation_heads ${numH} \
    --teacher_model_type ${teacher} \
    --teacher_layer_index ${teacher_layer_index} \
6 changes: 3 additions & 3 deletions paddlenlp/transformers/distill_utils.py
@@ -21,7 +21,7 @@
from paddle.fluid.data_feeder import convert_dtype

from paddlenlp.utils.log import logger
from paddlenlp.transformers import ErnieForSequenceClassification
from paddlenlp.transformers import PPMiniLMForSequenceClassification
from paddlenlp.transformers import TinyBertForPretraining
from paddlenlp.transformers import BertForSequenceClassification

@@ -208,15 +208,15 @@ def to_distill(self,
    if return_qkv:
        # forward function of student class should be replaced for distributed training.
        TinyBertForPretraining._forward = minilm_pretraining_forward
        ErnieForSequenceClassification._forward = minilm_pretraining_forward
        PPMiniLMForSequenceClassification._forward = minilm_pretraining_forward
    else:
        TinyBertForPretraining._forward = tinybert_forward

    def init_func(layer):
        if isinstance(layer, (MultiHeadAttention, TransformerEncoderLayer,
                              TransformerEncoder, TinyBertForPretraining,
                              BertForSequenceClassification,
                              ErnieForSequenceClassification)):
                              PPMiniLMForSequenceClassification)):
            layer.forward = layer._forward
            if isinstance(layer, TransformerEncoder):
                layer.return_layer_outputs = return_layer_outputs
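
As background on the `distill_utils.py` change: `to_distill` swaps in the `_forward` variants so a model's forward pass also exposes the q/k/v tensors consumed by the MiniLM relation loss, and `PPMiniLMForSequenceClassification` now replaces `ErnieForSequenceClassification` as the student class that gets patched. Below is a rough usage sketch, assuming `to_distill` returns the patched model and showing only the `return_qkv` switch visible in this diff; the student configuration mirrors the 6-layer branch of `do_train` above:

```python
# Rough sketch (not part of this PR): wiring a teacher and a PP-MiniLM student
# for MiniLM-style general distillation with the classes touched by this change.
from paddlenlp.transformers import (PPMiniLMModel, PPMiniLMForSequenceClassification,
                                    RobertaModel, RobertaTokenizer)
from paddlenlp.transformers.distill_utils import to_distill

tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext-large')

# 6-layer student, built from scratch as in the do_train hunk above.
student = PPMiniLMForSequenceClassification(
    PPMiniLMModel(vocab_size=tokenizer.vocab_size, num_hidden_layers=6,
                  hidden_act='relu', intermediate_size=3072, hidden_size=768))
teacher = RobertaModel.from_pretrained('roberta-wwm-ext-large')

# Patch both models so their forward passes also return q/k/v for the relation loss.
student = to_distill(student, return_qkv=True)
teacher = to_distill(teacher, return_qkv=True)
```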