diff --git a/applications/text_classification/hierarchical/few-shot/README.md b/applications/text_classification/hierarchical/few-shot/README.md index 347e522164f8..c1311eda6d1e 100644 --- a/applications/text_classification/hierarchical/few-shot/README.md +++ b/applications/text_classification/hierarchical/few-shot/README.md @@ -65,9 +65,9 @@ 内存: 630 GB -3. PaddlePaddle 版本:2.3.1 +3. PaddlePaddle 版本:2.4rc -4. PaddleNLP 版本:2.3.5 (develop) +4. PaddleNLP 版本:2.4.3 5. 评估设置 @@ -91,7 +91,7 @@ | model_name | 训练方式 | Micro F1分数 | Macro F1分数 | | ---------- | ------- | ----------- | ----------- | | ernie-3.0-base-zh | 微调学习 | 0.7172 | 0.3821 | - | ernie-3.0-base-zh | 提示学习 | 0.8855 | 0.8443 | + | ernie-3.0-base-zh | 提示学习 | 0.8945 | 0.8516 | @@ -102,10 +102,10 @@ ### 3.1 运行环境 -- python >= 3.6 -- paddlepaddle > 2.3(2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)) -- paddlenlp >= 2.3.5 -- paddle2onnx >= 1.0.0rc3 +- python >= 3.7 +- paddlepaddle >= 2.4rc +- paddlenlp >= 2.4.3 +- paddle2onnx >= 1.0.3 ### 3.2 代码结构 @@ -222,12 +222,12 @@ python train.py \ --do_export \ --num_train_epochs 100 \ --logging_steps 5 \ +--save_total_limit 1 \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ --metric_for_best_model macro_f1_score \ --load_best_model_at_end \ ---evaluation_strategy epoch \ ---save_strategy epoch +--eval_steps 100 ``` **多卡训练** @@ -247,12 +247,12 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ --do_export \ --num_train_epochs 100 \ --logging_steps 5 \ +--save_total_limit 1 \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ --metric_for_best_model macro_f1_score \ --load_best_model_at_end \ ---evaluation_strategy epoch \ ---save_strategy epoch +--eval_steps 100 ``` 可配置参数说明: @@ -273,6 +273,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ - `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。 - `num_train_epochs`: 训练的最大轮数。 - `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。 +- `save_total_limit`: 模型检查点保存数量。 - `device`: 使用的设备,默认为`gpu`。 - `eval_steps`: 评估模型的间隔步数。 - `logging_steps`: 打印日志的间隔步数。 @@ -352,9 +353,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data - 可配置参数说明: - `model_path_prefix`: 导出的静态图模型路径及文件前缀。 -- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 +- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 - `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。 -- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 +- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 - `batch_size`: 每次预测的样本数量。 - `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。 - `device_id`: 指定GPU设备ID。 diff --git a/applications/text_classification/hierarchical/few-shot/infer.py b/applications/text_classification/hierarchical/few-shot/infer.py index 7442641d502f..5fc92a9dd933 100644 --- a/applications/text_classification/hierarchical/few-shot/infer.py +++ b/applications/text_classification/hierarchical/few-shot/infer.py @@ -14,23 +14,24 @@ import os import six +import json import psutil import argparse import numpy as np from paddlenlp.utils.log import logger -from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample -from paddlenlp.transformers import AutoTokenizer +from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding +from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM import paddle2onnx import onnxruntime as ort # yapf: disable parser = 
argparse.ArgumentParser() parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.") -parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.") +parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.") parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.") -parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") +parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.") parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.") parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.") @@ -103,12 +104,6 @@ def __init__(self, 'device_id': device_id }]) - self.input_handles = [ - self.predictor.get_inputs()[0].name, - self.predictor.get_inputs()[1].name, - self.predictor.get_inputs()[2].name - ] - if device == "gpu": try: assert 'CUDAExecutionProvider' in self.predictor.get_providers() @@ -122,27 +117,52 @@ def __init__(self, logger.info(">>> [InferBackend] Engine Created ...") def infer(self, input_dict: dict): - input_dict = { - k: v - for k, v in input_dict.items() if k in self.input_handles - } result = self.predictor.run(None, input_dict) return result class HierachicalPredictor(object): - def __init__(self, args, label_list): - self._label_list = label_list - self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - self._max_seq_length = args.max_seq_length - self._batch_size = args.batch_size - self.inference_backend = InferBackend(args.model_path_prefix, - args.device, args.device_id, - args.use_fp16, args.num_threads) - self._template = AutoTemplate.load_from( - os.path.dirname(args.model_path_prefix), self._tokenizer, - args.max_seq_length) + def __init__(self, args): + self.args = args + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.model = AutoModelForMaskedLM.from_pretrained(args.model_name) + self.template, self.labels, self.input_handles = self.post_init() + self.collate_fn = PromptDataCollatorWithPadding( + self.tokenizer, + padding=True, + return_tensors="np", + return_attention_mask=True) + + self.inference_backend = InferBackend(self.args.model_path_prefix, + self.args.device, + self.args.device_id, + self.args.use_fp16, + self.args.num_threads) + + def post_init(self): + export_path = os.path.dirname(self.args.model_path_prefix) + template_path = os.path.join(export_path, "template_config.json") + with open(template_path, "r") as fp: + prompt = json.load(fp) + template = AutoTemplate.create_from(prompt, self.tokenizer, + self.args.max_length, + self.model) + keywords = template.extract_template_keywords(template.prompt) + inputs = [ + "input_ids", "token_type_ids", "position_ids", "attention_mask", + "masked_positions" + ] + if "soft" in keywords: + inputs.append("soft_token_ids") + if "encoder" in keywords: + inputs.append("encoder_ids") + verbalizer_path = os.path.join(export_path, "verbalizer_config.json") + with open(verbalizer_path, "r") as fp: + label_words = json.load(fp) + labels = 
sorted(list(label_words.keys())) + + return template, labels, inputs def predict(self, input_data: list): encoded_inputs = self.preprocess(input_data) @@ -155,14 +175,27 @@ def _infer(self, input_dict): infer_data = self.inference_backend.infer(input_dict) return infer_data - def infer_batch(self, encoded_inputs): - num_sample = len(encoded_inputs["input_ids"]) + def infer_batch(self, inputs): + num_sample = len(inputs) infer_data = None num_infer_data = None - for idx in range(0, num_sample, self._batch_size): - l, r = idx, idx + self._batch_size - keys = encoded_inputs.keys() - input_dict = {k: encoded_inputs[k][l:r] for k in keys} + for index in range(0, num_sample, self.args.batch_size): + left, right = index, index + self.args.batch_size + batch_dict = self.collate_fn(inputs[left:right]) + input_dict = {} + for key in self.input_handles: + value = batch_dict[key] + if key == "attention_mask": + if value.ndim == 2: + value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4 + elif value.ndim != 4: + raise ValueError( + "Expect attention mask with ndim=2 or 4, but get ndim={}" + .format(value.ndim)) + value = value.astype("float32") + else: + value = value.astype("int64") + input_dict[key] = value results = self._infer(input_dict) if infer_data is None: infer_data = [[x] for x in results] @@ -175,16 +208,8 @@ def infer_batch(self, encoded_inputs): return infer_data def preprocess(self, input_data: list): - text = [InputExample(text_a=x) for x in input_data] - inputs = [self._template.wrap_one_example(x) for x in text] - inputs = { - "input_ids": - np.array([x["input_ids"] for x in inputs], dtype="int64"), - "mask_ids": - np.array([x["mask_ids"] for x in inputs], dtype="int64"), - "soft_token_ids": - np.array([x["soft_token_ids"] for x in inputs], dtype="int64") - } + text = [{"text_a": x} for x in input_data] + inputs = [self.template(x) for x in text] return inputs @staticmethod @@ -197,7 +222,7 @@ def postprocess(self, infer_data): label_ids = np.argwhere(probs > threshold) labels = [[] for _ in range(probs.shape[0])] for idx, label_id in label_ids: - labels[idx].append(self._label_list[label_id]) + labels[idx].append(self.labels[label_id]) return {"label": labels} def printer(self, result, input_data): @@ -212,12 +237,10 @@ def printer(self, result, input_data): for arg_name, arg_value in vars(args).items(): logger.info("{:20}: {}".format(arg_name, arg_value)) - export_path = os.path.dirname(args.model_path_prefix) - labels, _ = Verbalizer.load_from(export_path) + predictor = HierachicalPredictor(args) text_dir = os.path.join(args.data_dir, "data.txt") with open(text_dir, "r", encoding="utf-8") as f: text_list = [x.strip() for x in f.readlines()] - predictor = HierachicalPredictor(args, labels) predictor.predict(text_list) diff --git a/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt b/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt new file mode 100644 index 000000000000..bbe76e363f00 --- /dev/null +++ b/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt @@ -0,0 +1,5 @@ +psutil +paddlepaddle>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime diff --git a/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt b/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt new file mode 100644 index 000000000000..66454bd8b6b5 --- /dev/null +++ b/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt @@ -0,0 +1,7 @@ +psutil 
+paddlepaddle-gpu>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime-gpu +onnx +onnxconverter-common diff --git a/applications/text_classification/hierarchical/few-shot/train.py b/applications/text_classification/hierarchical/few-shot/train.py index c7c0a2daa38a..aac82bff9ab5 100644 --- a/applications/text_classification/hierarchical/few-shot/train.py +++ b/applications/text_classification/hierarchical/few-shot/train.py @@ -15,6 +15,7 @@ from dataclasses import dataclass, field import os import sys +from collections import defaultdict import paddle import paddle.nn.functional as F @@ -41,9 +42,6 @@ class DataArguments: data_dir: str = field(default="./data", metadata={"help": "The dataset dictionary includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional) files."}) prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."}) - soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."}) - encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."}) - @dataclass class ModelArguments: @@ -67,17 +65,20 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) # Define the template for preprocess and the verbalizer for postprocess. - template = AutoTemplate.create_from( - data_args.prompt, - tokenizer, - training_args.max_seq_length, - model=model, - prompt_encoder=data_args.soft_encoder, - encoder_hidden_size=data_args.encoder_hidden_size) - logger.info("Using template: {}".format(template.template)) + template = AutoTemplate.create_from(data_args.prompt, + tokenizer, + training_args.max_seq_length, + model=model) + logger.info("Using template: {}".format(template.prompt)) label_file = os.path.join(data_args.data_dir, "label.txt") - verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file) + with open(label_file, "r", encoding="utf-8") as fp: + label_words = defaultdict(list) + for line in fp: + data = line.strip().split("==") + word = data[1] if len(data) > 1 else data[0].split("##")[-1] + label_words[data[0]].append(word) + verbalizer = SoftVerbalizer(label_words, tokenizer, model) # Load the few-shot datasets. train_ds, dev_ds, test_ds = load_local_dataset( @@ -139,11 +140,24 @@ def compute_metrics(eval_preds): # Export static model. 
if training_args.do_export: + template = prompt_model.template + template_keywords = template.extract_template_keywords(template.prompt) input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids - InputSpec(shape=[None, None], dtype="int64"), # mask_ids - InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids + InputSpec(shape=[None, None], dtype="int64"), # input_ids, + InputSpec(shape=[None, None], dtype="int64"), # token_type_ids + InputSpec(shape=[None, None], dtype="int64"), # position_ids + InputSpec(shape=[None, None, None, None], + dtype="float32") # attention_mask ] + if "mask" in template_keywords: + input_spec.append(InputSpec(shape=[None], + dtype="int64")) # masked_positions + if "soft" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # soft_token_ids + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # encoder_ids export_path = os.path.join(training_args.output_dir, 'export') trainer.export_model(export_path, input_spec=input_spec, diff --git a/applications/text_classification/hierarchical/few-shot/utils.py b/applications/text_classification/hierarchical/few-shot/utils.py index 4880432310f3..e29aaa9db61d 100644 --- a/applications/text_classification/hierarchical/few-shot/utils.py +++ b/applications/text_classification/hierarchical/few-shot/utils.py @@ -15,7 +15,6 @@ import os from paddlenlp.datasets import load_dataset -from paddlenlp.prompt import InputExample def load_local_dataset(data_path, splits, label_list): @@ -39,14 +38,14 @@ def _reader(data_file, label_list): for idx, line in enumerate(fp): data = line.strip().split("\t") if len(data) == 1: - yield InputExample(text_a=data[0]) + yield {"text_a": data[0]} else: text, label = data label = label.strip().split(",") label = [ float(1) if x in label else float(0) for x in label_list ] - yield InputExample(text_a=text, labels=label) + yield {"text_a": text, "labels": label} split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"} datasets = [] diff --git a/applications/text_classification/multi_class/few-shot/README.md b/applications/text_classification/multi_class/few-shot/README.md index 3f6e5f759bbf..45f9b009bf34 100644 --- a/applications/text_classification/multi_class/few-shot/README.md +++ b/applications/text_classification/multi_class/few-shot/README.md @@ -65,9 +65,9 @@ 内存: 630 GB -3. PaddlePaddle 版本:2.3.1 +3. PaddlePaddle 版本:2.4rc -4. PaddleNLP 版本:2.3.5 (develop) +4. PaddleNLP 版本:2.4.3 5. 
评估设置 @@ -82,7 +82,7 @@ python train.py --dataset_dir "./data/" --save_dir "./checkpoints" --max_seq_len - 提示学习 ``` -python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条新闻写的是" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model accuracy --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch +python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条新闻写的是" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model accuracy --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch --save_total_limit 1 ``` 6. 精度评价指标:Accuracy @@ -102,10 +102,10 @@ python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条 ### 3.1 运行环境 -- python >= 3.6 -- paddlepaddle > 2.3 (2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)) -- paddlenlp >= 2.3.5 -- paddle2onnx >= 1.0.0rc3 +- python >= 3.7 +- paddlepaddle >= 2.4rc +- paddlenlp >= 2.4.3 +- paddle2onnx >= 1.0.3 ### 3.2 代码结构 @@ -204,14 +204,15 @@ python train.py \ --output_dir ./checkpoints/ \ --prompt "这条新闻标题的主题是" \ --max_seq_length 128 \ ---learning_rate 3e-5 \ ---ppt_learning_rate 3e-4 \ +--learning_rate 3e-6 \ +--ppt_learning_rate 3e-5 \ --do_train \ --do_eval \ --use_rdrop \ --max_steps 1000 \ --eval_steps 10 \ --logging_steps 5 \ +--save_total_limit 1 \ --load_best_model_at_end True \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ @@ -227,8 +228,8 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ --output_dir ./checkpoints/ \ --prompt "这条新闻标题的主题是" \ --max_seq_length 128 \ ---learning_rate 3e-5 \ ---ppt_learning_rate 3e-4 \ +--learning_rate 3e-6 \ +--ppt_learning_rate 3e-5 \ --do_train \ --do_eval \ --use_rdrop \ @@ -236,6 +237,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ --max_steps 1000 \ --eval_steps 10 \ --logging_steps 5 \ +--save_total_limit 1 \ --load_best_model_at_end True \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ @@ -260,6 +262,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ - `do_predict`: 是否进行预测。 - `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。 - `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。 +- `save_total_limit`: 模型检查点保存数量。 - `eval_steps`: 评估模型的间隔步数。 - `device`: 使用的设备,默认为`gpu`。 - `logging_steps`: 打印日志的间隔步数。 @@ -335,9 +338,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data - 可配置参数说明: - `model_path_prefix`: 导出的静态图模型路径及文件前缀。 -- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 +- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 - `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。 -- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 +- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 - `batch_size`: 每次预测的样本数量。 - `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。 - `device_id`: 指定GPU设备ID。 diff --git a/applications/text_classification/multi_class/few-shot/infer.py b/applications/text_classification/multi_class/few-shot/infer.py index 
142a20d1ed76..03715bc29d0e 100644 --- a/applications/text_classification/multi_class/few-shot/infer.py +++ b/applications/text_classification/multi_class/few-shot/infer.py @@ -14,23 +14,24 @@ import os import six +import json import psutil import argparse import numpy as np from paddlenlp.utils.log import logger -from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample -from paddlenlp.transformers import AutoTokenizer +from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding +from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM import paddle2onnx import onnxruntime as ort # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.") -parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.") +parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.") parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.") -parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") +parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.") parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.") parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.") @@ -103,11 +104,6 @@ def __init__(self, 'device_id': device_id }]) - self.input_handles = [ - self.predictor.get_inputs()[0].name, - self.predictor.get_inputs()[1].name, - self.predictor.get_inputs()[2].name - ] if device == "gpu": try: @@ -122,27 +118,53 @@ def __init__(self, logger.info(">>> [InferBackend] Engine Created ...") def infer(self, input_dict: dict): - input_dict = { - k: v - for k, v in input_dict.items() if k in self.input_handles - } result = self.predictor.run(None, input_dict) return result class MultiClassPredictor(object): - def __init__(self, args, label_list): - self._label_list = label_list - self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - self._max_seq_length = args.max_seq_length - self._batch_size = args.batch_size - self.inference_backend = InferBackend(args.model_path_prefix, - args.device, args.device_id, - args.use_fp16, args.num_threads) - self._template = AutoTemplate.load_from( - os.path.dirname(args.model_path_prefix), self._tokenizer, - args.max_seq_length) + def __init__(self, args): + self.args = args + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.model = AutoModelForMaskedLM.from_pretrained(args.model_name) + self.template, self.labels, self.input_handles = self.post_init() + self.collate_fn = PromptDataCollatorWithPadding( + self.tokenizer, + padding=True, + return_tensors="np", + return_attention_mask=True) + + self.inference_backend = InferBackend(self.args.model_path_prefix, + self.args.device, + self.args.device_id, + self.args.use_fp16, + self.args.num_threads) + + def post_init(self): + export_path = os.path.dirname(self.args.model_path_prefix) + template_path = os.path.join(export_path, "template_config.json") + with open(template_path, 
"r") as fp: + prompt = json.load(fp) + template = AutoTemplate.create_from(prompt, self.tokenizer, + self.args.max_length, + self.model) + keywords = template.extract_template_keywords(template.prompt) + inputs = [ + "input_ids", "token_type_ids", "position_ids", "attention_mask" + ] + if "mask" in keywords: + inputs.append("masked_positions") + if "soft" in keywords: + inputs.append("soft_token_ids") + if "encoder" in keywords: + inputs.append("encoder_ids") + verbalizer_path = os.path.join(export_path, "verbalizer_config.json") + with open(verbalizer_path, "r") as fp: + label_words = json.load(fp) + labels = sorted(list(label_words.keys())) + + return template, labels, inputs def predict(self, input_data: list): encoded_inputs = self.preprocess(input_data) @@ -155,14 +177,27 @@ def _infer(self, input_dict): infer_data = self.inference_backend.infer(input_dict) return infer_data - def infer_batch(self, encoded_inputs): - num_sample = len(encoded_inputs["input_ids"]) + def infer_batch(self, inputs): + num_sample = len(inputs) infer_data = None num_infer_data = None - for idx in range(0, num_sample, self._batch_size): - l, r = idx, idx + self._batch_size - keys = encoded_inputs.keys() - input_dict = {k: encoded_inputs[k][l:r] for k in keys} + for index in range(0, num_sample, self.args.batch_size): + left, right = index, index + self.args.batch_size + batch_dict = self.collate_fn(inputs[left:right]) + input_dict = {} + for key in self.input_handles: + value = batch_dict[key] + if key == "attention_mask": + if value.ndim == 2: + value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4 + elif value.ndim != 4: + raise ValueError( + "Expect attention mask with ndim=2 or 4, but get ndim={}" + .format(value.ndim)) + value = value.astype("float32") + else: + value = value.astype("int64") + input_dict[key] = value results = self._infer(input_dict) if infer_data is None: infer_data = [[x] for x in results] @@ -175,21 +210,13 @@ def infer_batch(self, encoded_inputs): return infer_data def preprocess(self, input_data: list): - text = [InputExample(text_a=x) for x in input_data] - inputs = [self._template.wrap_one_example(x) for x in text] - inputs = { - "input_ids": - np.array([x["input_ids"] for x in inputs], dtype="int64"), - "mask_ids": - np.array([x["mask_ids"] for x in inputs], dtype="int64"), - "soft_token_ids": - np.array([x["soft_token_ids"] for x in inputs], dtype="int64") - } + text = [{"text_a": x} for x in input_data] + inputs = [self.template(x) for x in text] return inputs def postprocess(self, infer_data): preds = np.argmax(infer_data[0], axis=-1) - labels = [self._label_list[x] for x in preds] + labels = [self.labels[x] for x in preds] return {"label": labels} def printer(self, result, input_data): @@ -204,12 +231,10 @@ def printer(self, result, input_data): for arg_name, arg_value in vars(args).items(): logger.info("{:20}: {}".format(arg_name, arg_value)) - export_path = os.path.dirname(args.model_path_prefix) - labels, _ = Verbalizer.load_from(export_path) + predictor = MultiClassPredictor(args) text_dir = os.path.join(args.data_dir, "data.txt") with open(text_dir, "r", encoding="utf-8") as f: text_list = [x.strip() for x in f.readlines()] - predictor = MultiClassPredictor(args, labels) predictor.predict(text_list) diff --git a/applications/text_classification/multi_class/few-shot/requirements_cpu.txt b/applications/text_classification/multi_class/few-shot/requirements_cpu.txt new file mode 100644 index 000000000000..bbe76e363f00 --- /dev/null +++ 
b/applications/text_classification/multi_class/few-shot/requirements_cpu.txt @@ -0,0 +1,5 @@ +psutil +paddlepaddle>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime diff --git a/applications/text_classification/multi_class/few-shot/requirements_gpu.txt b/applications/text_classification/multi_class/few-shot/requirements_gpu.txt new file mode 100644 index 000000000000..66454bd8b6b5 --- /dev/null +++ b/applications/text_classification/multi_class/few-shot/requirements_gpu.txt @@ -0,0 +1,7 @@ +psutil +paddlepaddle-gpu>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime-gpu +onnx +onnxconverter-common diff --git a/applications/text_classification/multi_class/few-shot/train.py b/applications/text_classification/multi_class/few-shot/train.py index a07d27ea5879..0e67750deb23 100644 --- a/applications/text_classification/multi_class/few-shot/train.py +++ b/applications/text_classification/multi_class/few-shot/train.py @@ -14,6 +14,9 @@ from dataclasses import dataclass, field import os +from collections import defaultdict + +import numpy as np import paddle from paddle.static import InputSpec @@ -37,8 +40,6 @@ class DataArguments: data_dir: str = field(default="./data/", metadata={"help": "Path to a dataset which includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional)."}) prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."}) - soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."}) - encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."}) @dataclass @@ -63,17 +64,20 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) # Define the template for preprocess and the verbalizer for postprocess. - template = AutoTemplate.create_from( - data_args.prompt, - tokenizer, - training_args.max_seq_length, - model=model, - prompt_encoder=data_args.soft_encoder, - encoder_hidden_size=data_args.encoder_hidden_size) - logger.info("Using template: {}".format(template.template)) + template = AutoTemplate.create_from(data_args.prompt, + tokenizer, + training_args.max_seq_length, + model=model) + logger.info("Using template: {}".format(template.prompt)) label_file = os.path.join(data_args.data_dir, "label.txt") - verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file) + with open(label_file, "r", encoding="utf-8") as fp: + label_words = defaultdict(list) + for line in fp: + data = line.strip().split("==") + word = data[1] if len(data) > 1 else data[0].split("##")[-1] + label_words[data[0]].append(word) + verbalizer = SoftVerbalizer(label_words, tokenizer, model) # Load the few-shot datasets. train_ds, dev_ds, test_ds = load_local_dataset( @@ -133,11 +137,24 @@ def compute_metrics(eval_preds): # Export static model. 
if training_args.do_export: + template = prompt_model.template + template_keywords = template.extract_template_keywords(template.prompt) input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids - InputSpec(shape=[None, None], dtype="int64"), # mask_ids - InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids + InputSpec(shape=[None, None], dtype="int64"), # input_ids, + InputSpec(shape=[None, None], dtype="int64"), # token_type_ids + InputSpec(shape=[None, None], dtype="int64"), # position_ids + InputSpec(shape=[None, None, None, None], + dtype="float32") # attention_mask ] + if "mask" in template_keywords: + input_spec.append(InputSpec(shape=[None], + dtype="int64")) # masked_positions + if "soft" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # soft_token_ids + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # encoder_ids export_path = os.path.join(training_args.output_dir, 'export') trainer.export_model(export_path, input_spec=input_spec, diff --git a/applications/text_classification/multi_class/few-shot/utils.py b/applications/text_classification/multi_class/few-shot/utils.py index 907b640ea96b..b440c9627917 100644 --- a/applications/text_classification/multi_class/few-shot/utils.py +++ b/applications/text_classification/multi_class/few-shot/utils.py @@ -15,7 +15,6 @@ import os from paddlenlp.datasets import load_dataset -from paddlenlp.prompt import InputExample def load_local_dataset(data_path, splits, label_list): @@ -38,10 +37,10 @@ def _reader(data_file, label_list): for idx, line in enumerate(fp): data = line.strip().split("\t") if len(data) == 1: - yield InputExample(text_a=data[0]) + yield {"text_a": data[0]} else: text, label = data - yield InputExample(text_a=text, labels=label_list[label]) + yield {"text_a": text, "labels": label_list[label]} assert isinstance(splits, list) and len(splits) > 0 diff --git a/applications/text_classification/multi_label/few-shot/README.md b/applications/text_classification/multi_label/few-shot/README.md index d7ee94ff68f2..4bb3bbf51fb5 100644 --- a/applications/text_classification/multi_label/few-shot/README.md +++ b/applications/text_classification/multi_label/few-shot/README.md @@ -70,9 +70,9 @@ 内存: 630 GB -3. PaddlePaddle 版本:2.3.1 +3. PaddlePaddle 版本:2.4rc -4. PaddleNLP 版本:2.3.5 (develop) +4. PaddleNLP 版本:2.4.3 5. 评估设置 @@ -88,7 +88,7 @@ - 提示学习 ``` - python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这句话包含的要素有" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model macro_f1_score --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch + python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这句话包含的要素有" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model macro_f1_score --load_best_model_at_end --eval_steps 100 --save_total_limit 1 ``` 6. 
精度评价指标:Micro F1分数、Macro F1分数 @@ -96,7 +96,7 @@ | model_name | 训练方式 | Micro F1分数 | Macro F1分数 | | ---------- | ------- | ----------- | ----------- | | ernie-3.0-base-zh | 微调学习 | 0.7419 | 0.5105 | - | ernie-3.0-base-zh | 提示学习 | 0.7838 | 0.6985 | + | ernie-3.0-base-zh | 提示学习 | 0.7839 | 0.6003 | ## 3.定制训练 @@ -106,10 +106,10 @@ ### 3.1 运行环境 -- python >= 3.6 -- paddlepaddle > 2.3(2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)) -- paddlenlp >= 2.3.5 -- paddle2onnx >= 1.0.0rc3 +- python >= 3.7 +- paddlepaddle >= 2.4rc +- paddlenlp >= 2.4.3 +- paddle2onnx >= 1.0.3 ### 3.2 代码结构 @@ -223,6 +223,7 @@ python train.py \ --do_export \ --num_train_epochs 100 \ --logging_steps 5 \ +--save_total_limit 1 \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ --metric_for_best_model macro_f1_score \ @@ -248,6 +249,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ --do_export \ --num_train_epochs 100 \ --logging_steps 5 \ +--save_total_limit 1 \ --per_device_eval_batch_size 32 \ --per_device_train_batch_size 8 \ --metric_for_best_model macro_f1_score \ @@ -274,6 +276,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \ - `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。 - `num_train_epochs`: 训练的最大轮数。 - `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。 +- `save_total_limit`: 模型检查点保存数量。 - `device`: 使用的设备,默认为`gpu`。 - `eval_steps`: 评估模型的间隔步数。 - `logging_steps`: 打印日志的间隔步数。 @@ -352,9 +355,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data - 可配置参数说明: - `model_path_prefix`: 导出的静态图模型路径及文件前缀。 -- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 +- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。 - `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。 -- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 +- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。 - `batch_size`: 每次预测的样本数量。 - `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。 - `device_id`: 指定GPU设备ID。 diff --git a/applications/text_classification/multi_label/few-shot/infer.py b/applications/text_classification/multi_label/few-shot/infer.py index 48d42d294e96..2014afaef56b 100644 --- a/applications/text_classification/multi_label/few-shot/infer.py +++ b/applications/text_classification/multi_label/few-shot/infer.py @@ -14,23 +14,24 @@ import os import six +import json import psutil import argparse import numpy as np from paddlenlp.utils.log import logger -from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample -from paddlenlp.transformers import AutoTokenizer +from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding +from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM import paddle2onnx import onnxruntime as ort # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.") -parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.") +parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.") parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.") -parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") 
+parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.") parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.") parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.") parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.") @@ -103,11 +104,6 @@ def __init__(self, 'device_id': device_id }]) - self.input_handles = [ - self.predictor.get_inputs()[0].name, - self.predictor.get_inputs()[1].name, - self.predictor.get_inputs()[2].name - ] if device == "gpu": try: @@ -122,27 +118,52 @@ def __init__(self, logger.info(">>> [InferBackend] Engine Created ...") def infer(self, input_dict: dict): - input_dict = { - k: v - for k, v in input_dict.items() if k in self.input_handles - } result = self.predictor.run(None, input_dict) return result class MultiLabelPredictor(object): - def __init__(self, args, label_list): - self._label_list = label_list - self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) - self._max_seq_length = args.max_seq_length - self._batch_size = args.batch_size - self.inference_backend = InferBackend(args.model_path_prefix, - args.device, args.device_id, - args.use_fp16, args.num_threads) - self._template = AutoTemplate.load_from( - os.path.dirname(args.model_path_prefix), self._tokenizer, - args.max_seq_length) + def __init__(self, args): + self.args = args + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.model = AutoModelForMaskedLM.from_pretrained(args.model_name) + self.template, self.labels, self.input_handles = self.post_init() + self.collate_fn = PromptDataCollatorWithPadding( + self.tokenizer, + padding=True, + return_tensors="np", + return_attention_mask=True) + + self.inference_backend = InferBackend(self.args.model_path_prefix, + self.args.device, + self.args.device_id, + self.args.use_fp16, + self.args.num_threads) + + def post_init(self): + export_path = os.path.dirname(self.args.model_path_prefix) + template_path = os.path.join(export_path, "template_config.json") + with open(template_path, "r") as fp: + prompt = json.load(fp) + template = AutoTemplate.create_from(prompt, self.tokenizer, + self.args.max_length, + self.model) + keywords = template.extract_template_keywords(template.prompt) + inputs = [ + "input_ids", "token_type_ids", "position_ids", "attention_mask", + "masked_positions" + ] + if "soft" in keywords: + inputs.append("soft_token_ids") + if "encoder" in keywords: + inputs.append("encoder_ids") + verbalizer_path = os.path.join(export_path, "verbalizer_config.json") + with open(verbalizer_path, "r") as fp: + label_words = json.load(fp) + labels = sorted(list(label_words.keys())) + + return template, labels, inputs def predict(self, input_data: list): encoded_inputs = self.preprocess(input_data) @@ -155,14 +176,27 @@ def _infer(self, input_dict): infer_data = self.inference_backend.infer(input_dict) return infer_data - def infer_batch(self, encoded_inputs): - num_sample = len(encoded_inputs["input_ids"]) + def infer_batch(self, inputs): + num_sample = len(inputs) infer_data = None num_infer_data = None - for idx in range(0, num_sample, self._batch_size): - l, r = idx, idx + self._batch_size - keys = encoded_inputs.keys() - input_dict = {k: encoded_inputs[k][l:r] for k in keys} + for index in range(0, num_sample, self.args.batch_size): + left, right = index, 
index + self.args.batch_size + batch_dict = self.collate_fn(inputs[left:right]) + input_dict = {} + for key in self.input_handles: + value = batch_dict[key] + if key == "attention_mask": + if value.ndim == 2: + value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4 + elif value.ndim != 4: + raise ValueError( + "Expect attention mask with ndim=2 or 4, but get ndim={}" + .format(value.ndim)) + value = value.astype("float32") + else: + value = value.astype("int64") + input_dict[key] = value results = self._infer(input_dict) if infer_data is None: infer_data = [[x] for x in results] @@ -175,16 +209,8 @@ def infer_batch(self, encoded_inputs): return infer_data def preprocess(self, input_data: list): - text = [InputExample(text_a=x) for x in input_data] - inputs = [self._template.wrap_one_example(x) for x in text] - inputs = { - "input_ids": - np.array([x["input_ids"] for x in inputs], dtype="int64"), - "mask_ids": - np.array([x["mask_ids"] for x in inputs], dtype="int64"), - "soft_token_ids": - np.array([x["soft_token_ids"] for x in inputs], dtype="int64") - } + text = [{"text_a": x} for x in input_data] + inputs = [self.template(x) for x in text] return inputs @staticmethod @@ -197,7 +223,7 @@ def postprocess(self, infer_data): label_ids = np.argwhere(probs > threshold) labels = [[] for _ in range(probs.shape[0])] for idx, label_id in label_ids: - labels[idx].append(self._label_list[label_id]) + labels[idx].append(self.labels[label_id]) return {"label": labels} def printer(self, result, input_data): @@ -212,12 +238,10 @@ def printer(self, result, input_data): for arg_name, arg_value in vars(args).items(): logger.info("{:20}: {}".format(arg_name, arg_value)) - export_path = os.path.dirname(args.model_path_prefix) - labels, _ = Verbalizer.load_from(export_path) + predictor = MultiLabelPredictor(args) text_dir = os.path.join(args.data_dir, "data.txt") with open(text_dir, "r", encoding="utf-8") as f: text_list = [x.strip() for x in f.readlines()] - predictor = MultiLabelPredictor(args, labels) predictor.predict(text_list) diff --git a/applications/text_classification/multi_label/few-shot/requirements_cpu.txt b/applications/text_classification/multi_label/few-shot/requirements_cpu.txt new file mode 100644 index 000000000000..bbe76e363f00 --- /dev/null +++ b/applications/text_classification/multi_label/few-shot/requirements_cpu.txt @@ -0,0 +1,5 @@ +psutil +paddlepaddle>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime diff --git a/applications/text_classification/multi_label/few-shot/requirements_gpu.txt b/applications/text_classification/multi_label/few-shot/requirements_gpu.txt new file mode 100644 index 000000000000..66454bd8b6b5 --- /dev/null +++ b/applications/text_classification/multi_label/few-shot/requirements_gpu.txt @@ -0,0 +1,7 @@ +psutil +paddlepaddle-gpu>=2.4rc +paddlenlp>=2.4.3 +paddle2onnx>=1.0.3 +onnxruntime-gpu +onnx +onnxconverter-common diff --git a/applications/text_classification/multi_label/few-shot/train.py b/applications/text_classification/multi_label/few-shot/train.py index 44f0c190c90f..6d12f6b3e5fd 100644 --- a/applications/text_classification/multi_label/few-shot/train.py +++ b/applications/text_classification/multi_label/few-shot/train.py @@ -15,6 +15,7 @@ from dataclasses import dataclass, field import os import sys +from collections import defaultdict import paddle import paddle.nn.functional as F @@ -41,8 +42,6 @@ class DataArguments: data_dir: str = field(default="./data", metadata={"help": "The dataset dictionary includes train.txt, dev.txt and label.txt 
files."}) prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."}) - soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."}) - encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."}) @dataclass @@ -67,17 +66,20 @@ def main(): tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) # Define the template for preprocess and the verbalizer for postprocess. - template = AutoTemplate.create_from( - data_args.prompt, - tokenizer, - training_args.max_seq_length, - model=model, - prompt_encoder=data_args.soft_encoder, - encoder_hidden_size=data_args.encoder_hidden_size) - logger.info("Using template: {}".format(template.template)) + template = AutoTemplate.create_from(data_args.prompt, + tokenizer, + training_args.max_seq_length, + model=model) + logger.info("Using template: {}".format(template.prompt)) label_file = os.path.join(data_args.data_dir, "label.txt") - verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file) + with open(label_file, "r", encoding="utf-8") as fp: + label_words = defaultdict(list) + for line in fp: + data = line.strip().split("==") + word = data[1] if len(data) > 1 else data[0].split("##")[-1] + label_words[data[0]].append(word) + verbalizer = SoftVerbalizer(label_words, tokenizer, model) # Load the few-shot datasets. train_ds, dev_ds, test_ds = load_local_dataset( @@ -139,11 +141,24 @@ def compute_metrics(eval_preds): # Export static model. if training_args.do_export: + template = prompt_model.template + template_keywords = template.extract_template_keywords(template.prompt) input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids - InputSpec(shape=[None, None], dtype="int64"), # mask_ids - InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids + InputSpec(shape=[None, None], dtype="int64"), # input_ids, + InputSpec(shape=[None, None], dtype="int64"), # token_type_ids + InputSpec(shape=[None, None], dtype="int64"), # position_ids + InputSpec(shape=[None, None, None, None], + dtype="float32") # attention_mask ] + if "mask" in template_keywords: + input_spec.append(InputSpec(shape=[None], + dtype="int64")) # masked_positions + if "soft" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # soft_token_ids + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], + dtype="int64")) # encoder_ids export_path = os.path.join(training_args.output_dir, 'export') trainer.export_model(export_path, input_spec=input_spec, diff --git a/applications/text_classification/multi_label/few-shot/utils.py b/applications/text_classification/multi_label/few-shot/utils.py index 6891c22829fc..4855e43e7bf3 100644 --- a/applications/text_classification/multi_label/few-shot/utils.py +++ b/applications/text_classification/multi_label/few-shot/utils.py @@ -15,7 +15,6 @@ import os from paddlenlp.datasets import load_dataset -from paddlenlp.prompt import InputExample def load_local_dataset(data_path, splits, label_list): @@ -39,14 +38,14 @@ def _reader(data_file, label_list): for idx, line in enumerate(fp): data = line.strip().split("\t") if len(data) == 1: - yield InputExample(text_a=data[0]) + yield {"text_a": data[0]} else: text, label = data label = label.strip().split(",") label = [ float(1) if x in label else float(0) for x in label_list ] - yield InputExample(text_a=text, labels=label) + yield {"text_a": text, "labels": label} split_map = 
{"train": "train.txt", "dev": "dev.txt", "test": "test.txt"} datasets = [] diff --git a/docs/advanced_guide/prompt.md b/docs/advanced_guide/prompt.md index e45aca61d4b6..c8b720261302 100644 --- a/docs/advanced_guide/prompt.md +++ b/docs/advanced_guide/prompt.md @@ -22,11 +22,11 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi * [如何定义模板](#如何定义模板) * [离散型模板](#离散型模板) * [连续型模板](#连续型模板) + * [前缀连续型模板](#前缀连续型模板) * [快速定义模板](#快速定义模板) * [如何定义标签词映射](#如何定义标签词映射) - * [单掩码映射](#单掩码映射) - * [多掩码映射](#多掩码映射) - * [标签词映射分类](#标签词映射分类) + * [离散型标签词映射](#离散型标签词映射) + * [连续型标签词映射](#连续型标签词映射) * [快速开始训练](#快速开始训练) * [数据准备](#数据准备) * [预训练参数准备](#预训练参数准备) @@ -39,18 +39,28 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi ## 如何定义模板 -**模板**(Template)的功能是在原有输入文本上增加提示语句,从而将原任务转化为 MLM 任务,可以分为离散型和连续型两种。Prompt API 中提供了统一的数据结构来构造不同类型的模板,输入相应格式的**字符串**,通过解析得到对应的输入模板,即字典构成的列表。 +**模板**(Template)的功能是在原有输入文本上增加提示语句,从而将原任务转化为 MLM 任务,可以分为离散型和连续型两种。Prompt API 中提供了统一的数据结构来构造不同类型的模板,输入相应格式的**字符串**,通过解析得到对应的输入模板。模板由不同字段构成,可任意组合。每个字段中的关键字定义了数据文本或者提示文本,即 `input_ids`,属性可定义该字段是否可截断,以及对应的 `position_ids`,`token_type_ids` 等。 ### 离散型模板 离散型模板 `ManualTemplate` 是直接将提示语句与原始输入文本拼接起来,二者的词向量矩阵共享,均为预训练模型学到的词向量矩阵。可用于实现 PET、RGL 等算法。 -**模板关键字** +**模板关键字及属性** -- ``text`` :数据集中原始输入文本对应的关键字,包括`text_a`和`text_b`。[数据准备](#数据准备)中介绍了如何将自定义数据集转化为统一格式。 -- ``hard`` :自定义的文本提示语句。 +- ``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`、`text_b` 和 `content`。 +- ``hard`` :自定义的提示语句文本。 - ``mask`` :待预测词的占位符。 -- ``sep`` :用于区分不同的句子。`sep`前后的句子对应不同的`token_type_id`。 + - ``length`` :定义 ``mask`` 的数量。 +- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。 +- ``options`` :数据集字典或者文件中的候选标签序列。 + - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。 + - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。 + +**模版通用属性** + +- `position`: 定义当前字段的起始 `position id`。 +- `token_type`: 定义当前字段及后续字段的 `token type id`。 +- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。 **模板定义** @@ -64,16 +74,25 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi “{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'} ``` +``` +{'options': './data/label.txt'}{'sep'}下边两句话间的逻辑关系是什么?{'text': 'text_a'}{'sep': None, 'token_type': 1}{'text': 'text_b'} +``` +其中 `label.txt` 为候选标签的本地文件路径,每行一个候选标签,例如 + +``` +中立 +蕴含 +矛盾 +``` + **样本示例** 例如,对于自然语言推理任务,给定样本 ```python -from paddlenlp.prompt import InputExample -sample = InputExample(uid=0, - text_a="心里有些生畏,又不知畏惧什么", - text_b="心里特别开心", - labels="矛盾") +sample = { + "text_a": "心里有些生畏,又不知畏惧什么", "text_b": "心里特别开心", "labels": "矛盾" +} ``` 按照模板修改拼接后,最终输入模型的文本数据为 @@ -90,17 +109,17 @@ from paddlenlp.prompt import ManualTemplate from paddlenlp.transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") -template = ManualTemplate(tokenizer=tokenizer, - max_seq_length=512, - template="“{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'}") -input_dict = template.wrap_one_example(sample) +template = ManualTemplate(prompt="“{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'}", + tokenizer=tokenizer, + max_length=512) +input_dict = template(sample) ``` 其中初始化参数定义如下 +- ``prompt`` :定义提示语句以及与输入文本组合方式的字符串。 - ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。 -- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。 -- ``template`` :定义提示语句以及与输入文本组合方式的字符串。 +- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。 **使用技巧** @@ -115,24 +134,42 @@ input_dict = template.wrap_one_example(sample) **模板关键字** -- ``text`` 
:数据集中原始输入文本对应的关键字,包括`text_a`和`text_b`。[数据准备](#数据准备)中介绍了如何将自定义数据集转化为统一格式。 +- ``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`和`text_b`。 - ``hard`` :自定义的文本提示语句。 - ``mask`` :待预测词的占位符。 -- ``sep`` :用于区分不同的句子。`sep`前后的句子对应不同的`token_type_id`。 -- ``soft`` 表示连续型提示。若值为 ``None`` ,则使用对应数量的随机初始化向量作为提示;若值为文本,则使用对应长度的连续性向量作为提示,并预训练词向量中文本对应的向量进行初始化。 +- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。 +- ``soft`` 表示连续型提示。若值为 ``None`` ,则随机初始化提示向量;若值为文本,则使用文本对应的预训练字向量初始化提示向量。 + - ``length`` :定义 ``soft token`` 的数量。若定义文本长度小于该值,超过部分随机初始化。 + - ``encoder`` :定义 `soft token` 的编码器类型,可选 `lstm`,`mlp`。默认为 `None`, 不使用编码器。 + - ``hidden_size`` :定义编码器的隐藏层维度。默认与预训练词向量维度相同。 +- ``options`` :数据集字典或者文件中的候选标签序列。 + - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。 + - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。 + +**模版通用属性** + +- `position`: 定义当前字段的起始 `position id`。 +- `token_type`: 定义当前字段及后续字段的 `token type id`。 +- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。 **模板定义** - 定义长度为 1 的连续型提示,随机初始化: ```python -"{'soft': None}{'text': 'text_a'}{'sep'}{'text': 'text_b'}" +"{'soft'}{'text': 'text_a'}{'sep': None, 'token_type': 1}{'text': 'text_b'}" +``` + +- 定义长度为 10 的连续型提示,随机初始化,编码器为 `mlp`: + +```python +"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': None, 'length':10, 'encoder': 'mlp'}{'mask'}" ``` -- 定义长度为 10 的连续型提示,随机初始化,其中 ``duplicate`` 参数表示连续型提示的长度(仅在随机初始化时有效,即`soft`值为`None`): +- 定义长度为 15 的连续型提示,使用 `请判断` 初始化前三个 soft token,其余随机初始化,编码器为隐藏层维度为 100 的双层 LSTM: ```python -"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': None, `duplicate`:10}{'mask'}" +"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断:', 'length': 15, 'encoder': 'lstm', 'hidden_size': 100}{'mask'}" ``` - 定义长度为 15 的连续型提示,使用 `"请判断这两个句子间的逻辑关系:"` 的预训练词向量逐一进行初始化: @@ -156,38 +193,98 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh") tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") -template = SoftTemplate(tokenizer=tokenizer, - max_seq_length=512, - model=model, - template="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}", - prompt_encoder='lstm', - encoder_hidden_size=200) +template = SoftTemplate(prompt="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}", + tokenizer=tokenizer, + max_length=512, + word_embeddings=model.get_input_embeddings()) ``` 其中初始化参数定义如下 +- ``prompt`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。 - ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。 -- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。 -- ``model`` : 预训练语言模型,为了取预训练词向量用于连续型提示向量初始化。 -- ``template`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。 -- ``prompt_encoder`` : 连续型提示向量的编码器,可选 ``mlp`` 和 ``lstm``。默认为 ``None`` ,即无编码器,直接使用向量。 -- ``encoder_hidden_size`` : 连续型提示向量的维度。默认为 ``None`` ,即与预训练词向量维度相同。 +- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。 +- ``word_embeddings`` :预训练语言模型的词向量,用于连续型提示向量初始化。 +- ``soft_embeddings`` :连续型提示向量矩阵,可用于不同模板间的连续型参数共享。设置后将覆盖默认连续型向量矩阵。 **使用技巧** - 对于分类任务,推荐的连续型提示长度一般为10-20。 - 对于随机初始化的连续性 prompt 向量,通常用比预训练模型微调更大的学习率来更新参数。 - 与离散型模板相似,连续型模板对初始化参数也比较敏感。自定义提示语句作为连续性 prompt 向量的初始化参数通常比随机初始化效果好。 -- prompt_encoder 为已有论文中的策略,用于建模不同连续型提示向量之间的序列关系。在实际应用中推荐先去掉 prompt_encoder 调整向量初始化。 +- prompt_encoder 为已有论文中的策略,用于建模不同连续型提示向量之间的序列关系。 + + +### 前缀连续型模板 + +`PrefixTemplate` 同样使用了连续型向量作为提示,与 `SoftTemplate` 的不同,该模版的提示向量不仅仅作用于输入层,每层都会有相应的提示向量。可用于实现 P-Tuning 等算法。 + +**模板关键字** + +- 
``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`和`text_b`。 +- ``hard`` :自定义的文本提示语句。 +- ``mask`` :待预测词的占位符。 +- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。 +- ``prefix`` 表示连续型提示,该字段**必须**位于模板首位。若值为 ``None`` ,则随机初始化提示向量;若值为文本,则使用文本对应的预训练字向量初始化提示向量。 + - ``length`` :定义 ``soft token`` 的数量。若定义文本长度小于该值,超过部分随机初始化。 + - ``encoder`` :定义 `soft token` 的编码器类型,可选 `lstm`,`mlp`。默认为 `None`, 不使用编码器。 + - ``hidden_size`` :定义编码器的隐藏层维度。默认与预训练词向量维度相同。 +- ``options`` :数据集字典或者文件中的候选标签序列。 + - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。 + - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。 + +**模版通用属性** + +- `position`: 定义当前字段的起始 `position id`。 +- `token_type`: 定义当前字段及后续字段的 `token type id`。 +- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。 + +**模板定义** + +- 定义长度为 15 的连续型提示,随机初始化: + +```python +"{'prefix': '新闻类别', 'length': 10, 'encoder': 'lstm'}{'text': 'text_a'}" +``` + +- 定义混合模板,这里`prefix`关键字对应的提示和`hard`对应的提示对应两套不同的向量: + +```python +"{'prefix': '自然语言推理任务:', 'encoder': 'mlp'}{'text': 'text_a'}{'sep'}{'text': 'text_b'}这两个句子间的逻辑关系是{'mask'}" +``` + + +**调用 API** + +```python +from paddlenlp.prompt import PrefixTemplate +from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM + +model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh") +tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") +template = PrefixTemplate(prompt="{'prefix': '任务描述'}{'text': 'text_a'}{'mask'}", + tokenizer=tokenizer, + max_length=512, + model=model, + prefix_dropout=0.1) +``` + +其中初始化参数定义如下 + +- ``prompt`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。 +- ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。 +- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。 +- ``model`` :预训练语言模型,用于连续型提示向量初始化,以及根据模型结构生成每层对应的提示向量。 +- ``prefix_dropout`` :连续型提示向量的丢弃概率,用于正则化。 ### 快速定义模板 -PaddleNLP 提供了 ``AutoTemplate`` API 以便快速定义单句输入的手工初始化的连续型模板,同时支持直接按照模板类型自动切换离散型模板和离散型模板。 +PaddleNLP 提供了 ``AutoTemplate`` API 快速定义简化离散型模板,也可根据完整模板字符串自动切换 ManualTemplate、SoftTemplate 和 PrefixTemplate。 **模板定义** -- 只定义用于初始化连续型向量的文本提示,即可得到拼接到句尾的连续型模板输入。例如, +- 快速定义离散型的文本提示。例如, ```python "这篇文章表达了怎样的情感?" 
@@ -196,7 +293,7 @@ PaddleNLP 提供了 ``AutoTemplate`` API 以便快速定义单句输入的手工 等价于 ```python -"{'text': 'text_a'}{'soft': '这篇文章表达了怎样的情感?'}{'mask'}" +"{'text': 'text_a'}{'hard': '这篇文章表达了怎样的情感?'}{'mask'}" ``` - 当输入为完整模板字符串时,解析得到的模板与[离散型模板](#离散型模板)和[连续型模板](#连续型模板)中描述的一致。 @@ -210,40 +307,37 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh") tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") # 离散型模板,返回值为 ManualTemplate 实例 -template = AutoTemplate.create_from(template="{'text': 'text_a'}和{'text': 'text_b'}之间的逻辑关系是{'mask'}", +template = AutoTemplate.create_from(prompt="这个句子表达了怎样的情感?", + tokenizer=tokenizer, + max_length=512) + +template = AutoTemplate.create_from(prompt="这个句子表达了怎样的情感?{'text': 'text_a'}{'mask'}", tokenizer=tokenizer, - max_seq_length=512) + max_length=512) # 连续型模板,返回值为 SoftTemplate 实例 -template = AutoTemplate.create_from(template="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}", +template = AutoTemplate.create_from(prompt="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}", tokenizer=tokenizer, - max_seq_length=512, - model=model, - prompt_encoder='lstm', - encoder_hidden_size=200) + max_length=512, + model=model) -# 快速定义单句连续型模板,返回值为 SoftTemplate 实例 -template = AutoTemplate.create_from(template="这篇文章表达了怎样的情感?", +# 前缀连续型模板,返回值为 PrefixTemplate 实例 +template = AutoTemplate.create_from(prompt="{'prefix': None, 'encoder': 'mlp', 'hidden_size': 50}{'text': 'text_a'}", tokenizer=tokenizer, - max_seq_length=512, - model=model, - prompt_encoder='lstm', - encoder_hidden_size=200) + max_length=512, + model=model) ``` 其中初始化参数定义如下 +- ``prompt`` :定义离散型/连续型提示、初始化以及和输入文本的组合方式。 - ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。 -- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。 +- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。 - ``model`` :预训练语言模型,为了取预训练词向量用于连续型提示向量初始化。 -- ``template`` :定义离散型/连续型提示、初始化以及和输入文本的组合方式。 -- ``prompt_encoder`` :连续型提示向量的编码器,可选 ``mlp`` 和 ``lstm`` 。默认为 ``None`` ,即无编码器,直接使用向量。 -- ``encoder_hidden_size`` :连续型提示向量的维度。默认为 ``None`` ,即与预训练词向量维度相同。 - ## 如何定义标签词映射 -**标签词映射**(Verbalizer)也是提示学习中可选的重要模块,用于建立预测词和标签之间的映射,将“预训练-微调”模式中预测标签的任务转换为预测模板中掩码位置的词语,从而将下游任务统一为预训练任务的形式。目前框架支持了离散型标签词映射和 [Word-level Adversarial ReProgramming (WARP)](https://aclanthology.org/2021.acl-long.381/) 方法。 +**标签词映射**(Verbalizer)也是提示学习中可选的重要模块,用于建立预测词和标签之间的映射,将“预训练-微调”模式中预测标签的任务转换为预测模板中掩码位置的词语,从而将下游任务统一为预训练任务的形式。目前框架支持了离散型标签词映射和连续型标签词映射 [Word-level Adversarial ReProgramming (WARP)](https://aclanthology.org/2021.acl-long.381/) 方法。 例如,在情感二分类任务中,微调方法和提示学习的标签体系如下 @@ -259,9 +353,9 @@ template = AutoTemplate.create_from(template="这篇文章表达了怎样的情 具体来说,对于模板 ``{'text':'text_a'}这句话表示我{'mask'}满意。`` ,我们使用映射 ``{'负向': '不', '正向': '很'}`` 将标签 ``负向`` 映射为 ``不`` ,将标签 ``正向`` 映射为 ``很`` 。也就是说,我们期望对于正向情感的文本,预测结果为 ``...这句话表示我很满意。`` ,对于负向情感的文本,预测结果为 ``...这句话表示我不满意。`` -### 单掩码映射 +### 离散型标签词映射 -``ManualVerbalizer`` 支持构造简单的单 ``{'mask'}`` 标签词映射,直接作用于 ``AutoMaskedLM`` 模型结构。当标签对应的预测词长度大于 ``1`` 时取均值。 +``ManualVerbalizer`` 支持构造 ``{'mask'}`` 对应的标签词映射,支持多``{'mask'}``,直接作用于 ``AutoMaskedLM`` 模型结构。当标签对应的预测词长度大于 ``1`` 时,默认取均值。 **调用 API** @@ -271,41 +365,15 @@ from paddlenlp.transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") verbalizer = ManualVerbalizer(tokenizer=tokenizer, - labels=['负向', '正向'], - label_words={'负向': '不', '正向': '很'}, - prefix=None) + label_words={'负向': '不', '正向': '很'}) ``` 其中初始化参数定义如下 +- ``label_words`` : 原标签到预测词之间的映射字典。 - 
``tokenizer`` : 预训练模型的 tokenizer,用于预测词的编码。 -- ``labels`` : 数据集的原标签列表(可选)。 -- ``label_words`` : 原标签到预测词之间的映射字典。如果同时定义了 ``labels`` ,二者的标签集合需要相同。 -- ``prefix`` : 预测词解码前增加的前缀,用于 ``RoBERTa`` 等对前缀敏感的模型,例如 `roberta-large`, `good` 和 ` good` 经过 tokenize 会得到不同的 id。默认为 ``None`` ,无前缀。 - -### 多掩码映射 - -``MultiMaskVerbalizer`` 继承自 ``ManualVerbalizer`` ,支持多 ``{'mask'}`` 标签词映射。预测词长度需与 ``{'mask'}`` 长度一致。 - -**调用 API** - -```python -from paddlenlp.prompt import MultiMaskVerbalizer -from paddlenlp.transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") -verbalizer = MultiMaskVerbalizer(tokenizer=tokenizer, - labels=['负向', '正向'], - label_words={'负向': '生气', '正向': '高兴'}, - prefix=None) -``` - - -其中初始化参数定义同[单掩码映射](#单掩码映射) 。 - - -### 标签词映射分类 +### 连续型标签词映射 标签词映射分类器 ``SoftVerbalizer`` 修改了原 ``AutoMaskedLM`` 的模型结构,将预训练模型最后一层“隐藏层-词表”替换为“隐藏层-标签”的映射。该层网络的初始化参数由标签词映射中的预测词词向量来决定,如果预测词长度大于 ``1`` ,则使用词向量均值进行初始化。当前支持的预训练模型包括 ``ErnieForMaskedLM`` 、 ``BertForMaskedLM`` 、 ``AlbertForMaskedLM`` 和 ``RobertaForMaskedLM`` 。可用于实现 WARP 算法。 @@ -318,15 +386,13 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh") tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh") -verbalizer = SoftVerbalizer(tokenizer=tokenizer, - model=model, - labels=['负向', '正向'], - label_words={'负向': '生气', '正向': '高兴'}, - prefix=None) +verbalizer = SoftVerbalizer(label_words={'负向': '生气', '正向': '高兴'}, + tokenizer=tokenizer, + model=model) ``` -其中初始化参数定义同[单掩码映射](#单掩码映射) ,此外 - +- ``label_words`` : 原标签到预测词之间的映射字典。 +- ``tokenizer`` : 预训练模型的 tokenizer,用于预测词的编码。 - ``model`` :预训练语言模型,用于取预训练词向量进行“隐藏层-标签”网络的修改和初始化。 ## 快速开始训练 @@ -335,29 +401,24 @@ verbalizer = SoftVerbalizer(tokenizer=tokenizer, ### 数据准备 -Prompt 框架定义了统一的样本结构 ``InputExample`` 以便进行数据处理,数据集样本需要封装在 ``MapDataset`` 中。 +数据集封装为 ``MapDataset`` 类型。每条数据格式为字典结构,字典中关键字与模板中 `text` 定义的值相对应,统一使用 `labels` 关键字表示样本标签。 -例如,对于文本语义相似度 BUSTM 数据集中的原始样本 +例如,文本语义相似度 BUSTM 数据集中的数据样本 ```python -data = [ +from paddlenlp.datasets import MapDataset + +data_ds = MapDataset([ {'id': 3, 'sentence1': '你晚上吃了什么', 'sentence2': '你晚上吃啥了', 'label': 1}, {'id': 4, 'sentence1': '我想打开滴滴叫的士', 'sentence2': '你叫小欧吗', 'label': 0}, {'id': 5, 'sentence1': '女孩子到底是不是你', 'sentence2': '你不是女孩子吗', 'label': 1} -] -``` +]) +def convert_label_keyword(input_dict): + input_dict["labels"] = input_dict.pop("label") + return input_dict -需要转换为统一格式 - -```python -from paddlenlp.datasets import MapDataset -from paddlenlp.prompt import InputExample - -data_ds = MapDataset([InputExample(uid=example["id"], - text_a=example["sentence1"], - text_b=example["sentence2"], - labels=example["label"]) for example in data]) +data_ds = data_ds.map(convert_label_keyword) ``` ### 预训练参数准备 @@ -383,13 +444,13 @@ from paddlenlp.prompt import ManualVerbalizer from paddlenlp.prompt import PromptModelForSequenceClassification # 定义模板 -template = AutoTemplate.create_from(template="{'text': 'text_a'}和{'text': 'text_b'}说的是{'mask'}同的事情。", +template = AutoTemplate.create_from(prompt="{'text': 'text_a'}和{'text': 'text_b'}说的是{'mask'}同的事情。", tokenizer=tokenizer, - max_seq_length=512) + max_length=512) # 定义标签词映射 -verbalizer = ManualVerbalizer(tokenizer=tokenizer, - label_words={0: '不', 1: '相'}) +verbalizer = ManualVerbalizer(label_words={0: '不', 1: '相'}, + tokenizer=tokenizer) # 定义文本分类提示模型 prompt_model = PromptModelForSequenceClassification(model, @@ -404,8 +465,8 @@ prompt_model = PromptModelForSequenceClassification(model, - ``model`` : 预训练模型实例,支持 
``AutoModelForMaskedLM`` 和 ``AutoModelForSequenceClassification`` 。 - ``template`` : 模板实例。 - ``verbalizer`` : 标签词映射实例。当设为 ``None`` 时,不使用标签词映射,模型输出及损失值计算由 ``model`` 类型定义。 -- ``freeze_plm`` : 在训练时是否固定预训练模型参数。对于规模较小的预训练模型,推荐更新预训练模型参数。 -- ``freeze_dropout`` : 在训练时是否固定预训练模型参数并关闭 ``dropout`` 。 当 ``freeze_dropout=True`` ,``freeze_plm`` 也为 ``True`` 。 +- ``freeze_plm`` : 在训练时固定预训练模型参数,默认为 `False`。对于轻量级预训练模型,推荐使用默认值。 +- ``freeze_dropout`` : 在训练时固定预训练模型参数并关闭 ``dropout`` 。 当 ``freeze_dropout=True`` ,``freeze_plm`` 也为 ``True`` 。 ### 使用PromptTrainer训练 @@ -497,6 +558,8 @@ if training_args.do_train: - WARP: Word-level Adversarial ReProgramming. [[PDF]](https://aclanthology.org/2021.acl-long.381/) - RGL: A Simple yet Effective Relation Graph Augmented Prompt-based Tuning Approach for Few-Shot Learning. [[PDF]](https://aclanthology.org/2022.findings-naacl.81/) - R-Drop: Regularized Dropout for Neural Networks. [[PDF]](https://arxiv.org/abs/2106.14448) +- Openprompt: An open-source framework for prompt-learning. [[PDF]](https://arxiv.org/abs/2111.01998) + ### 附录 @@ -506,9 +569,9 @@ if training_args.do_train: | 参数 | 类型 | 默认值 | 含义 | | ---------------- | ------ | ------- | ------------------------------------------------------- | -| max_seq_length | int | 512 | 模型输入的最大长度,包括模板部分 | -| freeze_plm | bool | False | 是否在训练时固定预训练模型的参数 | -| freeze_dropout | bool | False | 是否在训练时固定预训练模型的参数,同时关闭 dropout | +| max_seq_length | int | 512 | 模型输入的最大长度,包括模板部分 | +| freeze_plm | bool | False | 是否在训练时固定预训练模型的参数 | +| freeze_dropout | bool | False | 是否在训练时固定预训练模型的参数,同时关闭 dropout | | use_rdrop | bool | False | 是否使用 RDrop 策略,详见 [RDrop 论文](https://arxiv.org/abs/2106.14448) | | alpha_rdrop | float | 5.0 | RDrop Loss 的权重 | | use_rgl | bool | False | 是否使用 RGL 策略,详见 [RGL 论文](https://aclanthology.org/2022.findings-naacl.81/) | diff --git a/paddlenlp/prompt/prompt_args.py b/paddlenlp/prompt/prompt_args.py index 7c8ca507c82f..fcab68b63754 100644 --- a/paddlenlp/prompt/prompt_args.py +++ b/paddlenlp/prompt/prompt_args.py @@ -91,7 +91,7 @@ class PromptTuningArguments(TrainingArguments): metadata={"help": "Epsilon for the AdamW optimizer of prompt."}) def __post_init__(self): - super().__post_init__() + super(PromptTuningArguments, self).__post_init__() if self.use_rgl and self.alpha_rgl == 0.0: logger.warning("Ignore `use_rgl` because `alpha_rgl` = 0. Please "\ "set `alpha_rgl` a positive float to use RGL loss.") diff --git a/paddlenlp/prompt/prompt_tokenizer.py b/paddlenlp/prompt/prompt_tokenizer.py index efb2e80bce40..46af802586e1 100644 --- a/paddlenlp/prompt/prompt_tokenizer.py +++ b/paddlenlp/prompt/prompt_tokenizer.py @@ -16,122 +16,194 @@ import warnings from functools import partial from collections import defaultdict +from typing import Any, Dict, List, Union import numpy as np -from .prompt_utils import InputFeatures - __all__ = ["MLMPromptTokenizer"] class MLMPromptTokenizer(object): - def __init__(self, tokenizer, max_seq_length, **kwargs): - self._tokenizer = tokenizer - self._max_seq_len = max_seq_length - self._num_special_tokens = self._tokenizer.num_special_tokens_to_add() - self._special_map = { - "": "cls_token", - "": "sep_token", - "": "pad_token", - "": "unk_token", - "": "mask_token" - } - self.mask_token_id = self._tokenizer.mask_token_id - self.pad_token_id = self._tokenizer.pad_token_id - self.soft_token_id = self._tokenizer.unk_token_id - - def __call__(self, input_list): - encoded_input = defaultdict(list) - - for input_dict in input_list: - # Format text and special tokens, then convert them to ids. 
- if input_dict["mask_ids"] == 1: - text = [self.mask_token_id] - - if input_dict["text"] in self._special_map: - special_token = getattr(self._tokenizer, - self._special_map[input_dict["text"]]) - input_dict["text"] = special_token - - soft_ids = input_dict.get("soft_token_ids", None) - if soft_ids is not None and soft_ids == 1: - text = [self.soft_token_id] - else: - text = self._tokenizer.encode( - input_dict["text"], + omask_token = "[O-MASK]" + + def __init__(self, tokenizer, max_length): + self.tokenizer = tokenizer + self.max_length = max_length + + def __call__(self, inputs: List[Dict[str, Any]]): + part_text = [part["text"] for part in inputs] + part_do_truncate = [part["do_truncate"] for part in inputs] + max_lengths = self._create_max_lengths_from_do_truncate( + part_text, part_do_truncate) + + encoded_inputs = defaultdict(list) + option_length = None + last_position = 1 # Id 0 denotes special token '[CLS]'. + last_token_type = 0 + for index, part in enumerate(inputs): + # Create input_ids. + soft_token_ids = part.get("soft_tokens", None) + if soft_token_ids is None or len( + soft_token_ids) == 1 and soft_token_ids[0] == 0: + input_ids = self.tokenizer.encode( + part["text"], add_special_tokens=False, - return_token_type_ids=False)["input_ids"] - encoded_input["input_ids"].append(text) - - # Extend other features as the same length of input ids. - for key in input_dict: - if key != "text": - encoded_input[key].append([input_dict[key]] * len(text)) - - max_seq_len = self._max_seq_len - self._num_special_tokens - encoded_input = self.truncate(encoded_input, max_seq_len) - encoded_input.pop("shortenable_ids") - encoded_input = self.join(encoded_input) - - encoded_input = self.add_special_tokens(encoded_input) - encoded_input = self.pad(encoded_input, self._max_seq_len, - self.pad_token_id) - return encoded_input - - def add_special_tokens(self, input_dict): + return_token_type_ids=False, + truncation=True, + max_length=max_lengths[index])["input_ids"] + encoded_inputs["soft_token_ids"].append([0] * len(input_ids)) + else: + input_ids = soft_token_ids + encoded_inputs["soft_token_ids"].append(soft_token_ids) + encoded_inputs["input_ids"].append(input_ids) + part_length = len(input_ids) + + # Create position_ids. + position_ids, last_position = self._create_position_ids_from_part( + input_ids, part, last_position) + encoded_inputs["position_ids"].append(position_ids) + + # Create token_type_ids. + if "token_types" in part: + last_token_type = part["token_types"] + encoded_inputs["token_type_ids"].append([last_token_type] * + part_length) + + # Create other features like encoder_ids. + for name in part: + if name not in [ + "text", "soft_tokens", "positions", "token_types" + ]: + encoded_inputs[name].append([part[name]] * part_length) + + # Record the length of options if exists. 
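+            # Note: only a single contiguous sequence of options is supported;
+            # encountering a second one raises the ValueError below.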
+ if self.omask_token in part["text"]: + if option_length is not None: + raise ValueError( + "There are more than one sequence of options, which " + "will cause wrong attention masks.") + option_length = len(input_ids) + + encoded_inputs.pop("do_truncate") + encoded_inputs = self.join(encoded_inputs) + encoded_inputs = self.add_special_tokens(encoded_inputs) + attention_mask = self._create_attention_mask( + encoded_inputs["input_ids"], option_length) + if attention_mask is not None: + encoded_inputs["attention_mask"] = attention_mask + masked_positions = self._create_masked_positions( + encoded_inputs["input_ids"], encoded_inputs["soft_token_ids"]) + if masked_positions is not None: + encoded_inputs["masked_positions"] = masked_positions + return encoded_inputs + + def _create_position_ids_from_part(self, input_ids: List[int], + part: Dict[str, + Any], last_position: int): + """ + Create position ids from prompt for each part. + """ + part_length = len(input_ids) + if "positions" in part and part["positions"] > 0: + last_position = part["positions"] + if self.omask_token in part["text"]: + omask_id = self.tokenizer.convert_tokens_to_ids(self.omask_token) + omask_index = [ + x for x in range(part_length) if input_ids[x] == omask_id + ] + omask_index = [0] + omask_index + position_ids = [] + max_index = 0 + for start_id, end_id in zip(omask_index[:-1], omask_index[1:]): + position_ids.extend( + list(range(last_position, + last_position + end_id - start_id))) + max_index = max(end_id - start_id, max_index) + if len(position_ids) < part_length: + difference = part_length - len(position_ids) + position_ids.extend( + range(last_position, last_position + difference)) + max_index = max(difference, max_index) + last_position += max_index + else: + position_ids = list( + range(last_position, last_position + part_length)) + last_position += part_length + return position_ids, last_position + + def _create_max_lengths_from_do_truncate(self, part_text: List[str], + part_do_truncate: List[bool]): + """ + Create the max sequence length of each part. 
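+        Parts whose `do_truncate` flag is False are never truncated, i.e. their
+        max length is kept as `None`.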
+        """
+        text_length = sum([len(x) for x in part_text])
+        if text_length < self.max_length:
+            return [None] * len(part_text)
+
+        num_special_token = self.tokenizer.num_special_tokens_to_add()
+        cut_length = text_length - self.max_length + num_special_token
+        max_lengths = []
+        if self.tokenizer.truncation_side == "right":
+            for index, part in enumerate(part_text[::-1]):
+                if part_do_truncate[-1 - index] and cut_length > 0:
+                    max_lengths.append(max(len(part) - cut_length, 0))
+                    cut_length = cut_length - len(part)
+                else:
+                    max_lengths.append(None)
+            max_lengths = max_lengths[::-1]
+        else:
+            for index, part in enumerate(part_text):
+                if part_do_truncate[index] and cut_length > 0:
+                    max_lengths.append(max(len(part) - cut_length, 0))
+                    cut_length = cut_length - len(part)
+                else:
+                    max_lengths.append(None)
+        return max_lengths
+
+    def _create_attention_mask(self, input_ids: List[int],
+                               option_length: Union[int, None]):
+        if option_length is None:
+            return None
+        omask_id = self.tokenizer.convert_tokens_to_ids(self.omask_token)
+        input_ids = np.array(input_ids)
+        attention_mask = np.zeros([len(input_ids), len(input_ids)])
+        pad_index = np.where(input_ids == self.tokenizer.pad_token_id)[0]
+        attention_mask[:, pad_index] = 1
+        attention_mask[pad_index, :] = 1
+        omask_index = np.where(input_ids == omask_id)[0].tolist()
+        opt_begin, opt_end = omask_index[0], omask_index[0] + option_length
+        attention_mask[opt_begin:opt_end, opt_begin:opt_end] = 1
+        omask_index.append(opt_end)
+        for opt_begin, opt_end in zip(omask_index[:-1], omask_index[1:]):
+            attention_mask[opt_begin:opt_end, opt_begin:opt_end] = 0
+        attention_mask = (1 - attention_mask) * -1e4
+        return attention_mask
+
+    def _create_masked_positions(self, input_ids: List[int],
+                                 soft_token_ids: List[int]):
+        non_soft_ids = np.array(input_ids) * (np.array(soft_token_ids) == 0)
+        mask_id = self.tokenizer.mask_token_id
+
+        masked_positions = np.where(non_soft_ids == mask_id)[0]
+        if masked_positions.shape[0] == 0:
+            return None
+        return masked_positions.tolist()
+
+    def add_special_tokens(self, input_dict: Dict[str, Any]):
         for key in input_dict:
-            new_inputs = self._tokenizer.build_inputs_with_special_tokens(
+            new_inputs = self.tokenizer.build_inputs_with_special_tokens(
                 input_dict[key])
             if key != "input_ids":
                 special_mask = np.array(
-                    self._tokenizer.get_special_tokens_mask(input_dict[key]))
+                    self.tokenizer.get_special_tokens_mask(input_dict[key]))
                 new_inputs = np.array(new_inputs)
+                # TODO (Huijuan): Use different ids according to specific keyword.
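+                # Note: for all features except input_ids (e.g. soft_token_ids,
+                # position_ids), positions of special tokens are filled with 0.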
new_inputs[special_mask == 1] = 0 new_inputs = new_inputs.tolist() input_dict[key] = new_inputs return input_dict - @staticmethod - def truncate(input_dict, max_seq_len): - total_tokens = sum([len(text) for text in input_dict["input_ids"]]) - trunc_length = total_tokens - max_seq_len - if trunc_length > 0: - truncated_dict = defaultdict(list) - trunc_mask = input_dict["shortenable_ids"] - for key in input_dict: - content = input_dict[key] - count = trunc_length - for idx, text in enumerate(content[::-1]): - index = -idx - 1 - if len(text) == 0 or trunc_mask[index][0] == 0: - continue - if count < len(text): - content[index] = text[:-count] - else: - content[index] = [] - count -= len(text) - if count <= 0: - break - truncated_dict[key] = content - return truncated_dict - else: - return input_dict - - @staticmethod - def pad(input_dict, max_seq_len, pad_id, other_pad_id=0): - for key, content in input_dict.items(): - if len(content) > max_seq_len: - raise ValueError( - f"Truncated length of {key} is still longer than " - f"{max_seq_len}, please use a shorter prompt.") - if key == "input_ids": - pad_seq = [pad_id] * (max_seq_len - len(content)) - else: - pad_seq = [other_pad_id] * (max_seq_len - len(content)) - input_dict[key].extend(pad_seq) - return input_dict - @staticmethod def join(input_dict): for key in input_dict: diff --git a/paddlenlp/prompt/prompt_trainer.py b/paddlenlp/prompt/prompt_trainer.py index 2ffd2d9fd58e..e46d8f810ed4 100644 --- a/paddlenlp/prompt/prompt_trainer.py +++ b/paddlenlp/prompt/prompt_trainer.py @@ -18,25 +18,22 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddle.static import InputSpec from ..datasets import MapDataset from ..utils.log import logger from ..trainer import Trainer, TrainerCallback -from ..trainer.trainer_utils import EvalPrediction +from ..trainer.trainer_utils import EvalPrediction, get_scheduler from ..data import DataCollator from ..losses import RDropLoss from ..transformers import PretrainedTokenizer, export_model from .template import AutoTemplate from .verbalizer import SoftVerbalizer -from .prompt_utils import InputFeatures, signature +from .prompt_utils import signature, PromptDataCollatorWithPadding from .prompt_args import PromptTuningArguments __all__ = ["PromptTrainer", "PromptModelForSequenceClassification"] -PROMPT_NAME = "prompt.pdparams" - class PromptTrainer(Trainer): """ @@ -67,18 +64,20 @@ def __init__(self, args = PromptTuningArguments(output_dir=output_dir) if data_collator is None: - data_collator = InputFeatures.collate_fn - - super().__init__(model=model, - criterion=criterion, - args=args, - data_collator=data_collator, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=tokenizer, - compute_metrics=compute_metrics, - callbacks=callbacks, - optimizers=optimizers) + data_collator = PromptDataCollatorWithPadding(tokenizer, + padding=True, + return_tensors='pd') + + super(PromptTrainer, self).__init__(model=model, + criterion=criterion, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers) self.load_state_dict_from_checkpoint(args.resume_from_checkpoint) @@ -88,86 +87,74 @@ def __init__(self, if self.args.use_rdrop: self.rdrop_criterion = RDropLoss() + def _get_model(self): + model = self.model + if isinstance(model, paddle.DataParallel): + model = model._layers + return model + @property def template(self): - return 
self.model.template + return self._get_model().template @template.setter def template(self, template): - self.model.template = template + self._get_model().template = template @property def verbalizer(self): - return getattr(self.model, "verbalizer", None) + return self._get_model().verbalizer @verbalizer.setter def verbalizer(self, verbalizer): - setattr(self.model, "verbalizer", verbalizer) + self._get_model().verbalizer = verbalizer @property - def plm(self): - return self.model.plm + def pretrained_model(self): + self._set_model_attributes(self.model, "plm") - @plm.setter - def plm(self, model): - self.model.plm = model + @pretrained_model.setter + def pretrained_model(self, model): + self._set_model_attributes(self.model, "plm", model) - def _map_dataset(self, dataset): + def _map_dataset(self, dataset: MapDataset): if dataset is None: return None if not isinstance(dataset, MapDataset): raise ValueError("Expected `MapDataset` but received {}.".format( type(dataset))) - return dataset.map(self._convert_example) - def _convert_example(self, example): - encoded_inputs = self.template.wrap_one_example(example) - return encoded_inputs + def encode_with_template(example): + return self.template(example) - def _prepare_input(self, inputs: InputFeatures): + return dataset.map(encode_with_template) + + def _prepare_input(self, inputs: Dict): return inputs - def _save(self, output_dir: Optional[str] = None, state_dict=None): - super()._save(output_dir, state_dict) + def _save(self, + output_dir: Optional[str] = None, + state_dict: Dict[str, Any] = None): + super(PromptTrainer, self)._save(output_dir, state_dict) output_dir = output_dir if output_dir is not None else self.args.output_dir if self.template: - self.template.save_to(output_dir) - if self.verbalizer: - self.verbalizer.save_to(output_dir) + self.template.save(output_dir) + if self.verbalizer is not None: + self.verbalizer.save(output_dir) - def load_state_dict_from_checkpoint(self, resume_from_checkpoint=None): + def load_state_dict_from_checkpoint( + self, resume_from_checkpoint: os.PathLike = None): if resume_from_checkpoint is not None: - self.template = AutoTemplate.load_from( - resume_from_checkpoint, self.tokenizer, - self.args.max_seq_length, self.plm, - getattr(self.template, "_prompt_encoder", None), - getattr(self.template, "encoder_hidden_size", None)) - - super().load_state_dict_from_checkpoint(resume_from_checkpoint) - - def get_eval_dataloader(self, eval_dataset=None): - """ - Return the evaluation [`~paddle.io.DataLoader`]. - - Args: - eval_dataset (`paddlenlp.datasets.MapDataset`): - Created by `paddlenlp.prompt.load_dataset`, - where every item is an InputExample object. - """ - eval_dataset = self._map_dataset(eval_dataset) - return super().get_eval_dataloader(eval_dataset) + self.template = AutoTemplate.load_from(resume_from_checkpoint, + self.tokenizer, + self.args.max_seq_length, + self._get_model()) + super(PromptTrainer, + self).load_state_dict_from_checkpoint(resume_from_checkpoint) def get_test_dataloader(self, test_dataset): - """ - Return the test [`~paddle.io.DataLoader`]. - - Args: - test_dataset (`paddlenlp.datasets.MapDataset`): - The test dataset created by `paddlenlp.prompt.load_dataset`, - where every item is an InputExample object. 
- """ test_dataset = self._map_dataset(test_dataset) - return super().get_test_dataloader(test_dataset) + return super(PromptTrainer, self).get_test_dataloader(test_dataset) def create_optimizer(self, lr_scheduler=None): """ @@ -178,20 +165,21 @@ def create_optimizer(self, lr_scheduler=None): self.args) plm_parameters = [] - if not self.model.freeze_plm: + if not self.args.freeze_plm: plm_parameters.extend([ - p for p in self.model.plm.parameters() + p for p in self._get_model().plm.parameters() if not p.stop_gradient ]) ppt_parameters = [] if self.template is not None: ppt_parameters.extend([ - x for x in self.template.parameters() if not x.stop_gradient + x for n, x in self.template.named_parameters() + if not x.stop_gradient ]) if self.verbalizer is not None: if isinstance(self.verbalizer, SoftVerbalizer): - if not self.model.freeze_plm: + if not self.args.freeze_plm: plm_parameters.extend([ p for n, p in self.verbalizer.non_head_parameters() if not p.stop_gradient @@ -203,7 +191,7 @@ def create_optimizer(self, lr_scheduler=None): [p for n, p in self.verbalizer.parameters()]) decay_parameters = [ - p.name for n, p in self.model.named_parameters() + p.name for n, p in self._get_model().named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] apply_decay_param_fun = lambda x: x in decay_parameters @@ -225,7 +213,16 @@ def create_optimizer(self, lr_scheduler=None): else: params = plm_parameters else: - lr = self.args.ppt_learning_rate + args = self.init_num_steps(self.args, len(self.train_dataset)) + warmup = args.warmup_steps if args.warmup_steps > 0 else int( + args.warmup_ratio * args.num_training_steps) + self.lr_scheduler = get_scheduler( + args.lr_scheduler_type, + learning_rate=self.args.ppt_learning_rate, + num_warmup_steps=warmup, + num_training_steps=args.num_training_steps, + ) + lr = self.lr_scheduler params = ppt_parameters self.optimizer = optim_cls( @@ -243,20 +240,20 @@ def compute_loss(self, model, inputs, return_outputs=False): Compute the total loss for every batch. 
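         When enabled in the training arguments, R-Drop and RGL regularization
         terms are added to the loss.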
""" if "labels" not in inputs: - raise ValueError("Fail to compute loss as there are no labels "\ - "in {}.".format(inputs)) + raise ValueError( + "Fail to compute loss as `labels` not in {}.".format(inputs)) labels = inputs["labels"] - soft_token_ids = inputs.get("soft_token_ids", None) - outputs, hidden_states = model(inputs["input_ids"], - inputs["mask_ids"], - soft_token_ids, - return_hidden_states=True) + input_dict = inputs.copy() + input_dict["return_hidden_states"] = True + outputs, hidden_states = model(**input_dict) + if self.criterion is not None: loss = self.criterion(outputs, labels) if self.args.use_rdrop: - loss = self._compute_rdrop_loss(model, inputs, outputs, loss) + loss = self._compute_rdrop_loss(model, input_dict, outputs, + loss) if self.args.use_rgl: loss += self._compute_rgl_loss(hidden_states, labels) @@ -267,10 +264,10 @@ def compute_loss(self, model, inputs, return_outputs=False): return (loss, outputs) if return_outputs else loss - def _compute_rdrop_loss(self, model, inputs, outputs, loss): - re_outputs = model(inputs["input_ids"], inputs["mask_ids"], - inputs.get("soft_token_ids", None)) - ce_loss = (self.criterion(re_outputs, inputs["labels"]) + loss) * 0.5 + def _compute_rdrop_loss(self, model, input_dict, outputs, loss): + re_outputs, _ = model(**input_dict) + labels = input_dict["labels"] + ce_loss = (self.criterion(re_outputs, labels) + loss) * 0.5 kl_loss = self.rdrop_criterion(outputs, re_outputs) loss = ce_loss + self.args.alpha_rdrop * kl_loss return loss @@ -312,14 +309,11 @@ def _raw_equal(x, y): return loss - def export_model(self, export_path, input_spec=None, export_type="paddle"): + def export_model(self, export_path, input_spec, export_type="paddle"): os.makedirs(export_path, exist_ok=True) - self.template.save_to(export_path) - self.verbalizer.save_to(export_path) - if input_spec is None and hasattr(self.model, "get_input_spec"): - input_spec = self.model.get_input_spec() - if input_spec is None: - raise ValueError("Please define input_spec to export model.") + self.template.save(export_path) + if self.verbalizer is not None: + self.verbalizer.save(export_path) export_model(self.model, input_spec, export_path, export_type) @@ -334,65 +328,62 @@ def __init__(self, verbalizer=None, freeze_plm: bool = False, freeze_dropout: bool = False): - super().__init__() + super(PromptModelForSequenceClassification, self).__init__() self.plm = model self.template = template self.verbalizer = verbalizer self.freeze_plm = freeze_plm self.freeze_dropout = freeze_dropout - if self.verbalizer is not None and hasattr(verbalizer, "process_model"): - self.plm = self.verbalizer.process_model(self.plm) if self.freeze_plm: for param in self.plm.parameters(): param.stop_gradient = True - if self.freeze_dropout: - self.plm.eval() + if self.freeze_dropout: + self.plm.eval() self.forward_keys = signature(self.plm.forward) self._mask_token_id = self.template.tokenizer.mask_token_id self._pad_token_id = self.template.tokenizer.pad_token_id def forward(self, - input_ids=None, - mask_ids=None, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + masked_positions=None, soft_token_ids=None, + encoder_ids=None, **kwargs): - return_hidden_states = kwargs.pop('return_hidden_states', False) - if self.freeze_dropout: - self.plm.eval() - attention_mask = (input_ids != self._pad_token_id).astype("int64") - inputs = InputFeatures(input_ids=input_ids, - mask_ids=mask_ids, - attention_mask=attention_mask, - soft_token_ids=soft_token_ids) - if 
hasattr(self.template, "process_batch"): - inputs = self.template.process_batch(inputs) + input_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "position_ids": position_ids, + "masked_positions": masked_positions, + "soft_token_ids": soft_token_ids, + "attention_mask": attention_mask, + "encoder_ids": encoder_ids + } + input_dict = self.template.process_batch(input_dict) model_inputs = { - k: inputs[k] - for k in inputs.keys(keep_none=True) if k in self.forward_keys + k: input_dict[k] + for k in input_dict if k in self.forward_keys } + model_inputs["masked_positions"] = None outputs = self.plm(**model_inputs) - hidden_states = outputs - if hasattr(self.template, "post_process_batch"): - outputs = self.template.post_process_batch(outputs) - if self.verbalizer and hasattr(self.verbalizer, "process_outputs"): - outputs = self.verbalizer.process_outputs(outputs, inputs=inputs) - - if return_hidden_states: - return outputs, hidden_states + if self.verbalizer is not None: + label_outputs = self.verbalizer.process_outputs( + outputs, input_dict["masked_positions"]) + else: + label_outputs = outputs + + if kwargs.pop('return_hidden_states', False): + return label_outputs, outputs else: - return outputs + return label_outputs def prompt_parameters(self): """ Get the parameters of template and verbalizer. """ - return [p for p in self.template.parameters() - ] + [p for p in self.verbalizer.parameters()] - - def get_input_spec(self): - input_spec = [ - InputSpec(shape=[None, None], dtype="int64"), # input_ids - InputSpec(shape=[None, None], dtype="int64"), # mask_ids - InputSpec(shape=[None, None], dtype="int64") # soft_token_ids - ] - return input_spec + params = [p for p in self.template.parameters()] + if self.verbalizer is not None: + params += [p for p in self.verbalizer.parameters()] + return params diff --git a/paddlenlp/prompt/prompt_utils.py b/paddlenlp/prompt/prompt_utils.py index f51d64552ade..cbf00cf29bc7 100644 --- a/paddlenlp/prompt/prompt_utils.py +++ b/paddlenlp/prompt/prompt_utils.py @@ -1,217 +1,103 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import defaultdict -import json +""" +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This module defines the itermediate data structure of inputs. 
+""" + import inspect -from dataclasses import dataclass, field +from typing import Any, Dict, List, Union, Optional +from dataclasses import dataclass import numpy as np import paddle -from ..utils.log import logger - -__all__ = ["InputExample", "InputFeatures"] +from ..transformers.tokenizer_utils_base import (PretrainedTokenizerBase, + PaddingStrategy) -@dataclass -class InputExample(object): - """Data structure of every example in datasets.""" - uid: str = field(default=None, - metadata={'help': 'A unique identifier of the example.'}) - text_a: str = field( - default=None, - metadata={'help': 'The first text sequence in each example.'}) - text_b: str = field( - default=None, - metadata={'help': 'The other text sequences in each example.'}) - labels: int = field(default=None, - metadata={'help': 'The label in each example.'}) - meta: dict = field( - default=None, - metadata={ - 'help': 'An optional dictionary of other data for each example.' - }) - - def __repr__(self): - content = {k: v for k, v in self.__dict__.items() if v is not None} - content = json.dumps(content, indent=2, sort_keys=True) + '\n' - return str(content) - - def keys(self, keep_none=False): - return [ - key for key in self.__dict__.keys() - if getattr(self, key) is not None - ] - - -class InputFeatures(dict): - """ - Data structure of every wrapped example or a batch of examples as the input of model. - - Args: - input_ids (paddle.Tensor): - The token ids. - attention_mask (paddle.Tensor): - The mask ids. - token_type_ids (paddle.Tensor, optional): - The token type ids. - inputs_embeds (paddle.Tensor, optional): - The embeddings of soft tokens. - mask_ids (paddle.Tensor, optional): - The mask ids where 1 denotes that a token is a mask, 0 denotes it is not a mask. - labels (list, optional): - The labels of classification task. - uid (list, optional): - The unique id(s) for example(s). 
- """ - input_keys = [ - 'input_ids', 'attention_mask', 'token_type_ids', 'inputs_embeds', 'uid', - 'labels', 'mask_ids', 'soft_token_ids' - ] - tensorable = [ - 'input_ids', 'attention_mask', 'token_type_ids', 'inputs_embeds', - 'labels', 'mask_ids', 'soft_token_ids' - ] - - def __init__(self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - inputs_embeds=None, - mask_ids=None, - labels=None, - uid=None, - soft_token_ids=None): - self.input_ids = input_ids - self.attention_mask = attention_mask - self.token_type_ids = token_type_ids - self.inputs_embeds = inputs_embeds - self.labels = labels - self.mask_ids = mask_ids - self.uid = uid - self.soft_token_ids = soft_token_ids - - @classmethod - def add_keys(cls, *args): - cls.input_keys.extend(args) - - def keys(self, keep_none=False): - if keep_none: - return self.input_keys - else: - return [ - key for key in self.input_keys if getattr(self, key) is not None - ] - - @property - def tensorable_keys(self, keep_none=False): - if keep_none: - return self.tensorable - else: - return [ - key for key in self.tensorable if getattr(self, key) is not None - ] - - @tensorable_keys.setter - def tensorable_keys(self, keys): - diff_keys = set(keys) - set(self.input_keys) - if len(diff_keys) > 0: - raise ValueError("{} not in predefined keys.".format( - ["`%s`" % k for k in diff_keys].join(", "))) - self.tensorable = keys - - def values(self, keep_none=False): - return [getattr(self, key) for key in self.keys(keep_none=keep_none)] - - def items(self): - return [(key, getattr(self, key)) for key in self.keys()] - - def __len__(self): - return len(self.keys()) - - def __repr__(self): - content = {} - for key, value in self.items(): - if isinstance(value, paddle.Tensor): - value = value.numpy().tolist() - elif isinstance(value, paddle.static.Variable): - value = value.to_string(True) - content[key] = value - return str(json.dumps(content)) - - def __getitem__(self, key): - return getattr(self, key) - - def __iter__(self): - return iter(self.keys()) - - def __contains__(self, key, keep_none): - return key in self.keys(keep_none) - - def __setitem__(self, key, value): - if key not in self.input_keys: - logger.warning( - "`{}` is not a predefined key in InputFeatures. Perhaps it "\ - "brings unexpected results.".format(key)) - self.add_keys(key) - setattr(self, key, value) - - def __eq__(self, other): - if not isinstance(other, InputFeatures): - return False - if self.keys() != other.keys(): - return False - for key in self.keys(): - value = getattr(self, key) - other_value = getattr(other, key) - if type(value) != type(other_value): - return False - if isinstance(value, paddle.Tensor): - value = value.numpy() - other_value = other_value.numpy() - if isinstance(value, list): - value = np.array(value) - other_value = np.array(other_value) - if not (value == other_value).all(): - return False - return True - - def __hash__(self): - return hash(self.__repr__()) - - @classmethod - def collate_fn(cls, batch): - """Collate batch data in form of InputFeatures.""" - new_batch = {} - for key in batch[0]: - values = [b[key] for b in batch] - if key in cls.tensorable: - new_batch[key] = paddle.to_tensor(values) - else: - new_batch[key] = values - - return InputFeatures(**new_batch) - - -def signature(fn): +def signature(function): """ Obtain the input arguments of the given function. 
""" - sig = inspect.signature(fn) + sig = inspect.signature(function) args = [ p.name for p in sig.parameters.values() if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD ] return args + + +@dataclass +class PromptDataCollatorWithPadding: + """ + Data collator that will group inputs by keywords and dynamically + pad the inputs to the longest sequence in the batch. + + Args: + tokenizer (`paddlennlp.transformers.PretrainedTokenizer`): + The tokenizer used for encoding the data from PromptTokenizer. + """ + + tokenizer: PretrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pd" + return_attention_mask: Optional[bool] = None + default_model_input_names: List = ("input_ids", "token_type_ids", + "special_tokens_mask", "offset_mapping", + "position_ids") + + def _convert_to_tensors(self, data): + if self.return_tensors == "np": + return np.array(data) + else: + return paddle.to_tensor(data) + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + batch = {} + for key in features[0]: + if key in self.default_model_input_names: + batch[key] = [b[key] for b in features] + + batch = self.tokenizer.pad( + batch, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + return_attention_mask=self.return_attention_mask) + max_length = batch["input_ids"].shape[1] + for key in features[0]: + if key not in self.default_model_input_names: + values = [b[key] for b in features] + if key == "masked_positions": + new_values = [] + for index, value in enumerate(values): + value = np.array(value) + index * max_length + new_values.extend(value.tolist()) + values = new_values + elif key == "attention_mask": + new_values = np.zeros( + [len(values), 1, max_length, max_length]) + for index, value in enumerate(values): + length = len(value) + new_values[index][0, :length, :length] = value + values = new_values + elif key != "labels": + for index, value in enumerate(values): + values[index] = value + [0] * (max_length - len(value)) + batch[key] = self._convert_to_tensors(values) + return batch diff --git a/paddlenlp/prompt/template.py b/paddlenlp/prompt/template.py index 400101082d0c..b9b98addd436 100644 --- a/paddlenlp/prompt/template.py +++ b/paddlenlp/prompt/template.py @@ -1,463 +1,814 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +""" +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + +This module provide prompt definition methods. +""" -from abc import abstractmethod import os import re import json +import traceback +from abc import abstractmethod +from typing import Any, Dict, List, Tuple, Optional, Union + +import numpy as np import paddle import paddle.nn as nn +from paddle import Tensor +from paddlenlp.utils.log import logger +from paddlenlp.transformers import PretrainedTokenizer, PretrainedModel -from .prompt_utils import InputExample, InputFeatures from .prompt_tokenizer import MLMPromptTokenizer -from ..utils.log import logger - -__all__ = ["Template", "ManualTemplate", "SoftTemplate", "AutoTemplate"] - -TEMPLATE_FILE = "template.json" - - -def parse_template(inputs: str, part_start="{", part_end="}"): - """ Parse items from the input template text. """ - parsed = [] - i_start = 0 - while i_start < len(inputs): - space = ' ' if (i_start > 0 and inputs[i_start - 1] == ' ') else '' - p = {"add_prefix_space": space} - while i_start < len(inputs) and inputs[i_start] == ' ': - p["add_prefix_space"] = ' ' - i_start += 1 - if i_start == len(inputs): break - - if inputs[i_start] == part_start: - i_end = i_start + 1 - count_part = 1 - while i_end < len(inputs): - if inputs[i_end] == part_end: - count_part -= 1 - if count_part == 0: break - elif inputs[i_end] == part_start: - count_part += 1 - i_end += 1 - if i_end == len(inputs): - raise ValueError( - '{} at position {} has no corresponding {}'.format( - part_start, i_start, part_end)) - try: - part = eval('{%s}' % inputs[i_start + 1:i_end]) - if isinstance(part, set): - part = {k: None for k in part} - p.update(part) - except: - import traceback - logger.error(traceback.format_exc()) - logger.error( - 'syntax error in {}'.format(f"{inputs[i_start + 1:i_end]}")) - exit() - i_start = i_end + 1 - else: - i_end = i_start + 1 - while i_end < len(inputs): - if inputs[i_end] == part_start: - break - i_end += 1 - p['hard'] = inputs[i_start:i_end].rstrip(' ') - i_start = i_end - parsed.append(p) - return parsed +__all__ = [ + "Template", "ManualTemplate", "SoftTemplate", "PrefixTemplate", + "AutoTemplate" +] + +# Template used to be saved in a file. +TEMPLATE_CONFIG_FILE = "template_config.json" +TEMPLATE_PARAMETER_FILE = "template_state.pdparams" + +# Default values for some template attributes. +DEFAULT_MAX_OPTIONS = 10 class Template(nn.Layer): """ - Base template class used to preprocess the inputs of model. + Base class for [`Template`]. Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The tokenizer of pretrained models. - + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. 
""" - registered_input_names = ['mask_ids', 'shortenable_ids'] - registered_text_keys = ['text_a', 'text_b'] - - def __init__(self, tokenizer, max_seq_length): - super().__init__() + template_special_tokens = [ + "text", "hard", "soft", "soft_id", "prefix", "sep", "mask", "options" + ] + template_attributes = [ + "length", "encoder", "position", "token_type", "hidden_size", + "add_omask", "add_prompt", "add_space" + ] + input_feature_names = ["do_truncate", "token_types", "positions"] + opt_token = "[OPT]" + omask_token = "[O-MASK]" + + def __init__(self, prompt: str, tokenizer: PretrainedTokenizer, + max_length: int, **kwargs): + super(Template, self).__init__() + for key, value in kwargs.items(): + setattr(self, key, value) self.tokenizer = tokenizer - self.wrapped_tokenizer = MLMPromptTokenizer(tokenizer, max_seq_length) + self.prompt_tokenizer = MLMPromptTokenizer(tokenizer, max_length) + self.prompt = prompt @property - def template(self): - if not hasattr(self, '_template'): - raise RuntimeError( - 'Property template has not been set before used.') - return self._template - - @template.setter - def template(self, template): - if template is None: - return - self._template = template - self._process_template() + def prompt(self): + return self._prompt + + @prompt.setter + def prompt(self, prompt: str): + if prompt is not None: + if isinstance(prompt, str): + self._prompt = self.parse_template_string(prompt) + else: + self._prompt = prompt + self._check_template_special_tokens() + self.example_keys = self.create_example_keys_from_prompt() + self.token_types = self.create_token_type_sequence_from_prompt() + self.do_truncate = self.create_truncation_sequence_from_prompt() + self.positions = self.create_position_sequence_from_prompt() + self.create_prompt_parameters() @abstractmethod - def _process_template(self): - """ A hook to process template text when it is set. """ + def create_prompt_parameters(self): raise NotImplementedError - def parse_inputs(self, inputs): - return parse_template(inputs) + def _check_template_special_tokens(self): + valid_attr = self.template_special_tokens + self.template_attributes + prompt_attr = [] + for part in self._prompt: + prompt_attr.extend(list(part.keys())) + if "add_prompt" in part: + opt_prompt = part["add_prompt"] + if self.opt_token not in opt_prompt: + raise ValueError("'{}' not found in option prompt.".format( + self.opt_token)) + if "add_omask" in part: + self._check_omask_token() + diff_attr = set(prompt_attr) - set(valid_attr) + if len(diff_attr) > 0: + raise ValueError( + "Invalid attributes found in template: {}.".format(diff_attr)) + return True + + def _check_example_name(self, name: str, example: Dict[str, Any]): + if name not in example: + raise ValueError( + "Unexpected value in template. Can not find keyword {} in example: {}" + .format(name, example)) + return True + + def _check_omask_token(self): + omask_example = """ + Add '[O-MASK]' to tokenizer to use `add_omask`. 
+ + Examples: + + ```python + omask_dict = {"additional_special_tokens": ["[O-MASK]"]} + tokenizer.add_special_tokens(omask_dict) + model.resize_token_embeddings(len(tokenizer)) + ```""" + if self.omask_token not in self.tokenizer.additional_special_tokens: + self.tokenizer.add_special_tokens( + {"additional_special_tokens": [self.omask_token]}) + return True + raise ValueError( + "'{}' not found in tokenizer.".format(self.omask_token) + + omask_example) + return True + + def build_inputs_with_prompt( + self, + example: Dict[str, Any], + prompt: Optional[List[Dict[str, Any]]] = None) -> List[str]: + """ + Build input text sequences according to both prompt and example. + + Args: + example (`Dict[str, Any]`): + A data sample with corresponding keys as `prompt`. + prompt (`Optional[List[Dict[str, Any]]]`): + A sequence of dictionary which defines positions of prompt, + input text and special tokens. + """ + inputs = self._prompt.copy() if prompt is None else prompt.copy() + + for index, part in enumerate(inputs): + if "text" in part: + self._check_example_name(part["text"], example) + inputs[index] = str(example[part["text"]]) + elif "mask" in part: + if "length" not in part: + part["length"] = 1 + inputs[index] = self.tokenizer.mask_token * part["length"] + elif "sep" in part: + inputs[index] = self.tokenizer.sep_token + elif "hard" in part: + inputs[index] = part["hard"] + elif "options" in part: + if not isinstance(part["options"], list): + self._check_example_name(part["options"], example) + labels = example[part["options"]] + labels = [labels] if isinstance(labels, str) else labels + else: + labels = part["options"] + if "add_prompt" in part: + opt_prompt = part["add_prompt"] + labels = [ + opt_prompt.replace(self.opt_token, x) for x in labels + ] + if "add_omask" in part: + labels = [self.omask_token + x for x in labels] + inputs[index] = "".join(labels) + else: + inputs[index] = part - def get_default_mask_ids(self): - """ List to denote whether an item in template is a mask token. """ - return [1 if 'mask' in p else 0 for p in self.template] + if "add_space" in part: + inputs[index] = " " + inputs[index] + return inputs - def get_default_shortenable_ids(self): - """ List to denote whther an item in template can be truncated. """ - idx = [] - for p in self.template: - if 'shortenable' in p: - idx.append(1 if p['shortenable'] else 0) + def create_token_type_sequence_from_prompt( + self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt if prompt is None else prompt + last_token_type = 0 + token_type_ids = [] + for part in prompt: + if "token_type" in part: + last_token_type = part["token_type"] + token_type_ids.append(last_token_type) + return token_type_ids + + def create_position_sequence_from_prompt( + self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt if prompt is None else prompt + position_ids = [] + for part in prompt: + if "position" in part: + position_ids.append(part["position"]) else: - idx.append(1 if 'text' in p else 0) - return idx - - def incorporate_template_text(self, example, template=None): - """ Replace each item in template with real text. 
""" - inputs = template.copy( - ) if self.template is None else self.template.copy() - - for i, p in enumerate(inputs): - if 'text' in p: - inputs[i] = p['add_prefix_space'] + getattr(example, p['text']) - elif 'mask' in p: - inputs[i] = self.tokenizer.mask_token - elif 'hard' in p: - inputs[i] = p['add_prefix_space'] + p['hard'] - elif 'sep' in p: - inputs[i] = self.tokenizer.sep_token + position_ids.append(-1) + return position_ids + + def create_truncation_sequence_from_prompt( + self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt.copy() if prompt is None else prompt.copy() + do_truncate = [] + for part in prompt: + if "truncate" in part: + do_truncate.append(part["truncation"]) + prompt_tokens = set(part.keys()) - set(["text"]) + if len(prompt_tokens) > 0 and part["truncation"]: + logger.warning("{} in template will be truncated, ".format( + prompt_tokens) + "which might degrade performance.") + elif "text" in part: + do_truncate.append(True) else: - raise ValueError('Can not parse {}'.format(p)) + do_truncate.append(False) + return do_truncate + + def create_example_keys_from_prompt(self): + example_keys = set() + for part in self._prompt: + if "text" in part: + example_keys.add(part["text"]) + if "options" in part and isinstance(part["options"], list): + example_keys.update(set(part["options"])) + return example_keys + + def encode(self, example: Dict[str, Any]): + input_text = self.build_inputs_with_prompt(example) + input_names, input_values = ["text"], [input_text] + for name in self.input_feature_names: + input_names.append(name) + input_values.append(getattr(self, name, None)) - return inputs + inputs = [] + for value in list(zip(*input_values)): + inputs.append(dict(zip(input_names, value))) - def wrap_one_example(self, example): - """ Process InputExample according to the predefined template. """ - if self.template is None: - raise ValueError('The template has not been initialized.') - if isinstance(example, InputExample): - text = self.incorporate_template_text(example) - - non_empty_keys = example.keys() - for key in self.registered_text_keys: - if key in non_empty_keys: - non_empty_keys.remove(key) - - keys, values = ['text'], [text] - for name in self.registered_input_names: - keys.append(name) - v = None - if hasattr(self, name) and getattr(self, name) is not None: - v = getattr(self, name) - elif hasattr(self, 'get_default_' + name): - v = getattr(self, 'get_default_' + name)() - setattr(self, name, v) - else: - raise ValueError(""" - Template's part attribute '{}' is registered but not - initialized. Try using template.{} = [...] 
to - initialize or create a get_default_{}(self) - method in your template.""".format(name, name, name)) - values.append(v) - - wrapped_parts_to_tokenize = [] - for value in list(zip(*values)): - wrapped_parts_to_tokenize.append(dict(zip(keys, value))) - - wrapped_parts_not_to_tokenize = { - key: getattr(example, key) - for key in non_empty_keys - } - wrapped_parts_to_tokenize = self.wrapped_tokenizer( - wrapped_parts_to_tokenize) - - return InputFeatures(**wrapped_parts_to_tokenize, - **wrapped_parts_not_to_tokenize) - else: - raise TypeError('InputExample') + input_dict = self.prompt_tokenizer(inputs) + unused_example = { + k: v + for k, v in example.items() if k not in self.example_keys + } - def process_batch(self, batch): - return batch + return {**input_dict, **unused_example} - def save_to(self, data_dir): - with open(os.path.join(data_dir, TEMPLATE_FILE), "w") as f: - json.dump(self.template, f) + def __call__(self, example: Dict[str, Any]): + return self.encode(example=example) + @abstractmethod + def process_batch(self, input_dict): + raise NotImplementedError -class ManualTemplate(Template): - """ - ManualTemplate for hard prompt methods, such as PET, EFL. + def save(self, save_path): + if not os.path.exists(save_path): + os.makedirs(save_path, exist_ok=True) + template_config_file = os.path.join(save_path, TEMPLATE_CONFIG_FILE) + with open(template_config_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(self._prompt, ensure_ascii=False)) + template_param_file = os.path.join(save_path, TEMPLATE_PARAMETER_FILE) + template_state_dict = self.state_dict() + if len(template_state_dict) > 0: + paddle.save(template_state_dict, template_param_file) + + @staticmethod + def extract_template_keywords(prompt: List[Dict[str, Any]]): + keywords = set() + for part in prompt: + keywords.update(part.keys()) + return keywords + + @staticmethod + def parse_template_string(prompt: str, + left_token: Optional[str] = "{", + right_token: Optional[str] = "}"): + """ + Parse the defined string as a sequence of dictionaries. - Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The same as `Template`. - template (str | list): - It describes how to combine text and prompts. For example, - `str`: "{'text':'text_a'} It is {'mask'}." or a corresponding - list of dictionary/set parsed by `parse_template` method. - """ + Args: + prompt: A string comprised of nestable {}, [], integers and strings. - def __init__(self, tokenizer, max_seq_length, template=None): - super().__init__(tokenizer=tokenizer, max_seq_length=max_seq_length) - self.template = template + Returns: + A list of dictionaries corresponding to the input string. + + For example, if we define `prompt` as - def _process_template(self): - if isinstance(self._template, str): - self._template = self.parse_inputs(self._template) + "{'text': 'hypothesis'}基于这一假设{'mask'}推断出{'options': 'label.txt'}", + then this function returns -class SoftTemplate(Template): + [{"text": "hypothesis"}, {"hard": "基于这一假设"}, {"mask": null}, + {"hard": "推断出"}, {"options": ["正确", "错误"]}]. + + Raises: + ValueError: A error occurred parsing an string with unmatched punctuations. + """ + left_stack = [] + parsed = [] + index = 0 + while index < len(prompt): + # Delete extra spaces. + part = {"add_space": " "} if prompt[index] == " " else {} + while index < len(prompt) and prompt[index] == " ": + index += 1 + if index == len(prompt): + break + # Parse blocks with paired tokens like "{ }". 
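+            # Nested braces are matched with a stack; the whole block is then
+            # evaluated as a Python dict (or set for keywords without values).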
+ if prompt[index] == left_token: + left_index = index + while index < len(prompt): + if prompt[index] == left_token: + left_stack.append(index) + elif prompt[index] == right_token: + left_stack.pop() + if len(left_stack) == 0: + break + index += 1 + if index == len(prompt) and len(left_stack) > 0: + raise ValueError( + "{} at position {} has no corresponding {}".format( + left_token, left_index, right_token)) + try: + part_dict = eval(prompt[left_index:index + 1]) + if isinstance(part_dict, set): + part_dict = {k: None for k in part_dict} + part.update(part_dict) + except SyntaxError as error: + logger.error(traceback.format_exc()) + exit() + index += 1 + # Parse simplified discrete prompts. + else: + left_index = index + while index < len(prompt) and prompt[index] != left_token: + index += 1 + part["hard"] = prompt[left_index:index].rstrip(" ") + + if "options" in part: + if os.path.isfile(part["options"]): + with open(part["options"], "r") as fp: + labels = [x.strip() for x in fp] + part["options"] = labels + part["length"] = len(labels) + elif "length" not in "options": + part["length"] = DEFAULT_MAX_OPTIONS + logger.warning( + "[options]: The maximum number of options not defined," + " set as {} by default.".format(DEFAULT_MAX_OPTIONS)) + if "length" in part: + assert part["length"] > 0 + if "hard" in part: + logger.warning( + "Ignore `length` attribute for keyword `hard`.") + if "position" in part: + assert part["position"] >= 0 + if "token_type" in part: + assert part["token_type"] in (0, 1) + parsed.append(part) + return parsed + + +class ManualTemplate(Template): """ - SoftTemplate on the input layer for soft prompt methods, such as p-tuning. + ManualTemplate for discrete prompt methods, such as PET, EFL. Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The same as `Template`. - template (str | list): - It describes how to combine text with both manual and soft prompts. - prompt_encoder (str): - The encoder to project soft embeddings. Support `lstm` and 'mlp'. - Use soft embeddings directly when prompt_encoder is `None`. + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. """ - registered_input_names = ['soft_token_ids', 'mask_ids', 'shortenable_ids'] + template_special_tokens = ["text", "hard", "sep", "mask", "options"] + template_attributes = [ + "length", "position", "token_type", "add_prompt", "add_space", + "add_omask" + ] - def __init__(self, - tokenizer, - max_seq_length, - model=None, - template=None, - prompt_encoder=None, - encoder_hidden_size=None): - super().__init__(tokenizer=tokenizer, max_seq_length=max_seq_length) - if model is None: - self.token_embeddings = None - logger.warning( - "SoftTemplate: The pretrained model is not given. 
It would " - "lead to error unless it is initialized for deployment.") - else: - if type(model).__name__.endswith('Model'): - self.token_embeddings = model.embeddings.word_embeddings - else: - for module in model.children(): - if type(module).__name__.endswith('Model'): - self.token_embeddings = module.embeddings.word_embeddings - break - self.token_embeddings.weight.stop_gradient = True - self.embedding_size = self.token_embeddings.weight.shape[-1] - self.encoder_hidden_size = encoder_hidden_size - if self.encoder_hidden_size is not None and prompt_encoder is None: - logger.warning("`prompt_encoder` is not set yet. Use MLP for " - "soft embeddings' projection by default.") - prompt_encoder = "mlp" - self.prompt_encoder = prompt_encoder - self.template = template - - def _process_template(self): - if isinstance(self._template, str): - self._template = self.parse_inputs(self._template) - self.parse_soft_tokens() - self.generate_parameters() + def __init__(self, prompt: str, tokenizer: PretrainedTokenizer, + max_length: int): + super(ManualTemplate, self).__init__(prompt, tokenizer, max_length) - @property - def prompt_encoder(self): - return self._prompt_encoder + def create_prompt_parameters(self): + return None - @prompt_encoder.setter - def prompt_encoder(self, prompt_encoder): + def process_batch(self, input_dict): + return input_dict - if prompt_encoder is None: - return None - if getattr(self, "_prompt_encoder", None) is not None: - logger.warning( - f"Encoder has already set as {self._prompt_encoder}, change " + - "`prompt_encoder` will reset parameters.") +class SoftLSTM(nn.Layer): + """ + LSTM encoder for soft token embeddings. + """ - self._prompt_encoder = prompt_encoder + def __init__(self, input_size, hidden_size, output_size, activation): + super(SoftLSTM, self).__init__() + self.lstm = nn.LSTM(input_size=input_size, + hidden_size=hidden_size, + num_layers=2, + direction='bidirect', + time_major=False) + self.mlp = nn.Sequential(nn.Linear(2 * hidden_size, + hidden_size), activation, + nn.Linear(hidden_size, output_size)) - if self.encoder_hidden_size is None: - hidden_size = self.embedding_size - else: - hidden_size = self.encoder_hidden_size - if prompt_encoder == 'lstm': - self.lstm_head = nn.LSTM(input_size=self.embedding_size, - hidden_size=hidden_size, - num_layers=2, - direction='bidirect', - time_major=False) - self.mlp_head = nn.Sequential( - nn.Linear(2 * hidden_size, hidden_size), nn.ReLU(), - nn.Linear(hidden_size, self.embedding_size)) - elif prompt_encoder == 'mlp': - self.mlp_head = nn.Sequential( - nn.Linear(self.embedding_size, hidden_size), nn.ReLU(), - nn.Linear(hidden_size, self.embedding_size)) - if hasattr(self, "lstm_head"): - delattr(self, "lstm_head") - else: - raise ValueError( - "Unsupported soft token encoder: {}".format(prompt_encoder)) - - def incorporate_template_text(self, example, template=None): - """ Replace each item in template with real text. 
""" - inputs = template.copy( - ) if self.template is None else self.template.copy() - - for i, p in enumerate(inputs): - if 'text' in p: - inputs[i] = p['add_prefix_space'] + getattr(example, p['text']) - elif 'mask' in p: - inputs[i] = self.tokenizer.mask_token - elif 'hard' in p: - inputs[i] = p['add_prefix_space'] + p['hard'] - elif 'soft' in p: - inputs[i] = p['add_prefix_space'] + p['soft'] - elif 'sep' in p: - inputs[i] = self.tokenizer.sep_token - else: - raise ValueError('can not parse {}'.format(p)) + def forward(self, embeds): + hidden_states, _ = self.lstm(embeds) + return self.mlp(hidden_states) - return inputs - def parse_soft_tokens(self): - inputs = [] - soft_token_ids = [] - num_soft_token = 0 - soft2word_init = {} - soft_id_reindex = {} +class SoftTemplate(Template): + """ + SoftTemplate for continuous prompt methods on the input layer. - for part in self._template: - if 'soft' not in part and 'soft_id' not in part: - soft_token_ids.append(0) - inputs.append(part) - continue + Args: + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. + word_embeddings (`Tensor`): + The word embeddings of pretrained models, which can be obtained by + calling `model.get_input_embeddings().weight`. + soft_embeddings (`Tensor`): + The embeddings of soft tokens, which overwrites `word_embeddings` + as initial weights when defined. + """ + template_special_tokens = [ + "text", "hard", "soft", "soft_id", "sep", "mask", "options" + ] + input_feature_names = [ + "do_truncate", "token_types", "positions", "soft_tokens", "encoder_ids" + ] - if 'soft' in part and part['soft'] is not None: - if 'duplicate' in part: - logger.warning( - 'Ignore ``duplicate``. It is ' - 'incompatible with ``soft`` with text values.') + def __init__(self, + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int, + word_embeddings: Tensor, + soft_embeddings: Tensor = None): + super(SoftTemplate, self).__init__(prompt, + tokenizer, + max_length, + word_embeddings=word_embeddings, + soft_embeddings=soft_embeddings) + + def named_parameters(self): + named_params = [(n, p) + for n, p in self.soft_embeddings.named_parameters()] + named_params.extend([(n, p) + for n, p in self.encoder_list.named_parameters()]) + return named_params + + def parameters(self): + return [p for n, p in self.named_parameters()] + + def create_prompt_parameters(self): + self._prompt, soft_token_config = self.parse_soft_prompt() + self.embed_size = self.word_embeddings.weight.shape[1] + soft2word, self.soft_tokens, self.num_soft_token = soft_token_config + self._init_soft_parameters(soft2word) + self.encoder_ids, self.encoder_list = self._create_soft_encoders() + + def process_batch(self, input_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Convert input_ids to inputs_embeds. + + Soft tokens are encoded soft_embeddings with predefined encoders. + For other tokens, use word embeddings in pretrained model. 
+ """ + word_embeds = self.word_embeddings(input_dict["input_ids"]) + if "attention_mask" not in input_dict or input_dict[ + "attention_mask"] is None: + pad_token_id = self.tokenizer.pad_token_id + attention_mask = paddle.unsqueeze( + (input_dict["input_ids"] == pad_token_id).astype("float32") * + -1e4, + axis=[1, 2]) + input_dict["attention_mask"] = attention_mask + input_dict["input_ids"] = None + soft_embeds = self.soft_embeddings(input_dict["soft_token_ids"]) + soft_shape = soft_embeds.shape + soft_embeds = soft_embeds.reshape([-1, soft_shape[-1]]) + for encoder_id in range(1, len(self.encoder_list)): + to_encode = paddle.where(input_dict["encoder_ids"] == encoder_id) + to_encode = to_encode[0] * soft_shape[1] + to_encode[1] + to_encode = to_encode.squeeze(1) + to_encode_embeds = soft_embeds[to_encode] + to_encode_embeds = to_encode_embeds.reshape( + [soft_shape[0], -1, soft_shape[-1]]) + encoder = self.encoder_list[encoder_id] + encoded = encoder(to_encode_embeds) + encoded = encoded.reshape([-1, soft_shape[-1]]) + soft_embeds = paddle.scatter(soft_embeds, to_encode, encoded) + soft_embeds = soft_embeds.reshape([soft_shape[0], -1, soft_shape[-1]]) + soft_token_ids = input_dict["soft_token_ids"].unsqueeze(2) + input_dict["inputs_embeds"] = paddle.where(soft_token_ids > 0, + soft_embeds, word_embeds) + return input_dict + + def parse_soft_prompt(self): + """ + Unify the form of continuous prompts as {"soft": "xxx"} and create + continuous token id sequence for each part in template. + + Returns: + `List[Dict[str, str]]`: Template with continuous prompt formated as {"soft": "xxx"}. + `Tuple[Dict[int, int], List[List[int]], int]`: + - Mapping from continuous ids to word ids for initialization. + - Continuous ids for each part. Id 0 denotes none-continuous part. + - Number of unique coutinuous tokens. + """ + prompt = self._prompt.copy() + num_soft_token = 1 + soft_prompt = [] + soft_token_ids = [] + soft2word = {} + soft_id_reindex = {} - # Get word tokens and ids for soft token initialization. - init_token_ids = self.tokenizer( - part['add_prefix_space'] + part['soft'], + for part in prompt: + part_prompt = None + # Copy non-continuous prompt part. + if "soft" not in part and "soft_id" not in part: + soft_prompt.append(part) + soft_token_ids.append(None) + + # Deal with continuous prompt with specific initialization. + elif "soft" in part and part["soft"] is not None: + + # Get word tokens for initialization. + if "add_space" in part: + part["soft"] = part["add_space"] + part["soft"] + word_token_ids = self.tokenizer( + part["soft"], add_special_tokens=False, - return_token_type_ids=False)['input_ids'] - init_tokens = self.tokenizer.convert_ids_to_tokens( - init_token_ids) - assert len(init_tokens) == len(init_token_ids) - - # Create soft ids and corresponding ``soft`` part in template. - next_num_soft = num_soft_token + 1 - num_soft_token += len(init_tokens) - id_list = list(range(next_num_soft, num_soft_token + 1)) - - soft_token_ids.extend(id_list) - inputs.extend([{ - 'add_prefix_space': part['add_prefix_space'], - 'soft': token - } for token in init_tokens]) - for soft_id, word_id in zip(id_list, init_token_ids): - soft2word_init[soft_id] = word_id - - # Check the ids of ``soft`` and ``soft_id``. - if 'soft_id' in part: - if part['soft_id'] in soft_id_reindex: - assert id_list == soft_id_reindex[part['soft_id']] + return_token_type_ids=False)["input_ids"] + + # Create continuous token ids. 
+ soft_id_list = list( + range(num_soft_token, num_soft_token + len(word_token_ids))) + num_soft_token += len(word_token_ids) + + for soft_id, word_id in zip(soft_id_list, word_token_ids): + soft2word[soft_id] = word_id + + # Check `length` if exists. + if "length" in part: + if part["length"] < len(word_token_ids): + logger.warning("Ignore `length` because it is less than" + " the length of defined word sequence.") + elif part["length"] > len(word_token_ids): + length = part["length"] - len(word_token_ids) + soft_id_list += list( + range(num_soft_token, num_soft_token + length)) + num_soft_token += length + part["soft"] += self.tokenizer.unk_token * length + + soft_token_ids.append(soft_id_list) + part_prompt = {"soft": part["soft"]} + + # Check or record `soft_id` if exists. + if "soft_id" in part: + if part["soft_id"] in soft_id_reindex: + assert soft_id_list == soft_id_reindex[part["soft_id"]] else: - soft_id_reindex[part['soft_id']] = id_list - continue - - if 'soft_id' in part and part['soft_id'] in soft_id_reindex: - if 'duplicate' in part: - logger.warnings('Ignore ``duplicate``. Initialize ' - '``soft`` by ``soft_id`` directly.') - id_list = soft_id_reindex[part['soft_id']] - - elif 'duplicate' in part: - assert isinstance(part['duplicate'], int) - if 'same' in part: - num_soft_token += 1 - id_list = [num_soft_token for _ in range(part['duplicate'])] - else: - next_num_soft = num_soft_token + 1 - num_soft_token += part['duplicate'] - id_list = list(range(next_num_soft, num_soft_token + 1)) + soft_id_reindex[part["soft_id"]] = soft_id_list + + # Deal with continous prompt defined by `soft_id`. + elif "soft_id" in part and part["soft_id"] in soft_id_reindex: + soft_id_list = soft_id_reindex[part["soft_id"]] + if "length" in part: + logger.warning("Ignore `length` because it is incompatible" + " with existing `soft_id`.") + soft_token_ids.append(soft_id_list) + part_prompt = { + "soft": [self.tokenizer.unk_token] * len(soft_id_list) + } + + # Deal with continous prompt with random initialization. 
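# Illustrative prompt fragments for the branches handled here (the Chinese
# strings are placeholders):
#   {'soft': '请判断类别:'}        -> soft tokens whose embeddings are
#                                     initialized from the word embeddings of
#                                     the given text.
#   {'soft_id': 1} appearing twice -> the later occurrence reuses the soft
#                                     token ids created for the first one.
#   {'soft': None, 'length': 3}    -> three soft tokens with random
#                                     initialization (the branch below).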
else: - num_soft_token += 1 - id_list = [num_soft_token] - - if 'soft_id' in part: - soft_id_reindex[part['soft_id']] = id_list + if "length" not in part: + part["length"] = 1 + soft_id_list = list( + range(num_soft_token, num_soft_token + part["length"])) + num_soft_token += part["length"] + soft_token_ids.append(soft_id_list) + if "soft_id" in part: + soft_id_reindex[part["soft_id"]] = soft_id_list + part_prompt = { + "soft": [self.tokenizer.unk_token] * len(soft_id_list) + } + if part_prompt is not None: + for key in part: + if key not in ["soft", "soft_id", "length", "add_space"]: + part_prompt[key] = part[key] + soft_prompt.append(part_prompt) + + if num_soft_token == 1: + raise ValueError("Soft prompt expected for SoftTemplate, but" + " get {}.".format(self._prompt)) + + soft_token_config = (soft2word, soft_token_ids, num_soft_token) + + return soft_prompt, soft_token_config + + def _init_soft_parameters(self, soft2word: Dict[int, int]): + if self.soft_embeddings is not None: + if self.soft_embeddings.weight.shape[0] != self.num_soft_token: + raise ValueError( + "Given soft embeddings are incompatible with those " + "defined in template \"{}\"".format(self._prompt)) + else: + self.soft_embeddings = nn.Embedding(self.num_soft_token, + self.embed_size) + weight = self.soft_embeddings.weight.clone().detach() + for soft_id, word_id in soft2word.items(): + word_id = paddle.to_tensor(word_id) + weight[soft_id] = self.word_embeddings(word_id) + self.soft_embeddings.weight.set_value(weight) + + def _create_soft_encoders(self, + output_size: int = None, + activation: nn.Layer = None): + encoder_list = [nn.Identity()] + encoder2id = {} + encoder_ids = [] + output_size = self.embed_size if output_size is None else output_size + activation = nn.ReLU() if activation is None else activation + for part in self._prompt: + if "encoder" not in part or part["encoder"] is None: + encoder_ids.append(0) + else: + if part["encoder"] not in encoder2id: + encoder2id[part["encoder"]] = len(encoder_list) + encoder_ids.append(len(encoder_list)) + if "hidden_size" in part: + hidden_size = part["hidden_size"] + else: + hidden_size = self.embed_size + if part["encoder"] == "lstm": + encoder_list.append( + SoftLSTM(self.embed_size, hidden_size, output_size, + activation)) + elif part["encoder"] == "mlp": + encoder_list.append( + nn.Sequential( + nn.Linear(self.embed_size, + hidden_size), activation, + nn.Linear(hidden_size, output_size))) + else: + raise ValueError("Encoder {} not supported.".format( + part["encoder"])) + else: + encoder_ids.append(encoder2id[part["encoder"]]) + encoder_list = nn.LayerList(encoder_list) + return encoder_ids, encoder_list + + def build_inputs_with_prompt( + self, + example: Dict[str, Any], + prompt: Optional[List[Dict[str, Any]]] = None) -> List[str]: + inputs = super(SoftTemplate, + self).build_inputs_with_prompt(example, prompt) + for index, part in enumerate(inputs): + if isinstance(part, dict) and "soft" in part: + inputs[index] = part["soft"] + return inputs - soft_token_ids.extend(id_list) - inputs.extend([{ - 'add_prefix_space': part['add_prefix_space'], - 'soft': self.tokenizer.cls_token - } for _ in range(len(id_list))]) + def save(self, save_path): + super(SoftTemplate, self).save(save_path) + template_param_file = os.path.join(save_path, TEMPLATE_PARAMETER_FILE) + paddle.save(self.state_dict(), template_param_file) - self._template = inputs - self.soft_token_ids = soft_token_ids - self.num_soft_token = num_soft_token - self.soft2word_init = soft2word_init - if 
self.num_soft_token == 0: - logger.warning('No soft tokens in template. '\ - 'Use ManualTemplate for better performance.') +class PrefixTemplate(SoftTemplate): + """ + PrefixTemplate for continuous prompt methods on every layer. - def generate_parameters(self): - """ - Generate parameters for soft tokens. - """ - if self.num_soft_token == 0 or self.token_embeddings is None: - return None - self.soft_embeddings = nn.Embedding(self.num_soft_token + 1, - self.embedding_size) - - weight = self.soft_embeddings.weight.clone().detach() - for soft_id, word_id in self.soft2word_init.items(): - weight[soft_id] = self.token_embeddings(paddle.to_tensor(word_id)) - self.soft_embeddings.weight.set_value(weight) - - def process_batch(self, batch): - word_embeds = self.token_embeddings(batch["input_ids"]) - batch["input_ids"] = None - if not hasattr(self, - "soft_embeddings") or batch["soft_token_ids"] is None: - batch["inputs_embeds"] = word_embeds - else: - soft_embeds = self.soft_embeddings(batch["soft_token_ids"]) - if hasattr(self, "lstm_head"): - soft_embeds = self.lstm_head(soft_embeds)[0] - if hasattr(self, "mlp_head"): - soft_embeds = self.mlp_head(soft_embeds) + Args: + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. + model (`PretrainedModel`): + An instance of PretrainedModel. + """ + template_special_tokens = [ + "text", "hard", "prefix", "soft", "sep", "mask", "options" + ] + input_feature_names = [ + "do_truncate", "token_types", "positions", "soft_tokens", "encoder_ids" + ] - inputs_embeds = paddle.where( - (batch["soft_token_ids"] > 0).unsqueeze(-1), soft_embeds, - word_embeds) - batch["inputs_embeds"] = inputs_embeds - return batch + def __init__(self, + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int, + model: PretrainedModel, + prefix_dropout: float = 0.1): + self.n_layer, self.n_heads = self._get_config(model) + super(PrefixTemplate).__init__(prompt, tokenizer, max_length, + model.get_input_embeddings()) + self.dropout = nn.Dropout(p=prefix_dropout) + + @staticmethod + def _get_config(model): + names = [n for n, p in model.named_parameters() if "layers" in n] + pattern = re.compile(r".*?\.(\d+)\..*?") + indices = [] + for name in names: + result = pattern.match(name) + if result is not None: + indices.append(int(result.group(1))) + num_layer = max(indices) + 1 + layer_names = names[0].split(".")[:-2] + layer = model + for name in layer_names: + layer = getattr(layer, name) + num_heads = layer.num_heads + + return num_layer, num_heads + + def parse_soft_prompt(self): + prompt = self._prompt.copy() + + for index, part in enumerate(prompt): + if "soft" in part: + raise ValueError("Keyward `soft` should not be used in " + "PrefixTemplate.") + if "prefix" not in part: + continue + if index != 0: + raise ValueError("Keyword `prefix` should locate at the " + "beginning of template.") + part["soft"] = part["prefix"] + part.pop("prefix") + prompt[index] = part + + self._prompt = prompt + return super(PrefixTemplate, self).parse_soft_prompt() + + def process_batch(self, input_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + word_embeds = self.word_embeddings(input_dict["input_ids"]) + if "attention_mask" not in input_dict or input_dict[ + "attention_mask"] is None: + pad_token_id = self.tokenizer.pad_token_id + 
attention_mask = paddle.unsqueeze( + (input_dict["input_ids"] == pad_token_id).astype("float32") * + -1e4, + axis=[1, 2]) + input_dict["attention_mask"] = attention_mask + input_dict["input_ids"] = None + + batch_size, _ = input_dict["soft_token_ids"].shape + soft_token_ids = paddle.masked_select(input_dict["soft_token_ids"], + input_dict["soft_token_ids"] > 0) + soft_token_ids = soft_token_ids.reshape([batch_size, -1]) + _, soft_len = soft_token_ids.shape + + input_dict["inputs_embeds"] = word_embeds[:, soft_len:, :] + + soft_embeds = self.soft_embeddings(soft_token_ids) + for encoder_id in range(1, len(self.encoder_list)): + to_encode = paddle.where(input_dict["encoder_ids"] == encoder_id) + encoded = self.encoder_list[encoder_id](to_encode) + soft_embeds = paddle.where(input_dict["encoder_ids"] == encoder_id, + encoded, soft_embeds) + soft_embeds = soft_embeds.reshape([ + batch_size, soft_len, self.n_layer * 2, self.n_heads, + self.embed_size // self.n_heads + ]) + soft_embeds = self.dropout(soft_embeds) + soft_embeds = paddle.transpose(soft_embeds, perm=[2, 0, 3, 1, 4]) + soft_embeds = paddle.split(soft_embeds, num_or_sections=self.n_layer) + soft_embeds = [paddle.split(emb, 2) for emb in soft_embeds] + soft_embeds = [[x.squeeze(0) for x in emb] for emb in soft_embeds] + input_dict["past_key_values"] = tuple( + [tuple(emb) for emb in soft_embeds]) + return input_dict + + def _create_soft_encoders(self): + output_size = self.embed_size * self.n_layer * 2 + activation = nn.Tanh() + return super(PrefixTemplate, + self)._create_soft_encoders(output_size, activation) class AutoTemplate(object): @@ -465,83 +816,70 @@ class AutoTemplate(object): AutoTemplate can help you automatically create the relevant Template given the provided prompt. """ - registered_text_keys = ['text_a', 'text_b'] + default_text_keyword = "text_a" def __init__(self, *args, **kwargs): raise EnvironmentError( '{} is designed to be instantiated using {}.create_from('\ - 'template, tokenizer, text_list, ...)'.format( + 'prompt, tokenizer, max_length, ...)'.format( self.__class__.__name__, self.__class__.__name__)) - @classmethod - def parse_inputs(cls, inputs): - return parse_template(inputs) - @classmethod def create_from(cls, - template, - tokenizer, - max_seq_length, - model=None, - prompt_encoder=None, - encoder_hidden_size=None): - if template is None: - template = "{'soft'}" - if isinstance(template, str): - template = cls.parse_inputs(template) - template_keys = cls._extract_template_keys(template) - if 'text' not in template_keys: - soft_template = [] - for item in template: - if 'hard' in item: - soft_template.append({ - 'add_prefix_space': '', - 'soft': item['hard'] - }) - else: - soft_template.append(item) - text_item = [{ - 'add_prefix_space': ' ', - 'text': cls.registered_text_keys[0] - }] - template = text_item + soft_template - template_keys = cls._extract_template_keys(template) - - if 'mask' not in template_keys: - template.append({'add_prefix_space': ' ', 'mask': None}) - - if 'soft' in template_keys: - return SoftTemplate(tokenizer=tokenizer, - template=template, - max_seq_length=max_seq_length, - model=model, - prompt_encoder=prompt_encoder, - encoder_hidden_size=encoder_hidden_size) + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int = 512, + model: PretrainedModel = None, + soft_embeddings: Tensor = None): + # Default template if not defined. 
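# A hedged sketch of the dispatch performed by create_from; `tokenizer` and
# `model` are assumed to come from AutoTokenizer / AutoModelForMaskedLM.
manual = AutoTemplate.create_from(
    "{'text': 'text_a'}这条评论是{'mask'}的。", tokenizer, max_length=128)
soft = AutoTemplate.create_from(
    "{'soft': None, 'length': 10}{'text': 'text_a'}{'mask'}",
    tokenizer, max_length=128, model=model)
# `manual` is a ManualTemplate and `soft` a SoftTemplate; a prompt that starts
# with {'prefix': ...} would dispatch to PrefixTemplate instead.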
+ if prompt is None: + prompt = "{'soft'}{'text': 'text_a'}{'mask'}" + + if isinstance(prompt, str): + prompt = Template.parse_template_string(prompt) + template_keywords = Template.extract_template_keywords(prompt) + + # Complement simplified template as ManualTemplate-style in form. + if "text" not in template_keywords: + prompt = [{"text": cls.default_text_keyword}] + prompt + if "mask" not in template_keywords: + prompt = prompt + [{"mask": None}] + + # Choose Template according to template keywords. + if "prefix" in template_keywords: + return PrefixTemplate(prompt=prompt, + tokenizer=tokenizer, + max_length=max_length, + model=model) + elif "soft" in template_keywords or "soft_id" in template_keywords: + word_embeddings = model.get_input_embeddings() + return SoftTemplate(prompt=prompt, + tokenizer=tokenizer, + max_length=max_length, + word_embeddings=word_embeddings) else: - return ManualTemplate(tokenizer=tokenizer, - max_seq_length=max_seq_length, - template=template) + return ManualTemplate(prompt=prompt, + tokenizer=tokenizer, + max_length=max_length) @classmethod def load_from(cls, - data_dir, - tokenizer, - max_seq_length, - model=None, - prompt_encoder=None, - encoder_hidden_size=None): - with open(os.path.join(data_dir, TEMPLATE_FILE), "r") as f: - template = json.load(f) - return cls.create_from(template, tokenizer, max_seq_length, model, - prompt_encoder, encoder_hidden_size) - - @classmethod - def _extract_template_keys(cls, inputs: list): - template_keys = set() - for item_dict in inputs: - for key, value in item_dict.items(): - template_keys.add(key) - if key == 'text': - assert value in cls.registered_text_keys, 'No ``{}`` attribute '\ - 'in InputExample.'.format(value) - return template_keys + data_path: os.PathLike, + tokenizer: PretrainedTokenizer, + max_length: int, + model: PretrainedModel = None): + template_config_file = os.path.join(data_path, TEMPLATE_CONFIG_FILE) + if not os.path.isfile(template_config_file): + raise ValueError("{} not found under {}".format( + TEMPLATE_CONFIG_FILE, data_path)) + with open(template_config_file, "r") as fp: + prompt = json.loads(fp.readline().strip()) + # TODO (Huijuan): Load all configs from data_path. + template = cls.create_from(prompt=prompt, + tokenizer=tokenizer, + max_length=max_length, + model=model) + template_param_file = os.path.join(data_path, TEMPLATE_PARAMETER_FILE) + if os.path.isfile(template_param_file): + template.set_state_dict(paddle.load(template_param_file)) + return template diff --git a/paddlenlp/prompt/verbalizer.py b/paddlenlp/prompt/verbalizer.py index ac64c861cc64..bb412259d78b 100644 --- a/paddlenlp/prompt/verbalizer.py +++ b/paddlenlp/prompt/verbalizer.py @@ -12,312 +12,337 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod -from collections import defaultdict import os import copy import json +from abc import abstractmethod +from typing import Any, Dict, List import numpy as np -from typing import List, Dict, Union import paddle import paddle.nn as nn import paddle.nn.functional as F +from paddle import Tensor +from paddlenlp.transformers import PretrainedTokenizer, PretrainedModel +from paddlenlp.utils.log import logger -from ..utils.log import logger - -__all__ = [ - "Verbalizer", "MultiMaskVerbalizer", "ManualVerbalizer", "SoftVerbalizer" -] +__all__ = ["Verbalizer", "ManualVerbalizer", "SoftVerbalizer"] -VERBALIZER_FILE = "verbalizer.json" +# Verbalizer used to be saved in a file. 
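# Round-trip sketch for the template persistence API defined above in
# template.py; `template` is assumed to be one of the templates created with
# AutoTemplate.create_from, and the export directory is a placeholder.
export_path = "./checkpoints/export"
template.save(export_path)  # writes the template config (and parameters, if any)
reloaded = AutoTemplate.load_from(
    export_path, tokenizer, max_length=128, model=model)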
+VERBALIZER_CONFIG_FILE = "verbalizer_config.json" +VERBALIZER_PARAMETER_FILE = "verbalizer_state.pdparams" class Verbalizer(nn.Layer): """ - Base verbalizer class used to process the outputs and labels. + Base class for [`Verbalizer`]. Args: - labels (list): - The sequence of labels in task. - + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. """ - def __init__(self, labels): - super().__init__() - self.labels = labels + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer, + **kwargs): + super(Verbalizer, self).__init__() + for key, value in kwargs.items(): + setattr(self, key, value) + self.tokenizer = tokenizer + self.token_aggregate_type = kwargs.get("token_aggregate_type", "mean") + self.word_aggregate_type = kwargs.get("word_aggregate_type", "mean") + self.mask_aggregate_type = kwargs.get("mask_aggregate_type", "product") + self.post_log_softmax = kwargs.get("post_log_sigmoid", True) + self.label_token_weight = kwargs.get("label_token_weight", None) + if self.label_token_weight is not None: + self.label_token_weight = self.normalize( + self.project(self.label_token_weight.unsqueeze(0))) + self.label_words = label_words @property def labels(self): - labels = getattr(self, "_labels", None) - if labels is None: - raise RuntimeError("`labels` is not set yet.") - return labels + if not hasattr(self, "_labels"): + raise RuntimeError("Attribute `labels` is not set yet.") + return self._labels @labels.setter def labels(self, labels): if labels is not None: self._labels = sorted(labels) - else: - self._labels = None @property def label_words(self): - label_words = getattr(self, "_label_words", None) - if label_words is None: - raise RuntimeError("`label_words not set yet.") - return label_words + if not hasattr(self, "_label_words"): + raise RuntimeError("Mapping from labels to words is not set yet.") + return self._label_words @label_words.setter - def label_words(self, label_words: Union[List, Dict]): + def label_words(self, label_words: Dict): if label_words is None: return None - if isinstance(label_words, dict): - new_labels = sorted(list(label_words.keys())) - if self._labels is None: - self._labels = new_labels - elif new_labels != self.labels: - raise ValueError( - f"The given `label_words` {new_labels} does not match " + - f"predefined labels {self.labels}.") - self._label_words = [label_words[k] for k in self.labels] - elif isinstance(label_words, list): - if self._labels is None: - raise ValueError( - "`labels` should be set as the given `label_words` is " - "a list. Make sure that the order is compatible.") - if len(self.labels) != len(label_words): - raise ValueError( - "The length of given `label_words` and predefined " + - "labels do not match.") - self._label_words = label_words - else: - raise TypeError('Unsupported type {} for label_words'.format( - type(label_words))) - self.process_label_words() - - @property - def labels_to_ids(self): - if not hasattr(self, 'labels'): - raise RuntimeError( - 'Property labels_to_ids has not been set before used.') - return {k: i for i, k in enumerate(self.labels)} - - @property - def ids_to_labels(self): - if not hasattr(self, 'labels'): - raise RuntimeError( - 'Property ids_to_labels has not been set before used.') - return {i: k for i, k in enumerate(self.labels)} - - @staticmethod - def add_prefix(label_words, prefix): - """ Add prefix to get expected token ids. 
""" - if isinstance(label_words[0], str): - label_words = [[word] for word in label_words] - - new_label_words = [] - for words_per_label in label_words: - new_words_per_label = [] - for word in words_per_label: - new_words_per_label.append(prefix + word) - new_label_words.append(new_words_per_label) - return new_label_words + self.labels = list(label_words.keys()) + self.labels_to_ids = { + label: idx + for idx, label in enumerate(self._labels) + } + self._words = [] + for label in self._labels: + words = label_words[label] + if isinstance(words, str): + words = [words] + self._words.append(words) + self._label_words = { + label: word + for label, word in zip(self._labels, self._words) + } + self.preprocess_label_words() + self.create_parameters() @abstractmethod - def process_label_words(self, ): - """ A hook to process verbalizer when it is set. """ - raise NotImplementedError - - @abstractmethod - def project(self, logits, **kwargs): - """ - Project the logits with shape ```[..., vocab_size]``` into - label_word_logits with shape ```[..., label_words]```. + def create_parameters(self): + """ + A hook to create parameters for mapping from labels to words. """ raise NotImplementedError - @staticmethod - def aggregate(embeddings, mask=None, atype='mean', ndim=2): - """ - Aggregate embeddings at the last dimension according to `atype` - if its number of dimensions is greater than `ndim`. - Used to handle multiple tokens for words and multiple words - for labels. + def preprocess_label_words(self): + label_token_ids = [] + for label_word in self._words: + word_token_ids = [] + for word in label_word: + token_ids = self.tokenizer.encode(word, + add_special_tokens=False, + return_token_type_ids=False) + word_token_ids.append(token_ids["input_ids"]) + label_token_ids.append(word_token_ids) + + max_num_words = max([len(words) for words in self._words]) + max_num_tokens = max([ + max([len(token_ids) for token_ids in word_token_ids]) + for word_token_ids in label_token_ids + ]) + token_ids_shape = [len(self.labels), max_num_words, max_num_tokens] + token_ids = np.zeros(token_ids_shape) + word_mask = np.zeros(token_ids_shape[:-1]) + token_mask = np.zeros(token_ids_shape) + for label_id, word_token_ids in enumerate(label_token_ids): + word_mask[label_id][:len(word_token_ids)] = 1 + for word_id, tokens in enumerate(word_token_ids): + token_ids[label_id][word_id][:len(tokens)] = tokens + token_mask[label_id][word_id][:len(tokens)] = 1 + self.token_ids = paddle.to_tensor(token_ids, + dtype="int64", + stop_gradient=True) + self.word_mask = paddle.to_tensor(word_mask, + dtype="float32", + stop_gradient=True) + self.token_mask = paddle.to_tensor(token_mask, + dtype="int64", + stop_gradient=True) - Args: - embeddings (paddle.Tensor): - The original embeddings. - atype (str): - The aggregation strategy, including mean and first. - ndim (str): - The aggregated embeddings' number of dimensions. 
+ def convert_labels_to_ids(self, label: str): + assert isinstance(label, str) + return self.labels_to_ids[label] - """ - if embeddings.ndim > ndim and atype is not None: - if atype == 'mean': - if mask is None: - return embeddings.mean(axis=-1) - return (embeddings * mask.unsqueeze(0)).sum( - axis=-1) / (mask.unsqueeze(0).sum(axis=-1) + 1e-10) - elif atype == 'max': - if mask is None: - return embeddings.max(axis=-1) - return (embeddings - 1e4 * (1 - mask.unsqueeze(0))).max(axis=-1) - elif atype == 'first': - return embeddings[..., 0] - else: - raise ValueError('Unsupported aggregate type {}'.format(atype)) - return embeddings + def convert_ids_to_labels(self, index: int): + assert isinstance(index, int) + return self.labels[index] - def normalize(self, logits): - """ Normalize the logits of every example. """ - new_logits = F.softmax(logits.reshape(logits.shape[0], -1), axis=-1) - return new_logits.reshape(*logits.shape) + def project(self, outputs): + """ + Fetch label word predictions from outputs over vocabulary. + """ + token_ids = self.token_ids.reshape([-1]) + label_token_outputs = outputs.index_select(index=token_ids, axis=-1) + label_shape = [*outputs.shape[:-1], *self.token_ids.shape] + label_token_outputs = label_token_outputs.reshape(label_shape) + label_word_outputs = self.aggregate(label_token_outputs, + self.token_mask, + self.token_aggregate_type) + label_word_outputs -= 1e4 * (1 - self.word_mask) + return label_word_outputs + + def process_outputs(self, outputs, masked_positions: Tensor = None): + """ + Process outputs of `PretrainedModelForMaskedLM` over vocabulary. + """ + if masked_positions is None: + return outputs + batch_size, _, num_pred = outputs.shape + outputs = outputs.reshape([-1, num_pred]) + outputs = paddle.gather(outputs, masked_positions) + outputs = outputs.reshape([batch_size, -1, num_pred]) + return outputs + + def aggregate(self, outputs: Tensor, mask: Tensor, atype: str): + """ + Aggregate multiple tokens/words for each word/label. + """ + mask = mask.unsqueeze(0) + if atype == "mean": + outputs = outputs * mask + outputs = outputs.sum(axis=-1) / (mask.sum(axis=-1) + 1e-15) + elif atype == "max": + outputs = (outputs - 1e4 * (1 - mask)).max(axis=-1) + elif atype == "first": + index = paddle.to_tensor([0]) + outputs = paddle.index_select(outputs, index, axis=-1) + else: + raise ValueError( + "Strategy {} is not supported to aggregate multiple " + "tokens.".format(atype)) + return outputs - def from_file(self, path): + def normalize(self, outputs: Tensor): """ - Load labels and corresponding words from files. + Normalize the outputs over the whole vocabulary. """ - raise NotImplementedError + batch_size = outputs.shape[0] + outputs = F.softmax(outputs.reshape([batch_size, -1]), + axis=-1).reshape(outputs.shape) + return outputs - def save_to(self, path): - label_state = [self.labels, self.token_ids.numpy().tolist()] - with open(os.path.join(path, VERBALIZER_FILE), "w") as f: - json.dump(label_state, f) + def calibrate(self, label_word_outputs: Tensor): + """ + Calibrate predictions with pre-defined weights over the whole vocabulary. 
+ """ + if self.label_token_weight.dim() != 1: + raise ValueError("Weights of label tokens should be a 1-D tensor.") + weight_shape = self.label_token_weight.shape + output_shape = label_word_outputs.shape + if weight_shape[1:] != output_shape[1:] or weight_shape[0] != 1: + raise ValueError( + "Shapes of label token weights and predictions do not match, " + "got {} and {}.".format(weight_shape, output_shape)) + label_word_outputs /= (self.label_token_weight + 1e-15) + batch_size = label_word_outputs.shape0[0] + label_word_outputs = paddle.mean( + label_word_outputs.reshape([batch_size, -1])).reshape(output_shape) + + return label_word_outputs + + def save(self, save_path): + if not os.path.exists(save_path): + os.makedirs(save_path, exist_ok=True) + verb_config_file = os.path.join(save_path, VERBALIZER_CONFIG_FILE) + with open(verb_config_file, "w", encoding="utf-8") as fp: + json.dump(self.label_words, fp, ensure_ascii=False) + verb_params_file = os.path.join(save_path, VERBALIZER_PARAMETER_FILE) + verb_state_dict = self.state_dict() + if len(verb_state_dict) > 0: + paddle.save(self.state_dict(), VERBALIZER_PARAMETER_FILE) @classmethod - def load_from(cls, path): - with open(os.path.join(path, VERBALIZER_FILE), "r") as f: - label_state = json.load(f) - return label_state + def load_from(cls, data_path: os.PathLike, tokenizer: PretrainedTokenizer): + verb_config_file = os.path.join(data_path, VERBALIZER_CONFIG_FILE) + if not os.path.isfile(verb_config_file): + raise ValueError("{} not found under {}".format( + VERBALIZER_CONFIG_FILE, data_path)) + with open(verb_config_file, "r") as fp: + label_words = json.load(fp) + + verbalizer = cls(label_words, tokenizer) + verb_state_file = os.path.join(data_path, VERBALIZER_PARAMETER_FILE) + if os.path.isfile(verb_state_file): + verbalizer.set_state_dict(paddle.load(verb_state_file)) + logger.info( + "Loading verbalizer state dict from {}".format(verb_state_file)) + return verbalizer class ManualVerbalizer(Verbalizer): """ - Manual Verbalizer to map labels to words for hard prompt methods. + ManualVerbalizer defines mapping from labels to words manually. Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The tokenizer of pretrained models. - labels (list): - The sequence of all labels. - label_words (dict or list): - The dictionary or corresponding list to map labels to words. - prefix (str): - The prefix string of words, used in PLMs like RoBERTa, which is sensitive to the prefix. + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. """ - def __init__(self, tokenizer, labels=None, label_words=None, prefix=None): - super().__init__(labels=labels) - self.tokenizer = tokenizer - self.prefix = prefix - self.label_words = label_words - - def process_label_words(self): - """ Create the label-word-token array and its corresponding mask. 
""" - if self.prefix is not None: - self._label_words = self.add_prefix(self.label_words, self.prefix) - - all_ids = [] - for words_per_label in self.label_words: - word_ids = [] - for word in words_per_label: - word_ids.append( - self.tokenizer.encode( - word, - add_special_tokens=False, - return_token_type_ids=False)["input_ids"]) - all_ids.append(word_ids) - - max_num_words = max([len(words) for words in self.label_words]) - max_num_tokens = max([ - max([len(token_ids) for token_ids in word_ids]) - for word_ids in all_ids - ]) - token_ids_shape = [len(self.labels), max_num_words, max_num_tokens] - token_ids = np.zeros(shape=token_ids_shape) - token_mask = np.zeros(shape=token_ids_shape) - word_mask = np.zeros(shape=[len(self.labels), max_num_words]) - for label_i, ids_per_label in enumerate(all_ids): - word_mask[label_i][:len(ids_per_label)] = 1 - for word_i, ids_per_word in enumerate(ids_per_label): - token_ids[label_i][word_i][:len(ids_per_word)] = ids_per_word - token_mask[label_i][word_i][:len(ids_per_word)] = 1 - self.token_ids = paddle.to_tensor(token_ids, - dtype="int64", - stop_gradient=True) - self.token_ids_mask = paddle.to_tensor(token_mask, - dtype="int64", - stop_gradient=True) - self.word_ids_mask = paddle.to_tensor(word_mask, - dtype="float32", - stop_gradient=True) - - def project(self, logits): - word_shape = [*logits.shape[:-1], *self.token_ids.shape] - token_logits = logits.index_select(index=self.token_ids.reshape([-1]), - axis=-1).reshape(word_shape) - word_logits = self.aggregate(token_logits, self.token_ids_mask) - return word_logits - - def process_outputs(self, logits, inputs, **kwargs): - mask_ids = inputs["mask_ids"].unsqueeze(2) - real_len = logits.shape[1] - mask_ids = mask_ids[:, -real_len:] - logits = paddle.where(mask_ids == 1, logits, paddle.zeros_like(logits)) - logits = logits.sum(axis=1) / mask_ids.sum(axis=1) - - word_logits = self.project(logits) - label_logits = self.aggregate(word_logits, self.word_ids_mask) - return label_logits + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer): + super(ManualVerbalizer, self).__init__(label_words=label_words, + tokenizer=tokenizer) + + def create_parameters(self): + return None + + def aggregate_multiple_mask(self, outputs: Tensor, atype: str = None): + if atype is None: + return outputs + assert outputs.ndim == 3 + if atype == "mean": + outputs = outputs.sum(axis=1) + elif atype == "max": + outputs = outputs.max(axis=1) + elif atype == "first": + index = paddle.to_tensor([0]) + outputs = paddle.index_select(outputs, index, axis=1) + elif atype == "product": + new_outputs = outputs[:, 0, :] + for index in range(1, outputs.shape[1]): + new_outputs *= outputs[:, index, :] + outputs = new_outputs + else: + raise ValueError( + "Strategy {} is not supported to aggregate multiple " + "tokens.".format(atype)) + return outputs + + def process_outputs(self, + outputs: Tensor, + masked_positions: Tensor = None, + **kwargs): + """ + Process outputs over the vocabulary, including the following steps: - @classmethod - def from_file(cls, tokenizer, label_file, prefix=None, delimiter="=="): - with open(label_file, "r", encoding="utf-8") as fp: - label_words = defaultdict(list) - for line in fp: - data = line.strip().split(delimiter) - word = data[1] if len(data) > 1 else data[0].split("##")[-1] - label_words[data[0]].append(word) - return cls(tokenizer, - labels=set(label_words.keys()), - label_words=dict(label_words), - prefix=prefix) + (1) Project outputs into the outputs of corresponding word. 
+ If self.post_log_softmax is True: + + (2) Normalize over all label words. -class MultiMaskVerbalizer(ManualVerbalizer): + (3) Calibrate (optional) - def __init__(self, tokenizer, labels=None, label_words=None, prefix=None): - super().__init__(tokenizer, labels, label_words, prefix) + (4) Aggregate multiple words for each label. - def process_outputs(self, logits, inputs, **kwargs): - """ - Process logits according to mask ids and label words. Args: - logits (paddle.Tensor): - The output of ForMaskedLM model with shape - [batch_size, max_seq_length, vocab_size]. - inputs (InputFeatures): - The input features of model, including mask_ids. + outputs (`Tensor`): + The outputs of `PretrainedModel` which class name ends with + `ForMaskedLM`. + Returns: + The prediction outputs over labels (`Tensor`). """ - batch_size, seq_len, vocab_size = logits.shape - batch_ids, word_ids = paddle.where(inputs["mask_ids"] == 1) - mask_ids = batch_ids * seq_len + word_ids - mask_logits = logits.reshape([-1, vocab_size])[mask_ids] - mask_logits = mask_logits.reshape([batch_size, -1, vocab_size]) - return mask_logits + outputs = super(ManualVerbalizer, + self).process_outputs(outputs, masked_positions) + label_word_outputs = self.project(outputs) + + if self.post_log_softmax: + label_word_outputs = self.normalize(label_word_outputs) + + if self.label_token_weight is not None: + label_word_outputs = self.calibrate(label_word_outputs) + + label_word_outputs = paddle.log(label_word_outputs + 1e-15) + label_outputs = self.aggregate(label_word_outputs, self.word_mask, + self.word_aggregate_type) + label_outputs = self.aggregate_multiple_mask(label_outputs, + self.mask_aggregate_type) + return label_outputs -class Identity(nn.Layer): + +class MaskedLMIdentity(nn.Layer): """ - Identity layer to replace the last linear layer in MLM model, which - outputs the input `sequence_output` directly. + Identity layer with the same arguments as the last linear layer in + `PretrainedModel` whose name ends with `ForMaskedLM`. """ def __init__(self): - super().__init__() + super(MaskedLMIdentity, self).__init__() def forward(self, sequence_output, masked_positions=None): return sequence_output @@ -325,59 +350,44 @@ def forward(self, sequence_output, masked_positions=None): class SoftVerbalizer(Verbalizer): """ - Soft Verbalizer to encode labels as embeddings. + SoftVerbalizer for the WARP method. Args: - tokenizer (paddlenlp.transformers.PretrainedTokenizer): - The tokenizer of pretrained models. - model (paddlenlp.transformers.PretrainedModel): - The pretrained language model. - labels (list): - The sequence of all labels. - label_words (dict or list): - The dictionary or corresponding list to map labels to words. - prefix (str): - The prefix string of words, used in PLMs like RoBERTa, which is sensitive to the prefix. + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. 
+ model (`PretrainedModel`): + An instance of PretrainedModel with class name ends with `ForMaskedLM` """ - LAST_WEIGHT = ["ErnieForMaskedLM", "BertForMaskedLM"] LAST_LINEAR = ["AlbertForMaskedLM", "RobertaForMaskedLM"] - def __init__(self, tokenizer, model, labels, label_words=None, prefix=''): - super().__init__(labels=labels) - self.tokenizer = tokenizer - self.labels = labels - self.prefix = prefix - self.label_words = label_words - - self._extract_head(model) - - def process_label_words(self): - """ Create the label-token array and its corresponding mask. """ - if self.prefix is not None: - self._label_words = self.add_prefix(self.label_words, self.prefix) - - all_ids = [] - for words_per_label in self.label_words: - if len(words_per_label) > 1: - logger.warning("Only the first word used for every label.") - all_ids.append( - self.tokenizer.encode(words_per_label[0], - add_special_tokens=False, - return_token_type_ids=False)["input_ids"]) - - max_num_tokens = max([len(tokens) for tokens in all_ids]) - token_ids = np.zeros(shape=[len(self.labels), max_num_tokens]) - token_mask = np.zeros(shape=[len(self.labels), max_num_tokens]) - for label_i, ids_per_label in enumerate(all_ids): - token_ids[label_i][:len(ids_per_label)] = ids_per_label - token_mask[label_i][:len(ids_per_label)] = 1 - self.token_ids = paddle.to_tensor(token_ids, - dtype="int64", - stop_gradient=True) - self.token_ids_mask = paddle.to_tensor(token_mask, - dtype="int64", - stop_gradient=True) + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer, + model: PretrainedModel): + super(SoftVerbalizer, self).__init__(label_words=label_words, + tokenizer=tokenizer, + model=model) + del self.model + setattr(model, self.head_name[0], MaskedLMIdentity()) + + def create_parameters(self): + # Only the first word used for initialization. 
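# A usage sketch for SoftVerbalizer in the WARP style; the label words below
# are placeholders, and `model`/`tokenizer` are assumed to be a *ForMaskedLM
# model and its tokenizer.
verbalizer = SoftVerbalizer({"负向": "差", "正向": "好"}, tokenizer, model)
head_params = [p for _, p in verbalizer.head_parameters()]
backbone_params = [p for _, p in verbalizer.non_head_parameters()]
# head_params (the re-initialized label head) is typically trained with a
# larger learning rate than backbone_params, e.g. via separate optimizers or
# parameter groups.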
+ if self.token_ids.shape[1] != 1: + logger.warning("Only the first word for each label is used for" + " initialization.") + index = paddle.to_tensor([0]) + self.token_ids = paddle.index_select(self.token_ids, index, axis=1) + self.token_mask = paddle.index_select(self.token_mask, + index, + axis=1) + self.word_mask = paddle.ones([len(self.labels), 1]) + self._extract_head(self.model) + + def process_outputs(self, outputs: Tensor, masked_positions: Tensor = None): + outputs = super(SoftVerbalizer, + self).process_outputs(outputs, masked_positions) + return self.head(outputs).squeeze(1) def head_parameters(self): if isinstance(self.head, nn.Linear): @@ -393,31 +403,6 @@ def non_head_parameters(self): return [(n, p) for n, p in self.head.named_parameters() if self.head_name[1] not in n] - def process_model(self, model): - setattr(model, self.head_name[0], Identity()) - return model - - def process_outputs(self, logits, inputs=None, **kwargs): - mask_ids = inputs["mask_ids"].unsqueeze(2) - real_len = logits.shape[1] - mask_ids = mask_ids[:, -real_len:] - logits = (logits * mask_ids).sum(axis=1) / mask_ids.sum(axis=1) - return self.head(logits) - - @classmethod - def from_file(cls, tokenizer, model, label_file, prefix=None): - with open(label_file, "r", encoding="utf-8") as fp: - label_words = defaultdict(list) - for line in fp: - data = line.strip().split("==") - word = data[1] if len(data) > 1 else data[0] - label_words[data[0]].append(word) - return cls(tokenizer, - model, - labels=set(label_words.keys()), - label_words=dict(label_words), - prefix=prefix) - def _extract_head(self, model): model_type = model.__class__.__name__ if model_type in self.LAST_LINEAR: @@ -439,45 +424,51 @@ def _extract_head(self, model): self.head_name.append(name) break elif model_type in self.LAST_WEIGHT: - # OnlyMLMHead last_name = [n for n, p in model.named_children()][-1] head = getattr(model, last_name) self.head_name = [last_name] - # LMPredictionHead - last_name = [n for n, p in head.named_children()][-1] - self.head = copy.deepcopy(getattr(head, last_name)) - self.head_name.append("decoder") + # OnlyMLMHead + if model_type in ["ErnieForMaskedLM", "BertForMaskedLM"]: + last_name = [n for n, p in head.named_children()][-1] + self.head = copy.deepcopy(getattr(head, last_name)) + self.head_name.append("decoder") + else: + self.head = copy.deepcopy(head) + # LMPredictionHead module = paddle.to_tensor(getattr(self.head, "decoder_weight")) - bias = paddle.to_tensor(getattr(self.head, "decoder_bias")) new_head = nn.Linear(len(self.labels), module.shape[1], bias_attr=False) new_head.weight.set_value(self._create_init_weight(module.T).T) setattr(self.head, "decoder_weight", new_head.weight) getattr(self.head, "decoder_weight").stop_gradient = False - setattr( - self.head, "decoder_bias", - self.head.create_parameter(shape=[len(self.labels)], - dtype=new_head.weight.dtype, - is_bias=True)) - getattr(self.head, "decoder_bias").stop_gradient = False + if hasattr(self.head, "decoder_bias"): + bias = paddle.to_tensor(getattr(self.head, "decoder_bias")) + setattr( + self.head, "decoder_bias", + self.head.create_parameter(shape=[len(self.labels)], + dtype=new_head.weight.dtype, + is_bias=True)) + getattr(self.head, "decoder_bias").stop_gradient = False else: raise NotImplementedError( f"Please open an issue to request for support of {model_type}" + f" or contribute to PaddleNLP.") def _create_init_weight(self, weight, is_bias=False): + token_ids = self.token_ids.squeeze(1) + token_mask = self.token_mask.squeeze(1) + 
aggr_type = self.token_aggregate_type if is_bias: - bias = paddle.index_select(weight, - self.token_ids.reshape([-1]), - axis=0).reshape(self.token_ids.shape) - bias = self.aggregate(bias, self.token_ids_mask) + bias = paddle.index_select(weight, token_ids.reshape([-1]), + axis=0).reshape(token_ids.shape) + bias = self.aggregate(bias, token_mask, aggr_type) return bias else: - word_shape = [weight.shape[0], *self.token_ids.shape] + word_shape = [weight.shape[0], *token_ids.shape] weight = paddle.index_select(weight, - self.token_ids.reshape([-1]), + token_ids.reshape([-1]), axis=1).reshape(word_shape) - weight = self.aggregate(weight, self.token_ids_mask) + weight = self.aggregate(weight, token_mask, aggr_type) return weight
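Taken together, a minimal end-to-end sketch of the refactored prompt API might look as follows. The model name, prompt string and label words are illustrative placeholders, and it assumes `AutoTemplate` and `ManualVerbalizer` are exported from `paddlenlp.prompt`.

from paddlenlp.transformers import AutoModelForMaskedLM, AutoTokenizer
from paddlenlp.prompt import AutoTemplate, ManualVerbalizer

model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")

template = AutoTemplate.create_from(
    "{'text': 'text_a'}这句话的类别是{'mask'}。",
    tokenizer, max_length=128, model=model)
verbalizer = ManualVerbalizer({"体育": "运动", "财经": "金融"}, tokenizer)

features = template({"text_a": "球队在昨晚的比赛中获胜。"})
# `features` holds the tokenized prompt features; once the model produces
# logits over the vocabulary, verbalizer.process_outputs(logits, masked_positions)
# turns them into per-label scores via project / normalize / aggregate.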