diff --git a/applications/text_classification/hierarchical/few-shot/README.md b/applications/text_classification/hierarchical/few-shot/README.md
index 347e522164f8..c1311eda6d1e 100644
--- a/applications/text_classification/hierarchical/few-shot/README.md
+++ b/applications/text_classification/hierarchical/few-shot/README.md
@@ -65,9 +65,9 @@
内存: 630 GB
-3. PaddlePaddle 版本:2.3.1
+3. PaddlePaddle 版本:2.4rc
-4. PaddleNLP 版本:2.3.5 (develop)
+4. PaddleNLP 版本:2.4.3
5. 评估设置
@@ -91,7 +91,7 @@
| model_name | 训练方式 | Micro F1分数 | Macro F1分数 |
| ---------- | ------- | ----------- | ----------- |
| ernie-3.0-base-zh | 微调学习 | 0.7172 | 0.3821 |
- | ernie-3.0-base-zh | 提示学习 | 0.8855 | 0.8443 |
+ | ernie-3.0-base-zh | 提示学习 | 0.8945 | 0.8516 |
@@ -102,10 +102,10 @@
### 3.1 运行环境
-- python >= 3.6
-- paddlepaddle > 2.3(2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html))
-- paddlenlp >= 2.3.5
-- paddle2onnx >= 1.0.0rc3
+- python >= 3.7
+- paddlepaddle >= 2.4rc
+- paddlenlp >= 2.4.3
+- paddle2onnx >= 1.0.3
### 3.2 代码结构
@@ -222,12 +222,12 @@ python train.py \
--do_export \
--num_train_epochs 100 \
--logging_steps 5 \
+--save_total_limit 1 \
--per_device_eval_batch_size 32 \
--per_device_train_batch_size 8 \
--metric_for_best_model macro_f1_score \
--load_best_model_at_end \
---evaluation_strategy epoch \
---save_strategy epoch
+--eval_steps 100
```
**多卡训练**
@@ -247,12 +247,12 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \
--do_export \
--num_train_epochs 100 \
--logging_steps 5 \
+--save_total_limit 1 \
--per_device_eval_batch_size 32 \
--per_device_train_batch_size 8 \
--metric_for_best_model macro_f1_score \
--load_best_model_at_end \
---evaluation_strategy epoch \
---save_strategy epoch
+--eval_steps 100
```
可配置参数说明:
@@ -273,6 +273,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \
- `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。
- `num_train_epochs`: 训练的最大轮数。
- `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。
+- `save_total_limit`: 最多保留的模型检查点数量。
- `device`: 使用的设备,默认为`gpu`。
- `eval_steps`: 评估模型的间隔步数。
- `logging_steps`: 打印日志的间隔步数。
@@ -352,9 +353,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data -
可配置参数说明:
- `model_path_prefix`: 导出的静态图模型路径及文件前缀。
-- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。
+- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。
- `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。
-- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。
+- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。
- `batch_size`: 每次预测的样本数量。
- `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。
- `device_id`: 指定GPU设备ID。
diff --git a/applications/text_classification/hierarchical/few-shot/infer.py b/applications/text_classification/hierarchical/few-shot/infer.py
index 7442641d502f..5fc92a9dd933 100644
--- a/applications/text_classification/hierarchical/few-shot/infer.py
+++ b/applications/text_classification/hierarchical/few-shot/infer.py
@@ -14,23 +14,24 @@
import os
import six
+import json
import psutil
import argparse
import numpy as np
from paddlenlp.utils.log import logger
-from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample
-from paddlenlp.transformers import AutoTokenizer
+from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding
+from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM
import paddle2onnx
import onnxruntime as ort
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.")
-parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.")
+parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.")
parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.")
-parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.")
+parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.")
parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.")
parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.")
parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.")
@@ -103,12 +104,6 @@ def __init__(self,
'device_id':
device_id
}])
- self.input_handles = [
- self.predictor.get_inputs()[0].name,
- self.predictor.get_inputs()[1].name,
- self.predictor.get_inputs()[2].name
- ]
-
if device == "gpu":
try:
assert 'CUDAExecutionProvider' in self.predictor.get_providers()
@@ -122,27 +117,52 @@ def __init__(self,
logger.info(">>> [InferBackend] Engine Created ...")
def infer(self, input_dict: dict):
- input_dict = {
- k: v
- for k, v in input_dict.items() if k in self.input_handles
- }
result = self.predictor.run(None, input_dict)
return result
class HierachicalPredictor(object):
- def __init__(self, args, label_list):
- self._label_list = label_list
- self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
- self._max_seq_length = args.max_seq_length
- self._batch_size = args.batch_size
- self.inference_backend = InferBackend(args.model_path_prefix,
- args.device, args.device_id,
- args.use_fp16, args.num_threads)
- self._template = AutoTemplate.load_from(
- os.path.dirname(args.model_path_prefix), self._tokenizer,
- args.max_seq_length)
+ def __init__(self, args):
+ self.args = args
+ self.tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+ self.model = AutoModelForMaskedLM.from_pretrained(args.model_name)
+ self.template, self.labels, self.input_handles = self.post_init()
+ self.collate_fn = PromptDataCollatorWithPadding(
+ self.tokenizer,
+ padding=True,
+ return_tensors="np",
+ return_attention_mask=True)
+
+ self.inference_backend = InferBackend(self.args.model_path_prefix,
+ self.args.device,
+ self.args.device_id,
+ self.args.use_fp16,
+ self.args.num_threads)
+
+ def post_init(self):
+ export_path = os.path.dirname(self.args.model_path_prefix)
+ template_path = os.path.join(export_path, "template_config.json")
+ with open(template_path, "r") as fp:
+ prompt = json.load(fp)
+ template = AutoTemplate.create_from(prompt, self.tokenizer,
+ self.args.max_length,
+ self.model)
+ keywords = template.extract_template_keywords(template.prompt)
+ inputs = [
+ "input_ids", "token_type_ids", "position_ids", "attention_mask",
+ "masked_positions"
+ ]
+ if "soft" in keywords:
+ inputs.append("soft_token_ids")
+ if "encoder" in keywords:
+ inputs.append("encoder_ids")
+ verbalizer_path = os.path.join(export_path, "verbalizer_config.json")
+ with open(verbalizer_path, "r") as fp:
+ label_words = json.load(fp)
+ labels = sorted(list(label_words.keys()))
+
+ return template, labels, inputs
def predict(self, input_data: list):
encoded_inputs = self.preprocess(input_data)
@@ -155,14 +175,27 @@ def _infer(self, input_dict):
infer_data = self.inference_backend.infer(input_dict)
return infer_data
- def infer_batch(self, encoded_inputs):
- num_sample = len(encoded_inputs["input_ids"])
+ def infer_batch(self, inputs):
+ num_sample = len(inputs)
infer_data = None
num_infer_data = None
- for idx in range(0, num_sample, self._batch_size):
- l, r = idx, idx + self._batch_size
- keys = encoded_inputs.keys()
- input_dict = {k: encoded_inputs[k][l:r] for k in keys}
+ for index in range(0, num_sample, self.args.batch_size):
+ left, right = index, index + self.args.batch_size
+ batch_dict = self.collate_fn(inputs[left:right])
+ input_dict = {}
+ for key in self.input_handles:
+ value = batch_dict[key]
+ if key == "attention_mask":
+ if value.ndim == 2:
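+                    # Convert the 2D padding mask into the 4D additive mask
+                    # expected by the exported model: padded positions are
+                    # filled with a large negative value (-1e4).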
+ value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4
+ elif value.ndim != 4:
+ raise ValueError(
+ "Expect attention mask with ndim=2 or 4, but get ndim={}"
+ .format(value.ndim))
+ value = value.astype("float32")
+ else:
+ value = value.astype("int64")
+ input_dict[key] = value
results = self._infer(input_dict)
if infer_data is None:
infer_data = [[x] for x in results]
@@ -175,16 +208,8 @@ def infer_batch(self, encoded_inputs):
return infer_data
def preprocess(self, input_data: list):
- text = [InputExample(text_a=x) for x in input_data]
- inputs = [self._template.wrap_one_example(x) for x in text]
- inputs = {
- "input_ids":
- np.array([x["input_ids"] for x in inputs], dtype="int64"),
- "mask_ids":
- np.array([x["mask_ids"] for x in inputs], dtype="int64"),
- "soft_token_ids":
- np.array([x["soft_token_ids"] for x in inputs], dtype="int64")
- }
+ text = [{"text_a": x} for x in input_data]
+ inputs = [self.template(x) for x in text]
return inputs
@staticmethod
@@ -197,7 +222,7 @@ def postprocess(self, infer_data):
label_ids = np.argwhere(probs > threshold)
labels = [[] for _ in range(probs.shape[0])]
for idx, label_id in label_ids:
- labels[idx].append(self._label_list[label_id])
+ labels[idx].append(self.labels[label_id])
return {"label": labels}
def printer(self, result, input_data):
@@ -212,12 +237,10 @@ def printer(self, result, input_data):
for arg_name, arg_value in vars(args).items():
logger.info("{:20}: {}".format(arg_name, arg_value))
- export_path = os.path.dirname(args.model_path_prefix)
- labels, _ = Verbalizer.load_from(export_path)
+ predictor = HierachicalPredictor(args)
text_dir = os.path.join(args.data_dir, "data.txt")
with open(text_dir, "r", encoding="utf-8") as f:
text_list = [x.strip() for x in f.readlines()]
- predictor = HierachicalPredictor(args, labels)
predictor.predict(text_list)
diff --git a/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt b/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt
new file mode 100644
index 000000000000..bbe76e363f00
--- /dev/null
+++ b/applications/text_classification/hierarchical/few-shot/requirements_cpu.txt
@@ -0,0 +1,5 @@
+psutil
+paddlepaddle>=2.4rc
+paddlenlp>=2.4.3
+paddle2onnx>=1.0.3
+onnxruntime
diff --git a/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt b/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt
new file mode 100644
index 000000000000..66454bd8b6b5
--- /dev/null
+++ b/applications/text_classification/hierarchical/few-shot/requirements_gpu.txt
@@ -0,0 +1,7 @@
+psutil
+paddlepaddle-gpu>=2.4rc
+paddlenlp>=2.4.3
+paddle2onnx>=1.0.3
+onnxruntime-gpu
+onnx
+onnxconverter-common
diff --git a/applications/text_classification/hierarchical/few-shot/train.py b/applications/text_classification/hierarchical/few-shot/train.py
index c7c0a2daa38a..aac82bff9ab5 100644
--- a/applications/text_classification/hierarchical/few-shot/train.py
+++ b/applications/text_classification/hierarchical/few-shot/train.py
@@ -15,6 +15,7 @@
from dataclasses import dataclass, field
import os
import sys
+from collections import defaultdict
import paddle
import paddle.nn.functional as F
@@ -41,9 +42,6 @@
class DataArguments:
data_dir: str = field(default="./data", metadata={"help": "The dataset dictionary includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional) files."})
prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."})
- soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."})
- encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."})
-
@dataclass
class ModelArguments:
@@ -67,17 +65,20 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
# Define the template for preprocess and the verbalizer for postprocess.
- template = AutoTemplate.create_from(
- data_args.prompt,
- tokenizer,
- training_args.max_seq_length,
- model=model,
- prompt_encoder=data_args.soft_encoder,
- encoder_hidden_size=data_args.encoder_hidden_size)
- logger.info("Using template: {}".format(template.template))
+ template = AutoTemplate.create_from(data_args.prompt,
+ tokenizer,
+ training_args.max_seq_length,
+ model=model)
+ logger.info("Using template: {}".format(template.prompt))
label_file = os.path.join(data_args.data_dir, "label.txt")
- verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file)
+ with open(label_file, "r", encoding="utf-8") as fp:
+ label_words = defaultdict(list)
+ for line in fp:
+ data = line.strip().split("==")
+ word = data[1] if len(data) > 1 else data[0].split("##")[-1]
+ label_words[data[0]].append(word)
+ verbalizer = SoftVerbalizer(label_words, tokenizer, model)
# Load the few-shot datasets.
train_ds, dev_ds, test_ds = load_local_dataset(
@@ -139,11 +140,24 @@ def compute_metrics(eval_preds):
# Export static model.
if training_args.do_export:
+ template = prompt_model.template
+ template_keywords = template.extract_template_keywords(template.prompt)
input_spec = [
- InputSpec(shape=[None, None], dtype="int64"), # input_ids
- InputSpec(shape=[None, None], dtype="int64"), # mask_ids
- InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids
+ InputSpec(shape=[None, None], dtype="int64"), # input_ids,
+ InputSpec(shape=[None, None], dtype="int64"), # token_type_ids
+ InputSpec(shape=[None, None], dtype="int64"), # position_ids
+ InputSpec(shape=[None, None, None, None],
+ dtype="float32") # attention_mask
]
+ if "mask" in template_keywords:
+ input_spec.append(InputSpec(shape=[None],
+ dtype="int64")) # masked_positions
+ if "soft" in template_keywords:
+ input_spec.append(InputSpec(shape=[None, None],
+ dtype="int64")) # soft_token_ids
+ if "encoder" in template_keywords:
+ input_spec.append(InputSpec(shape=[None, None],
+ dtype="int64")) # encoder_ids
export_path = os.path.join(training_args.output_dir, 'export')
trainer.export_model(export_path,
input_spec=input_spec,
diff --git a/applications/text_classification/hierarchical/few-shot/utils.py b/applications/text_classification/hierarchical/few-shot/utils.py
index 4880432310f3..e29aaa9db61d 100644
--- a/applications/text_classification/hierarchical/few-shot/utils.py
+++ b/applications/text_classification/hierarchical/few-shot/utils.py
@@ -15,7 +15,6 @@
import os
from paddlenlp.datasets import load_dataset
-from paddlenlp.prompt import InputExample
def load_local_dataset(data_path, splits, label_list):
@@ -39,14 +38,14 @@ def _reader(data_file, label_list):
for idx, line in enumerate(fp):
data = line.strip().split("\t")
if len(data) == 1:
- yield InputExample(text_a=data[0])
+ yield {"text_a": data[0]}
else:
text, label = data
label = label.strip().split(",")
label = [
float(1) if x in label else float(0) for x in label_list
]
- yield InputExample(text_a=text, labels=label)
+ yield {"text_a": text, "labels": label}
split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
datasets = []
diff --git a/applications/text_classification/multi_class/few-shot/README.md b/applications/text_classification/multi_class/few-shot/README.md
index 3f6e5f759bbf..45f9b009bf34 100644
--- a/applications/text_classification/multi_class/few-shot/README.md
+++ b/applications/text_classification/multi_class/few-shot/README.md
@@ -65,9 +65,9 @@
内存: 630 GB
-3. PaddlePaddle 版本:2.3.1
+3. PaddlePaddle 版本:2.4rc
-4. PaddleNLP 版本:2.3.5 (develop)
+4. PaddleNLP 版本:2.4.3
5. 评估设置
@@ -82,7 +82,7 @@ python train.py --dataset_dir "./data/" --save_dir "./checkpoints" --max_seq_len
- 提示学习
```
-python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条新闻写的是" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model accuracy --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch
+python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条新闻写的是" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model accuracy --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch --save_total_limit 1
```
6. 精度评价指标:Accuracy
@@ -102,10 +102,10 @@ python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这条
### 3.1 运行环境
-- python >= 3.6
-- paddlepaddle > 2.3 (2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html))
-- paddlenlp >= 2.3.5
-- paddle2onnx >= 1.0.0rc3
+- python >= 3.7
+- paddlepaddle >= 2.4rc
+- paddlenlp >= 2.4.3
+- paddle2onnx >= 1.0.3
### 3.2 代码结构
@@ -204,14 +204,15 @@ python train.py \
--output_dir ./checkpoints/ \
--prompt "这条新闻标题的主题是" \
--max_seq_length 128 \
---learning_rate 3e-5 \
---ppt_learning_rate 3e-4 \
+--learning_rate 3e-6 \
+--ppt_learning_rate 3e-5 \
--do_train \
--do_eval \
--use_rdrop \
--max_steps 1000 \
--eval_steps 10 \
--logging_steps 5 \
+--save_total_limit 1 \
--load_best_model_at_end True \
--per_device_eval_batch_size 32 \
--per_device_train_batch_size 8 \
@@ -227,8 +228,8 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \
--output_dir ./checkpoints/ \
--prompt "这条新闻标题的主题是" \
--max_seq_length 128 \
---learning_rate 3e-5 \
---ppt_learning_rate 3e-4 \
+--learning_rate 3e-6 \
+--ppt_learning_rate 3e-5 \
--do_train \
--do_eval \
--use_rdrop \
@@ -236,6 +237,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \
--max_steps 1000 \
--eval_steps 10 \
--logging_steps 5 \
+--save_total_limit 1 \
--load_best_model_at_end True \
--per_device_eval_batch_size 32 \
--per_device_train_batch_size 8 \
@@ -260,6 +262,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \
- `do_predict`: 是否进行预测。
- `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。
- `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。
+- `save_total_limit`: 最多保留的模型检查点数量。
- `eval_steps`: 评估模型的间隔步数。
- `device`: 使用的设备,默认为`gpu`。
- `logging_steps`: 打印日志的间隔步数。
@@ -335,9 +338,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data -
可配置参数说明:
- `model_path_prefix`: 导出的静态图模型路径及文件前缀。
-- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。
+- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。
- `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。
-- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。
+- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。
- `batch_size`: 每次预测的样本数量。
- `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。
- `device_id`: 指定GPU设备ID。
diff --git a/applications/text_classification/multi_class/few-shot/infer.py b/applications/text_classification/multi_class/few-shot/infer.py
index 142a20d1ed76..03715bc29d0e 100644
--- a/applications/text_classification/multi_class/few-shot/infer.py
+++ b/applications/text_classification/multi_class/few-shot/infer.py
@@ -14,23 +14,24 @@
import os
import six
+import json
import psutil
import argparse
import numpy as np
from paddlenlp.utils.log import logger
-from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample
-from paddlenlp.transformers import AutoTokenizer
+from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding
+from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM
import paddle2onnx
import onnxruntime as ort
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.")
-parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.")
+parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.")
parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.")
-parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.")
+parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.")
parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.")
parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.")
parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.")
@@ -103,11 +104,6 @@ def __init__(self,
'device_id':
device_id
}])
- self.input_handles = [
- self.predictor.get_inputs()[0].name,
- self.predictor.get_inputs()[1].name,
- self.predictor.get_inputs()[2].name
- ]
if device == "gpu":
try:
@@ -122,27 +118,53 @@ def __init__(self,
logger.info(">>> [InferBackend] Engine Created ...")
def infer(self, input_dict: dict):
- input_dict = {
- k: v
- for k, v in input_dict.items() if k in self.input_handles
- }
result = self.predictor.run(None, input_dict)
return result
class MultiClassPredictor(object):
- def __init__(self, args, label_list):
- self._label_list = label_list
- self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
- self._max_seq_length = args.max_seq_length
- self._batch_size = args.batch_size
- self.inference_backend = InferBackend(args.model_path_prefix,
- args.device, args.device_id,
- args.use_fp16, args.num_threads)
- self._template = AutoTemplate.load_from(
- os.path.dirname(args.model_path_prefix), self._tokenizer,
- args.max_seq_length)
+ def __init__(self, args):
+ self.args = args
+ self.tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+ self.model = AutoModelForMaskedLM.from_pretrained(args.model_name)
+ self.template, self.labels, self.input_handles = self.post_init()
+ self.collate_fn = PromptDataCollatorWithPadding(
+ self.tokenizer,
+ padding=True,
+ return_tensors="np",
+ return_attention_mask=True)
+
+ self.inference_backend = InferBackend(self.args.model_path_prefix,
+ self.args.device,
+ self.args.device_id,
+ self.args.use_fp16,
+ self.args.num_threads)
+
+ def post_init(self):
+ export_path = os.path.dirname(self.args.model_path_prefix)
+ template_path = os.path.join(export_path, "template_config.json")
+ with open(template_path, "r") as fp:
+ prompt = json.load(fp)
+ template = AutoTemplate.create_from(prompt, self.tokenizer,
+ self.args.max_length,
+ self.model)
+ keywords = template.extract_template_keywords(template.prompt)
+ inputs = [
+ "input_ids", "token_type_ids", "position_ids", "attention_mask"
+ ]
+ if "mask" in keywords:
+ inputs.append("masked_positions")
+ if "soft" in keywords:
+ inputs.append("soft_token_ids")
+ if "encoder" in keywords:
+ inputs.append("encoder_ids")
+ verbalizer_path = os.path.join(export_path, "verbalizer_config.json")
+ with open(verbalizer_path, "r") as fp:
+ label_words = json.load(fp)
+ labels = sorted(list(label_words.keys()))
+
+ return template, labels, inputs
def predict(self, input_data: list):
encoded_inputs = self.preprocess(input_data)
@@ -155,14 +177,27 @@ def _infer(self, input_dict):
infer_data = self.inference_backend.infer(input_dict)
return infer_data
- def infer_batch(self, encoded_inputs):
- num_sample = len(encoded_inputs["input_ids"])
+ def infer_batch(self, inputs):
+ num_sample = len(inputs)
infer_data = None
num_infer_data = None
- for idx in range(0, num_sample, self._batch_size):
- l, r = idx, idx + self._batch_size
- keys = encoded_inputs.keys()
- input_dict = {k: encoded_inputs[k][l:r] for k in keys}
+ for index in range(0, num_sample, self.args.batch_size):
+ left, right = index, index + self.args.batch_size
+ batch_dict = self.collate_fn(inputs[left:right])
+ input_dict = {}
+ for key in self.input_handles:
+ value = batch_dict[key]
+ if key == "attention_mask":
+ if value.ndim == 2:
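+                    # Convert the 2D padding mask into the 4D additive mask
+                    # expected by the exported model: padded positions are
+                    # filled with a large negative value (-1e4).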
+ value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4
+ elif value.ndim != 4:
+ raise ValueError(
+ "Expect attention mask with ndim=2 or 4, but get ndim={}"
+ .format(value.ndim))
+ value = value.astype("float32")
+ else:
+ value = value.astype("int64")
+ input_dict[key] = value
results = self._infer(input_dict)
if infer_data is None:
infer_data = [[x] for x in results]
@@ -175,21 +210,13 @@ def infer_batch(self, encoded_inputs):
return infer_data
def preprocess(self, input_data: list):
- text = [InputExample(text_a=x) for x in input_data]
- inputs = [self._template.wrap_one_example(x) for x in text]
- inputs = {
- "input_ids":
- np.array([x["input_ids"] for x in inputs], dtype="int64"),
- "mask_ids":
- np.array([x["mask_ids"] for x in inputs], dtype="int64"),
- "soft_token_ids":
- np.array([x["soft_token_ids"] for x in inputs], dtype="int64")
- }
+ text = [{"text_a": x} for x in input_data]
+ inputs = [self.template(x) for x in text]
return inputs
def postprocess(self, infer_data):
preds = np.argmax(infer_data[0], axis=-1)
- labels = [self._label_list[x] for x in preds]
+ labels = [self.labels[x] for x in preds]
return {"label": labels}
def printer(self, result, input_data):
@@ -204,12 +231,10 @@ def printer(self, result, input_data):
for arg_name, arg_value in vars(args).items():
logger.info("{:20}: {}".format(arg_name, arg_value))
- export_path = os.path.dirname(args.model_path_prefix)
- labels, _ = Verbalizer.load_from(export_path)
+ predictor = MultiClassPredictor(args)
text_dir = os.path.join(args.data_dir, "data.txt")
with open(text_dir, "r", encoding="utf-8") as f:
text_list = [x.strip() for x in f.readlines()]
- predictor = MultiClassPredictor(args, labels)
predictor.predict(text_list)
diff --git a/applications/text_classification/multi_class/few-shot/requirements_cpu.txt b/applications/text_classification/multi_class/few-shot/requirements_cpu.txt
new file mode 100644
index 000000000000..bbe76e363f00
--- /dev/null
+++ b/applications/text_classification/multi_class/few-shot/requirements_cpu.txt
@@ -0,0 +1,5 @@
+psutil
+paddlepaddle>=2.4rc
+paddlenlp>=2.4.3
+paddle2onnx>=1.0.3
+onnxruntime
diff --git a/applications/text_classification/multi_class/few-shot/requirements_gpu.txt b/applications/text_classification/multi_class/few-shot/requirements_gpu.txt
new file mode 100644
index 000000000000..66454bd8b6b5
--- /dev/null
+++ b/applications/text_classification/multi_class/few-shot/requirements_gpu.txt
@@ -0,0 +1,7 @@
+psutil
+paddlepaddle-gpu>=2.4rc
+paddlenlp>=2.4.3
+paddle2onnx>=1.0.3
+onnxruntime-gpu
+onnx
+onnxconverter-common
diff --git a/applications/text_classification/multi_class/few-shot/train.py b/applications/text_classification/multi_class/few-shot/train.py
index a07d27ea5879..0e67750deb23 100644
--- a/applications/text_classification/multi_class/few-shot/train.py
+++ b/applications/text_classification/multi_class/few-shot/train.py
@@ -14,6 +14,9 @@
from dataclasses import dataclass, field
import os
+from collections import defaultdict
+
+import numpy as np
import paddle
from paddle.static import InputSpec
@@ -37,8 +40,6 @@
class DataArguments:
data_dir: str = field(default="./data/", metadata={"help": "Path to a dataset which includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional)."})
prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."})
- soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."})
- encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."})
@dataclass
@@ -63,17 +64,20 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
# Define the template for preprocess and the verbalizer for postprocess.
- template = AutoTemplate.create_from(
- data_args.prompt,
- tokenizer,
- training_args.max_seq_length,
- model=model,
- prompt_encoder=data_args.soft_encoder,
- encoder_hidden_size=data_args.encoder_hidden_size)
- logger.info("Using template: {}".format(template.template))
+ template = AutoTemplate.create_from(data_args.prompt,
+ tokenizer,
+ training_args.max_seq_length,
+ model=model)
+ logger.info("Using template: {}".format(template.prompt))
label_file = os.path.join(data_args.data_dir, "label.txt")
- verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file)
+ with open(label_file, "r", encoding="utf-8") as fp:
+ label_words = defaultdict(list)
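+        # Each line of label.txt is "label" or "label==label_word"; when no
+        # label word is given, the text after the last "##" in the label name
+        # is used as the label word.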
+ for line in fp:
+ data = line.strip().split("==")
+ word = data[1] if len(data) > 1 else data[0].split("##")[-1]
+ label_words[data[0]].append(word)
+ verbalizer = SoftVerbalizer(label_words, tokenizer, model)
# Load the few-shot datasets.
train_ds, dev_ds, test_ds = load_local_dataset(
@@ -133,11 +137,24 @@ def compute_metrics(eval_preds):
# Export static model.
if training_args.do_export:
+ template = prompt_model.template
+ template_keywords = template.extract_template_keywords(template.prompt)
input_spec = [
- InputSpec(shape=[None, None], dtype="int64"), # input_ids
- InputSpec(shape=[None, None], dtype="int64"), # mask_ids
- InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids
+ InputSpec(shape=[None, None], dtype="int64"), # input_ids,
+ InputSpec(shape=[None, None], dtype="int64"), # token_type_ids
+ InputSpec(shape=[None, None], dtype="int64"), # position_ids
+ InputSpec(shape=[None, None, None, None],
+ dtype="float32") # attention_mask
]
+ if "mask" in template_keywords:
+ input_spec.append(InputSpec(shape=[None],
+ dtype="int64")) # masked_positions
+ if "soft" in template_keywords:
+ input_spec.append(InputSpec(shape=[None, None],
+ dtype="int64")) # soft_token_ids
+ if "encoder" in template_keywords:
+ input_spec.append(InputSpec(shape=[None, None],
+ dtype="int64")) # encoder_ids
export_path = os.path.join(training_args.output_dir, 'export')
trainer.export_model(export_path,
input_spec=input_spec,
diff --git a/applications/text_classification/multi_class/few-shot/utils.py b/applications/text_classification/multi_class/few-shot/utils.py
index 907b640ea96b..b440c9627917 100644
--- a/applications/text_classification/multi_class/few-shot/utils.py
+++ b/applications/text_classification/multi_class/few-shot/utils.py
@@ -15,7 +15,6 @@
import os
from paddlenlp.datasets import load_dataset
-from paddlenlp.prompt import InputExample
def load_local_dataset(data_path, splits, label_list):
@@ -38,10 +37,10 @@ def _reader(data_file, label_list):
for idx, line in enumerate(fp):
data = line.strip().split("\t")
if len(data) == 1:
- yield InputExample(text_a=data[0])
+ yield {"text_a": data[0]}
else:
text, label = data
- yield InputExample(text_a=text, labels=label_list[label])
+ yield {"text_a": text, "labels": label_list[label]}
assert isinstance(splits, list) and len(splits) > 0
diff --git a/applications/text_classification/multi_label/few-shot/README.md b/applications/text_classification/multi_label/few-shot/README.md
index d7ee94ff68f2..4bb3bbf51fb5 100644
--- a/applications/text_classification/multi_label/few-shot/README.md
+++ b/applications/text_classification/multi_label/few-shot/README.md
@@ -70,9 +70,9 @@
内存: 630 GB
-3. PaddlePaddle 版本:2.3.1
+3. PaddlePaddle 版本:2.4rc
-4. PaddleNLP 版本:2.3.5 (develop)
+4. PaddleNLP 版本:2.4.3
5. 评估设置
@@ -88,7 +88,7 @@
- 提示学习
```
- python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这句话包含的要素有" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model macro_f1_score --load_best_model_at_end --evaluation_strategy epoch --save_strategy epoch
+ python train.py --data_dir ./data/ --output_dir ./checkpoints/ --prompt "这句话包含的要素有" --model_name_or_path ernie-3.0-base-zh --max_seq_length 128 --learning_rate 3e-5 --ppt_learning_rate 3e-4 --do_train --do_eval --num_train_epochs 100 --logging_steps 5 --per_device_eval_batch_size 32 --per_device_train_batch_size 8 --do_predict --metric_for_best_model macro_f1_score --load_best_model_at_end --eval_steps 100 --save_total_limit 1
```
6. 精度评价指标:Micro F1分数、Macro F1分数
@@ -96,7 +96,7 @@
| model_name | 训练方式 | Micro F1分数 | Macro F1分数 |
| ---------- | ------- | ----------- | ----------- |
| ernie-3.0-base-zh | 微调学习 | 0.7419 | 0.5105 |
- | ernie-3.0-base-zh | 提示学习 | 0.7838 | 0.6985 |
+ | ernie-3.0-base-zh | 提示学习 | 0.7839 | 0.6003 |
## 3.定制训练
@@ -106,10 +106,10 @@
### 3.1 运行环境
-- python >= 3.6
-- paddlepaddle > 2.3(2.4版本发布前推荐安装[develop版本](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html))
-- paddlenlp >= 2.3.5
-- paddle2onnx >= 1.0.0rc3
+- python >= 3.7
+- paddlepaddle >= 2.4rc
+- paddlenlp >= 2.4.3
+- paddle2onnx >= 1.0.3
### 3.2 代码结构
@@ -223,6 +223,7 @@ python train.py \
--do_export \
--num_train_epochs 100 \
--logging_steps 5 \
+--save_total_limit 1 \
--per_device_eval_batch_size 32 \
--per_device_train_batch_size 8 \
--metric_for_best_model macro_f1_score \
@@ -248,6 +249,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \
--do_export \
--num_train_epochs 100 \
--logging_steps 5 \
+--save_total_limit 1 \
--per_device_eval_batch_size 32 \
--per_device_train_batch_size 8 \
--metric_for_best_model macro_f1_score \
@@ -274,6 +276,7 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \
- `do_export`: 是否在运行结束时将模型导出为静态图,保存路径为`output_dir/export`。
- `num_train_epochs`: 训练的最大轮数。
- `max_steps`: 训练的最大步数。此设置将会覆盖`num_train_epochs`。
+- `save_total_limit`: 最多保留的模型检查点数量。
- `device`: 使用的设备,默认为`gpu`。
- `eval_steps`: 评估模型的间隔步数。
- `logging_steps`: 打印日志的间隔步数。
@@ -352,9 +355,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data -
可配置参数说明:
- `model_path_prefix`: 导出的静态图模型路径及文件前缀。
-- `model_name_or_path`: 内置预训练模型名,或者模型参数配置目录路径,用于加载tokenizer。默认为`ernie-3.0-base-zh`。
+- `model_name`: 内置预训练模型名,用于加载tokenizer。默认为`ernie-3.0-base-zh`。
- `data_dir`: 待推理数据所在路径,数据应存放在该目录下的`data.txt`文件。
-- `max_seq_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。
+- `max_length`: 最大句子长度,超过该长度的文本将被截断,不足的以Pad补全。提示文本不会被截断。
- `batch_size`: 每次预测的样本数量。
- `device`: 选择推理设备,包括`cpu`和`gpu`。默认为`gpu`。
- `device_id`: 指定GPU设备ID。
diff --git a/applications/text_classification/multi_label/few-shot/infer.py b/applications/text_classification/multi_label/few-shot/infer.py
index 48d42d294e96..2014afaef56b 100644
--- a/applications/text_classification/multi_label/few-shot/infer.py
+++ b/applications/text_classification/multi_label/few-shot/infer.py
@@ -14,23 +14,24 @@
import os
import six
+import json
import psutil
import argparse
import numpy as np
from paddlenlp.utils.log import logger
-from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample
-from paddlenlp.transformers import AutoTokenizer
+from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding
+from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM
import paddle2onnx
import onnxruntime as ort
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.")
-parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.")
+parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.")
parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.")
-parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.")
+parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.")
parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.")
parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.")
parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.")
@@ -103,11 +104,6 @@ def __init__(self,
'device_id':
device_id
}])
- self.input_handles = [
- self.predictor.get_inputs()[0].name,
- self.predictor.get_inputs()[1].name,
- self.predictor.get_inputs()[2].name
- ]
if device == "gpu":
try:
@@ -122,27 +118,52 @@ def __init__(self,
logger.info(">>> [InferBackend] Engine Created ...")
def infer(self, input_dict: dict):
- input_dict = {
- k: v
- for k, v in input_dict.items() if k in self.input_handles
- }
result = self.predictor.run(None, input_dict)
return result
class MultiLabelPredictor(object):
- def __init__(self, args, label_list):
- self._label_list = label_list
- self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
- self._max_seq_length = args.max_seq_length
- self._batch_size = args.batch_size
- self.inference_backend = InferBackend(args.model_path_prefix,
- args.device, args.device_id,
- args.use_fp16, args.num_threads)
- self._template = AutoTemplate.load_from(
- os.path.dirname(args.model_path_prefix), self._tokenizer,
- args.max_seq_length)
+ def __init__(self, args):
+ self.args = args
+ self.tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+ self.model = AutoModelForMaskedLM.from_pretrained(args.model_name)
+ self.template, self.labels, self.input_handles = self.post_init()
+ self.collate_fn = PromptDataCollatorWithPadding(
+ self.tokenizer,
+ padding=True,
+ return_tensors="np",
+ return_attention_mask=True)
+
+ self.inference_backend = InferBackend(self.args.model_path_prefix,
+ self.args.device,
+ self.args.device_id,
+ self.args.use_fp16,
+ self.args.num_threads)
+
+ def post_init(self):
+ export_path = os.path.dirname(self.args.model_path_prefix)
+ template_path = os.path.join(export_path, "template_config.json")
+ with open(template_path, "r") as fp:
+ prompt = json.load(fp)
+ template = AutoTemplate.create_from(prompt, self.tokenizer,
+ self.args.max_length,
+ self.model)
+ keywords = template.extract_template_keywords(template.prompt)
+ inputs = [
+ "input_ids", "token_type_ids", "position_ids", "attention_mask",
+ "masked_positions"
+ ]
+ if "soft" in keywords:
+ inputs.append("soft_token_ids")
+ if "encoder" in keywords:
+ inputs.append("encoder_ids")
+ verbalizer_path = os.path.join(export_path, "verbalizer_config.json")
+ with open(verbalizer_path, "r") as fp:
+ label_words = json.load(fp)
+ labels = sorted(list(label_words.keys()))
+
+ return template, labels, inputs
def predict(self, input_data: list):
encoded_inputs = self.preprocess(input_data)
@@ -155,14 +176,27 @@ def _infer(self, input_dict):
infer_data = self.inference_backend.infer(input_dict)
return infer_data
- def infer_batch(self, encoded_inputs):
- num_sample = len(encoded_inputs["input_ids"])
+ def infer_batch(self, inputs):
+ num_sample = len(inputs)
infer_data = None
num_infer_data = None
- for idx in range(0, num_sample, self._batch_size):
- l, r = idx, idx + self._batch_size
- keys = encoded_inputs.keys()
- input_dict = {k: encoded_inputs[k][l:r] for k in keys}
+ for index in range(0, num_sample, self.args.batch_size):
+ left, right = index, index + self.args.batch_size
+ batch_dict = self.collate_fn(inputs[left:right])
+ input_dict = {}
+ for key in self.input_handles:
+ value = batch_dict[key]
+ if key == "attention_mask":
+ if value.ndim == 2:
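+                    # Convert the 2D padding mask into the 4D additive mask
+                    # expected by the exported model: padded positions are
+                    # filled with a large negative value (-1e4).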
+ value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4
+ elif value.ndim != 4:
+ raise ValueError(
+ "Expect attention mask with ndim=2 or 4, but get ndim={}"
+ .format(value.ndim))
+ value = value.astype("float32")
+ else:
+ value = value.astype("int64")
+ input_dict[key] = value
results = self._infer(input_dict)
if infer_data is None:
infer_data = [[x] for x in results]
@@ -175,16 +209,8 @@ def infer_batch(self, encoded_inputs):
return infer_data
def preprocess(self, input_data: list):
- text = [InputExample(text_a=x) for x in input_data]
- inputs = [self._template.wrap_one_example(x) for x in text]
- inputs = {
- "input_ids":
- np.array([x["input_ids"] for x in inputs], dtype="int64"),
- "mask_ids":
- np.array([x["mask_ids"] for x in inputs], dtype="int64"),
- "soft_token_ids":
- np.array([x["soft_token_ids"] for x in inputs], dtype="int64")
- }
+ text = [{"text_a": x} for x in input_data]
+ inputs = [self.template(x) for x in text]
return inputs
@staticmethod
@@ -197,7 +223,7 @@ def postprocess(self, infer_data):
label_ids = np.argwhere(probs > threshold)
labels = [[] for _ in range(probs.shape[0])]
for idx, label_id in label_ids:
- labels[idx].append(self._label_list[label_id])
+ labels[idx].append(self.labels[label_id])
return {"label": labels}
def printer(self, result, input_data):
@@ -212,12 +238,10 @@ def printer(self, result, input_data):
for arg_name, arg_value in vars(args).items():
logger.info("{:20}: {}".format(arg_name, arg_value))
- export_path = os.path.dirname(args.model_path_prefix)
- labels, _ = Verbalizer.load_from(export_path)
+ predictor = MultiLabelPredictor(args)
text_dir = os.path.join(args.data_dir, "data.txt")
with open(text_dir, "r", encoding="utf-8") as f:
text_list = [x.strip() for x in f.readlines()]
- predictor = MultiLabelPredictor(args, labels)
predictor.predict(text_list)
diff --git a/applications/text_classification/multi_label/few-shot/requirements_cpu.txt b/applications/text_classification/multi_label/few-shot/requirements_cpu.txt
new file mode 100644
index 000000000000..bbe76e363f00
--- /dev/null
+++ b/applications/text_classification/multi_label/few-shot/requirements_cpu.txt
@@ -0,0 +1,5 @@
+psutil
+paddlepaddle>=2.4rc
+paddlenlp>=2.4.3
+paddle2onnx>=1.0.3
+onnxruntime
diff --git a/applications/text_classification/multi_label/few-shot/requirements_gpu.txt b/applications/text_classification/multi_label/few-shot/requirements_gpu.txt
new file mode 100644
index 000000000000..66454bd8b6b5
--- /dev/null
+++ b/applications/text_classification/multi_label/few-shot/requirements_gpu.txt
@@ -0,0 +1,7 @@
+psutil
+paddlepaddle-gpu>=2.4rc
+paddlenlp>=2.4.3
+paddle2onnx>=1.0.3
+onnxruntime-gpu
+onnx
+onnxconverter-common
diff --git a/applications/text_classification/multi_label/few-shot/train.py b/applications/text_classification/multi_label/few-shot/train.py
index 44f0c190c90f..6d12f6b3e5fd 100644
--- a/applications/text_classification/multi_label/few-shot/train.py
+++ b/applications/text_classification/multi_label/few-shot/train.py
@@ -15,6 +15,7 @@
from dataclasses import dataclass, field
import os
import sys
+from collections import defaultdict
import paddle
import paddle.nn.functional as F
@@ -41,8 +42,6 @@
class DataArguments:
data_dir: str = field(default="./data", metadata={"help": "The dataset dictionary includes train.txt, dev.txt and label.txt files."})
prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."})
- soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."})
- encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."})
@dataclass
@@ -67,17 +66,20 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
# Define the template for preprocess and the verbalizer for postprocess.
- template = AutoTemplate.create_from(
- data_args.prompt,
- tokenizer,
- training_args.max_seq_length,
- model=model,
- prompt_encoder=data_args.soft_encoder,
- encoder_hidden_size=data_args.encoder_hidden_size)
- logger.info("Using template: {}".format(template.template))
+ template = AutoTemplate.create_from(data_args.prompt,
+ tokenizer,
+ training_args.max_seq_length,
+ model=model)
+ logger.info("Using template: {}".format(template.prompt))
label_file = os.path.join(data_args.data_dir, "label.txt")
- verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file)
+ with open(label_file, "r", encoding="utf-8") as fp:
+ label_words = defaultdict(list)
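+        # Each line of label.txt is "label" or "label==label_word"; when no
+        # label word is given, the text after the last "##" in the label name
+        # is used as the label word.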
+ for line in fp:
+ data = line.strip().split("==")
+ word = data[1] if len(data) > 1 else data[0].split("##")[-1]
+ label_words[data[0]].append(word)
+ verbalizer = SoftVerbalizer(label_words, tokenizer, model)
# Load the few-shot datasets.
train_ds, dev_ds, test_ds = load_local_dataset(
@@ -139,11 +141,24 @@ def compute_metrics(eval_preds):
# Export static model.
if training_args.do_export:
+ template = prompt_model.template
+ template_keywords = template.extract_template_keywords(template.prompt)
input_spec = [
- InputSpec(shape=[None, None], dtype="int64"), # input_ids
- InputSpec(shape=[None, None], dtype="int64"), # mask_ids
- InputSpec(shape=[None, None], dtype="int64"), # soft_token_ids
+ InputSpec(shape=[None, None], dtype="int64"), # input_ids,
+ InputSpec(shape=[None, None], dtype="int64"), # token_type_ids
+ InputSpec(shape=[None, None], dtype="int64"), # position_ids
+ InputSpec(shape=[None, None, None, None],
+ dtype="float32") # attention_mask
]
+ if "mask" in template_keywords:
+ input_spec.append(InputSpec(shape=[None],
+ dtype="int64")) # masked_positions
+ if "soft" in template_keywords:
+ input_spec.append(InputSpec(shape=[None, None],
+ dtype="int64")) # soft_token_ids
+ if "encoder" in template_keywords:
+ input_spec.append(InputSpec(shape=[None, None],
+ dtype="int64")) # encoder_ids
export_path = os.path.join(training_args.output_dir, 'export')
trainer.export_model(export_path,
input_spec=input_spec,
diff --git a/applications/text_classification/multi_label/few-shot/utils.py b/applications/text_classification/multi_label/few-shot/utils.py
index 6891c22829fc..4855e43e7bf3 100644
--- a/applications/text_classification/multi_label/few-shot/utils.py
+++ b/applications/text_classification/multi_label/few-shot/utils.py
@@ -15,7 +15,6 @@
import os
from paddlenlp.datasets import load_dataset
-from paddlenlp.prompt import InputExample
def load_local_dataset(data_path, splits, label_list):
@@ -39,14 +38,14 @@ def _reader(data_file, label_list):
for idx, line in enumerate(fp):
data = line.strip().split("\t")
if len(data) == 1:
- yield InputExample(text_a=data[0])
+ yield {"text_a": data[0]}
else:
text, label = data
label = label.strip().split(",")
label = [
float(1) if x in label else float(0) for x in label_list
]
- yield InputExample(text_a=text, labels=label)
+ yield {"text_a": text, "labels": label}
split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
datasets = []
diff --git a/docs/advanced_guide/prompt.md b/docs/advanced_guide/prompt.md
index e45aca61d4b6..c8b720261302 100644
--- a/docs/advanced_guide/prompt.md
+++ b/docs/advanced_guide/prompt.md
@@ -22,11 +22,11 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi
* [如何定义模板](#如何定义模板)
* [离散型模板](#离散型模板)
* [连续型模板](#连续型模板)
+ * [前缀连续型模板](#前缀连续型模板)
* [快速定义模板](#快速定义模板)
* [如何定义标签词映射](#如何定义标签词映射)
- * [单掩码映射](#单掩码映射)
- * [多掩码映射](#多掩码映射)
- * [标签词映射分类](#标签词映射分类)
+ * [离散型标签词映射](#离散型标签词映射)
+ * [连续型标签词映射](#连续型标签词映射)
* [快速开始训练](#快速开始训练)
* [数据准备](#数据准备)
* [预训练参数准备](#预训练参数准备)
@@ -39,18 +39,28 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi
## 如何定义模板
-**模板**(Template)的功能是在原有输入文本上增加提示语句,从而将原任务转化为 MLM 任务,可以分为离散型和连续型两种。Prompt API 中提供了统一的数据结构来构造不同类型的模板,输入相应格式的**字符串**,通过解析得到对应的输入模板,即字典构成的列表。
+**模板**(Template)的功能是在原有输入文本上增加提示语句,从而将原任务转化为 MLM 任务,可以分为离散型和连续型两种。Prompt API 中提供了统一的数据结构来构造不同类型的模板:输入相应格式的**字符串**,解析后即可得到对应的输入模板。模板由不同字段构成,可任意组合;每个字段中的关键字定义了数据文本或提示文本(即最终的 `input_ids`),字段属性则定义该字段是否可截断,以及对应的 `position_ids`、`token_type_ids` 等。
### 离散型模板
离散型模板 `ManualTemplate` 是直接将提示语句与原始输入文本拼接起来,二者的词向量矩阵共享,均为预训练模型学到的词向量矩阵。可用于实现 PET、RGL 等算法。
-**模板关键字**
+**模板关键字及属性**
-- ``text`` :数据集中原始输入文本对应的关键字,包括`text_a`和`text_b`。[数据准备](#数据准备)中介绍了如何将自定义数据集转化为统一格式。
-- ``hard`` :自定义的文本提示语句。
+- ``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`、`text_b` 和 `content`。
+- ``hard`` :自定义的提示语句文本。
- ``mask`` :待预测词的占位符。
-- ``sep`` :用于区分不同的句子。`sep`前后的句子对应不同的`token_type_id`。
+ - ``length`` :定义 ``mask`` 的数量。
+- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。
+- ``options`` :数据集字典或者文件中的候选标签序列。
+ - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。
+ - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。
+
+**模板通用属性**
+
+- `position`: 定义当前字段的起始 `position id`。
+- `token_type`: 定义当前字段及后续字段的 `token type id`。
+- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。
**模板定义**
@@ -64,16 +74,25 @@ Prompt API 提供了这类算法实现的基本模块,支持[PET](https://arxi
“{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'}
```
+```
+{'options': './data/label.txt'}{'sep'}下边两句话间的逻辑关系是什么?{'text': 'text_a'}{'sep': None, 'token_type': 1}{'text': 'text_b'}
+```
+其中 `label.txt` 为候选标签的本地文件路径,每行一个候选标签,例如
+
+```
+中立
+蕴含
+矛盾
+```
+
**样本示例**
例如,对于自然语言推理任务,给定样本
```python
-from paddlenlp.prompt import InputExample
-sample = InputExample(uid=0,
- text_a="心里有些生畏,又不知畏惧什么",
- text_b="心里特别开心",
- labels="矛盾")
+sample = {
+ "text_a": "心里有些生畏,又不知畏惧什么", "text_b": "心里特别开心", "labels": "矛盾"
+}
```
按照模板修改拼接后,最终输入模型的文本数据为
@@ -90,17 +109,17 @@ from paddlenlp.prompt import ManualTemplate
from paddlenlp.transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
-template = ManualTemplate(tokenizer=tokenizer,
- max_seq_length=512,
- template="“{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'}")
-input_dict = template.wrap_one_example(sample)
+template = ManualTemplate(prompt="“{'text': 'text_a'}”和“{'text': 'text_b'}”之间的逻辑关系是{'mask'}",
+ tokenizer=tokenizer,
+ max_length=512)
+input_dict = template(sample)
```
其中初始化参数定义如下
+- ``prompt`` :定义提示语句以及与输入文本组合方式的字符串。
- ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。
-- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。
-- ``template`` :定义提示语句以及与输入文本组合方式的字符串。
+- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。
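+
+`input_dict` 即单条样本经模板处理后的编码结果。组 batch 时可以配合 `PromptDataCollatorWithPadding` 进行 padding,下面给出一个简单示意(仅作参考):
+
+```python
+from paddlenlp.prompt import PromptDataCollatorWithPadding
+
+# 将若干条编码结果 padding 成等长,组成一个 numpy 格式的 batch
+collate_fn = PromptDataCollatorWithPadding(tokenizer,
+                                           padding=True,
+                                           return_tensors="np",
+                                           return_attention_mask=True)
+batch = collate_fn([input_dict])
+```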
**使用技巧**
@@ -115,24 +134,42 @@ input_dict = template.wrap_one_example(sample)
**模板关键字**
-- ``text`` :数据集中原始输入文本对应的关键字,包括`text_a`和`text_b`。[数据准备](#数据准备)中介绍了如何将自定义数据集转化为统一格式。
+- ``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`和`text_b`。
- ``hard`` :自定义的文本提示语句。
- ``mask`` :待预测词的占位符。
-- ``sep`` :用于区分不同的句子。`sep`前后的句子对应不同的`token_type_id`。
-- ``soft`` 表示连续型提示。若值为 ``None`` ,则使用对应数量的随机初始化向量作为提示;若值为文本,则使用对应长度的连续性向量作为提示,并预训练词向量中文本对应的向量进行初始化。
+- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。
+- ``soft`` 表示连续型提示。若值为 ``None`` ,则随机初始化提示向量;若值为文本,则使用文本对应的预训练字向量初始化提示向量。
+ - ``length`` :定义 ``soft token`` 的数量。若初始化文本的长度小于该值,超出文本长度的部分随机初始化。
+ - ``encoder`` :定义 `soft token` 的编码器类型,可选 `lstm`、`mlp`。默认为 `None`,即不使用编码器。
+ - ``hidden_size`` :定义编码器的隐藏层维度。默认与预训练词向量维度相同。
+- ``options`` :数据集字典或者文件中的候选标签序列。
+ - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。
+ - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。
+
+**模板通用属性**
+
+- `position`: 定义当前字段的起始 `position id`。
+- `token_type`: 定义当前字段及后续字段的 `token type id`。
+- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。
**模板定义**
- 定义长度为 1 的连续型提示,随机初始化:
```python
-"{'soft': None}{'text': 'text_a'}{'sep'}{'text': 'text_b'}"
+"{'soft'}{'text': 'text_a'}{'sep': None, 'token_type': 1}{'text': 'text_b'}"
+```
+
+- 定义长度为 10 的连续型提示,随机初始化,编码器为 `mlp`:
+
+```python
+"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': None, 'length':10, 'encoder': 'mlp'}{'mask'}"
```
-- 定义长度为 10 的连续型提示,随机初始化,其中 ``duplicate`` 参数表示连续型提示的长度(仅在随机初始化时有效,即`soft`值为`None`):
+- 定义长度为 15 的连续型提示,使用 `请判断:` 初始化前四个 soft token,其余随机初始化,编码器为隐藏层维度为 100 的双层 LSTM:
```python
-"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': None, `duplicate`:10}{'mask'}"
+"{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断:', 'length': 15, 'encoder': 'lstm', 'hidden_size': 100}{'mask'}"
```
- 定义长度为 15 的连续型提示,使用 `"请判断这两个句子间的逻辑关系:"` 的预训练词向量逐一进行初始化:
@@ -156,38 +193,98 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
-template = SoftTemplate(tokenizer=tokenizer,
- max_seq_length=512,
- model=model,
- template="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}",
- prompt_encoder='lstm',
- encoder_hidden_size=200)
+template = SoftTemplate(prompt="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}",
+ tokenizer=tokenizer,
+ max_length=512,
+ word_embeddings=model.get_input_embeddings())
```
其中初始化参数定义如下
+- ``prompt`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。
- ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。
-- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。
-- ``model`` : 预训练语言模型,为了取预训练词向量用于连续型提示向量初始化。
-- ``template`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。
-- ``prompt_encoder`` : 连续型提示向量的编码器,可选 ``mlp`` 和 ``lstm``。默认为 ``None`` ,即无编码器,直接使用向量。
-- ``encoder_hidden_size`` : 连续型提示向量的维度。默认为 ``None`` ,即与预训练词向量维度相同。
+- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。
+- ``word_embeddings`` :预训练语言模型的词向量,用于连续型提示向量初始化。
+- ``soft_embeddings`` :连续型提示向量矩阵,可用于不同模板间的连续型参数共享。设置后将覆盖默认连续型向量矩阵。
**使用技巧**
- 对于分类任务,推荐的连续型提示长度一般为10-20。
- 对于随机初始化的连续性 prompt 向量,通常用比预训练模型微调更大的学习率来更新参数。
- 与离散型模板相似,连续型模板对初始化参数也比较敏感。自定义提示语句作为连续性 prompt 向量的初始化参数通常比随机初始化效果好。
-- prompt_encoder 为已有论文中的策略,用于建模不同连续型提示向量之间的序列关系。在实际应用中推荐先去掉 prompt_encoder 调整向量初始化。
+- `encoder`(即 `soft` 字段的编码器)为已有论文中的策略,用于建模不同连续型提示向量之间的序列关系。
+
+
+### 前缀连续型模板
+
+`PrefixTemplate` 同样使用连续型向量作为提示。与 `SoftTemplate` 不同的是,该模板的提示向量不仅作用于输入层,模型的每一层都会加入相应的提示向量。可用于实现 P-Tuning 等算法。
+
+**模板关键字**
+
+- ``text`` :数据集中原始输入文本对应的关键字,例如,`text_a`和`text_b`。
+- ``hard`` :自定义的文本提示语句。
+- ``mask`` :待预测词的占位符。
+- ``sep`` :句间的标志符。不同句子的 `token_type_ids` 需使用 `token_type` 属性定义,默认相同。
+- ``prefix`` 表示连续型提示,该字段**必须**位于模板首位。若值为 ``None`` ,则随机初始化提示向量;若值为文本,则使用文本对应的预训练字向量初始化提示向量。
+ - ``length`` :定义 ``soft token`` 的数量。若初始化文本的长度小于该值,超出文本长度的部分随机初始化。
+ - ``encoder`` :定义 `soft token` 的编码器类型,可选 `lstm`、`mlp`。默认为 `None`,即不使用编码器。
+ - ``hidden_size`` :定义编码器的隐藏层维度。默认与预训练词向量维度相同。
+- ``options`` :数据集字典或者文件中的候选标签序列。
+ - ``add_omask`` :在每个标签前新增 `[O-MASK]` 字符,用于计算候选标签的预测值。支持实现 [UniMC](https://arxiv.org/pdf/2210.08590.pdf) 算法。
+ - ``add_prompt`` :给每个标签拼接固定的提示文本,标签位置由 `[OPT]` 标记。支持实现 [EFL](https://arxiv.org/pdf/2104.14690.pdf) 算法。
+
+**模板通用属性**
+
+- `position`: 定义当前字段的起始 `position id`。
+- `token_type`: 定义当前字段及后续字段的 `token type id`。
+- `truncate`: 定义当提示和文本总长度超过最大长度时,当前字段是否可截断。可选 `True` 和 `False`。
+
+**模板定义**
+
+- 定义长度为 10 的前缀连续型提示,使用 `新闻类别` 初始化前四个 soft token,其余随机初始化,编码器为 `lstm`:
+
+```python
+"{'prefix': '新闻类别', 'length': 10, 'encoder': 'lstm'}{'text': 'text_a'}"
+```
+
+- 定义混合模板,其中 `prefix` 关键字对应的提示和 `hard` 关键字对应的提示分别使用两套不同的向量:
+
+```python
+"{'prefix': '自然语言推理任务:', 'encoder': 'mlp'}{'text': 'text_a'}{'sep'}{'text': 'text_b'}这两个句子间的逻辑关系是{'mask'}"
+```
+
+
+**调用 API**
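+        # Each line of label.txt is "label" or "label==label_word"; when no
+        # label word is given, the text after the last "##" in the label name
+        # is used as the label word.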
+
+```python
+from paddlenlp.prompt import PrefixTemplate
+from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM
+
+model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")
+tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
+template = PrefixTemplate(prompt="{'prefix': '任务描述'}{'text': 'text_a'}{'mask'}",
+ tokenizer=tokenizer,
+ max_length=512,
+ model=model,
+ prefix_dropout=0.1)
+```
+
+其中初始化参数定义如下
+
+- ``prompt`` :定义连续型模板的提示语句、初始化以及与输入文本组合方式的字符串。
+- ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。
+- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。
+- ``model`` :预训练语言模型,用于连续型提示向量初始化,以及根据模型结构生成每层对应的提示向量。
+- ``prefix_dropout`` :连续型提示向量的丢弃概率,用于正则化。
### 快速定义模板
-PaddleNLP 提供了 ``AutoTemplate`` API 以便快速定义单句输入的手工初始化的连续型模板,同时支持直接按照模板类型自动切换离散型模板和离散型模板。
+PaddleNLP 提供了 ``AutoTemplate`` API,既支持用简化的提示语句快速定义离散型模板,也可根据完整模板字符串自动选择并创建 ManualTemplate、SoftTemplate 或 PrefixTemplate。
**模板定义**
-- 只定义用于初始化连续型向量的文本提示,即可得到拼接到句尾的连续型模板输入。例如,
+- 快速定义离散型的文本提示。例如,
```python
"这篇文章表达了怎样的情感?"
@@ -196,7 +293,7 @@ PaddleNLP 提供了 ``AutoTemplate`` API 以便快速定义单句输入的手工
等价于
```python
-"{'text': 'text_a'}{'soft': '这篇文章表达了怎样的情感?'}{'mask'}"
+"{'text': 'text_a'}{'hard': '这篇文章表达了怎样的情感?'}{'mask'}"
```
- 当输入为完整模板字符串时,解析得到的模板与[离散型模板](#离散型模板)和[连续型模板](#连续型模板)中描述的一致。
@@ -210,40 +307,37 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
# 离散型模板,返回值为 ManualTemplate 实例
-template = AutoTemplate.create_from(template="{'text': 'text_a'}和{'text': 'text_b'}之间的逻辑关系是{'mask'}",
+template = AutoTemplate.create_from(prompt="这个句子表达了怎样的情感?",
+ tokenizer=tokenizer,
+ max_length=512)
+
+template = AutoTemplate.create_from(prompt="这个句子表达了怎样的情感?{'text': 'text_a'}{'mask'}",
tokenizer=tokenizer,
- max_seq_length=512)
+ max_length=512)
# 连续型模板,返回值为 SoftTemplate 实例
-template = AutoTemplate.create_from(template="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}",
+template = AutoTemplate.create_from(prompt="{'text': 'text_a'}{'sep'}{'text': 'text_b'}{'soft': '请判断这两个句子间的逻辑关系:'}{'mask'}",
tokenizer=tokenizer,
- max_seq_length=512,
- model=model,
- prompt_encoder='lstm',
- encoder_hidden_size=200)
+ max_length=512,
+ model=model)
-# 快速定义单句连续型模板,返回值为 SoftTemplate 实例
-template = AutoTemplate.create_from(template="这篇文章表达了怎样的情感?",
+# 前缀连续型模板,返回值为 PrefixTemplate 实例
+template = AutoTemplate.create_from(prompt="{'prefix': None, 'encoder': 'mlp', 'hidden_size': 50}{'text': 'text_a'}",
tokenizer=tokenizer,
- max_seq_length=512,
- model=model,
- prompt_encoder='lstm',
- encoder_hidden_size=200)
+ max_length=512,
+ model=model)
```
其中初始化参数定义如下
+- ``prompt`` :定义离散型/连续型提示、初始化方式以及与输入文本的组合方式。
- ``tokenizer`` :预训练模型的 tokenizer,用于文本编码。
-- ``max_seq_length`` :定义输入模型文本的最大长度,包括提示部分。当输入长度超过最大长度时,只会截断`text`关键字对应的输入文本,提示部分不做处理。
+- ``max_length`` :定义输入模型文本的最大长度,包括提示部分。
+- ``model`` :预训练语言模型,用于获取预训练词向量,以初始化连续型提示向量。
-- ``template`` :定义离散型/连续型提示、初始化以及和输入文本的组合方式。
-- ``prompt_encoder`` :连续型提示向量的编码器,可选 ``mlp`` 和 ``lstm`` 。默认为 ``None`` ,即无编码器,直接使用向量。
-- ``encoder_hidden_size`` :连续型提示向量的维度。默认为 ``None`` ,即与预训练词向量维度相同。
-
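+
+定义好模板后,可直接调用模板实例将单条数据样本编码为模型输入特征,例如(样本字典的关键字需与模板中 `text` 定义的值一致):
+
+```python
+example = {"text_a": "这是一条测试文本", "labels": 0}
+encoded = template(example)
+# 返回字典,通常包含 input_ids、token_type_ids、position_ids、soft_token_ids 等特征,
+# 模板未用到的字段(如 labels)会原样保留
+print(encoded.keys())
+```
+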
## 如何定义标签词映射
-**标签词映射**(Verbalizer)也是提示学习中可选的重要模块,用于建立预测词和标签之间的映射,将“预训练-微调”模式中预测标签的任务转换为预测模板中掩码位置的词语,从而将下游任务统一为预训练任务的形式。目前框架支持了离散型标签词映射和 [Word-level Adversarial ReProgramming (WARP)](https://aclanthology.org/2021.acl-long.381/) 方法。
+**标签词映射**(Verbalizer)也是提示学习中可选的重要模块,用于建立预测词和标签之间的映射,将“预训练-微调”模式中预测标签的任务转换为预测模板中掩码位置的词语,从而将下游任务统一为预训练任务的形式。目前框架支持了离散型标签词映射和连续型标签词映射 [Word-level Adversarial ReProgramming (WARP)](https://aclanthology.org/2021.acl-long.381/) 方法。
例如,在情感二分类任务中,微调方法和提示学习的标签体系如下
@@ -259,9 +353,9 @@ template = AutoTemplate.create_from(template="这篇文章表达了怎样的情
具体来说,对于模板 ``{'text':'text_a'}这句话表示我{'mask'}满意。`` ,我们使用映射 ``{'负向': '不', '正向': '很'}`` 将标签 ``负向`` 映射为 ``不`` ,将标签 ``正向`` 映射为 ``很`` 。也就是说,我们期望对于正向情感的文本,预测结果为 ``...这句话表示我很满意。`` ,对于负向情感的文本,预测结果为 ``...这句话表示我不满意。``
-### 单掩码映射
+### 离散型标签词映射
-``ManualVerbalizer`` 支持构造简单的单 ``{'mask'}`` 标签词映射,直接作用于 ``AutoMaskedLM`` 模型结构。当标签对应的预测词长度大于 ``1`` 时取均值。
+``ManualVerbalizer`` 用于构造 ``{'mask'}`` 对应的标签词映射,支持多个 ``{'mask'}``,直接作用于 ``AutoMaskedLM`` 模型结构。当标签对应的预测词长度大于 ``1`` 时,默认取均值。
**调用 API**
@@ -271,41 +365,15 @@ from paddlenlp.transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
verbalizer = ManualVerbalizer(tokenizer=tokenizer,
- labels=['负向', '正向'],
- label_words={'负向': '不', '正向': '很'},
- prefix=None)
+ label_words={'负向': '不', '正向': '很'})
```
其中初始化参数定义如下
+- ``label_words`` : 原标签到预测词之间的映射字典。
- ``tokenizer`` : 预训练模型的 tokenizer,用于预测词的编码。
-- ``labels`` : 数据集的原标签列表(可选)。
-- ``label_words`` : 原标签到预测词之间的映射字典。如果同时定义了 ``labels`` ,二者的标签集合需要相同。
-- ``prefix`` : 预测词解码前增加的前缀,用于 ``RoBERTa`` 等对前缀敏感的模型,例如 `roberta-large`, `good` 和 ` good` 经过 tokenize 会得到不同的 id。默认为 ``None`` ,无前缀。
-
-### 多掩码映射
-
-``MultiMaskVerbalizer`` 继承自 ``ManualVerbalizer`` ,支持多 ``{'mask'}`` 标签词映射。预测词长度需与 ``{'mask'}`` 长度一致。
-
-**调用 API**
-
-```python
-from paddlenlp.prompt import MultiMaskVerbalizer
-from paddlenlp.transformers import AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
-verbalizer = MultiMaskVerbalizer(tokenizer=tokenizer,
- labels=['负向', '正向'],
- label_words={'负向': '生气', '正向': '高兴'},
- prefix=None)
-```
-
-
-其中初始化参数定义同[单掩码映射](#单掩码映射) 。
-
-
-### 标签词映射分类
+### 连续型标签词映射
标签词映射分类器 ``SoftVerbalizer`` 修改了原 ``AutoMaskedLM`` 的模型结构,将预训练模型最后一层“隐藏层-词表”替换为“隐藏层-标签”的映射。该层网络的初始化参数由标签词映射中的预测词词向量来决定,如果预测词长度大于 ``1`` ,则使用词向量均值进行初始化。当前支持的预训练模型包括 ``ErnieForMaskedLM`` 、 ``BertForMaskedLM`` 、 ``AlbertForMaskedLM`` 和 ``RobertaForMaskedLM`` 。可用于实现 WARP 算法。
@@ -318,15 +386,13 @@ from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
-verbalizer = SoftVerbalizer(tokenizer=tokenizer,
- model=model,
- labels=['负向', '正向'],
- label_words={'负向': '生气', '正向': '高兴'},
- prefix=None)
+verbalizer = SoftVerbalizer(label_words={'负向': '生气', '正向': '高兴'},
+ tokenizer=tokenizer,
+ model=model)
```
-其中初始化参数定义同[单掩码映射](#单掩码映射) ,此外
-
+- ``label_words`` : 原标签到预测词之间的映射字典。
+- ``tokenizer`` : 预训练模型的 tokenizer,用于预测词的编码。
- ``model`` :预训练语言模型,用于取预训练词向量进行“隐藏层-标签”网络的修改和初始化。
## 快速开始训练
@@ -335,29 +401,24 @@ verbalizer = SoftVerbalizer(tokenizer=tokenizer,
### 数据准备
-Prompt 框架定义了统一的样本结构 ``InputExample`` 以便进行数据处理,数据集样本需要封装在 ``MapDataset`` 中。
+数据集封装为 ``MapDataset`` 类型。每条数据格式为字典结构,字典中关键字与模板中 `text` 定义的值相对应,统一使用 `labels` 关键字表示样本标签。
-例如,对于文本语义相似度 BUSTM 数据集中的原始样本
+例如,文本语义相似度 BUSTM 数据集中的数据样本
```python
-data = [
+from paddlenlp.datasets import MapDataset
+
+data_ds = MapDataset([
{'id': 3, 'sentence1': '你晚上吃了什么', 'sentence2': '你晚上吃啥了', 'label': 1},
{'id': 4, 'sentence1': '我想打开滴滴叫的士', 'sentence2': '你叫小欧吗', 'label': 0},
{'id': 5, 'sentence1': '女孩子到底是不是你', 'sentence2': '你不是女孩子吗', 'label': 1}
-]
-```
+])
+def convert_label_keyword(input_dict):
+ input_dict["labels"] = input_dict.pop("label")
+ return input_dict
-需要转换为统一格式
-
-```python
-from paddlenlp.datasets import MapDataset
-from paddlenlp.prompt import InputExample
-
-data_ds = MapDataset([InputExample(uid=example["id"],
- text_a=example["sentence1"],
- text_b=example["sentence2"],
- labels=example["label"]) for example in data])
+data_ds = data_ds.map(convert_label_keyword)
```
### 预训练参数准备
@@ -383,13 +444,13 @@ from paddlenlp.prompt import ManualVerbalizer
from paddlenlp.prompt import PromptModelForSequenceClassification
# 定义模板
-template = AutoTemplate.create_from(template="{'text': 'text_a'}和{'text': 'text_b'}说的是{'mask'}同的事情。",
+template = AutoTemplate.create_from(prompt="{'text': 'text_a'}和{'text': 'text_b'}说的是{'mask'}同的事情。",
tokenizer=tokenizer,
- max_seq_length=512)
+ max_length=512)
# 定义标签词映射
-verbalizer = ManualVerbalizer(tokenizer=tokenizer,
- label_words={0: '不', 1: '相'})
+verbalizer = ManualVerbalizer(label_words={0: '不', 1: '相'},
+ tokenizer=tokenizer)
# 定义文本分类提示模型
prompt_model = PromptModelForSequenceClassification(model,
@@ -404,8 +465,8 @@ prompt_model = PromptModelForSequenceClassification(model,
- ``model`` : 预训练模型实例,支持 ``AutoModelForMaskedLM`` 和 ``AutoModelForSequenceClassification`` 。
- ``template`` : 模板实例。
- ``verbalizer`` : 标签词映射实例。当设为 ``None`` 时,不使用标签词映射,模型输出及损失值计算由 ``model`` 类型定义。
-- ``freeze_plm`` : 在训练时是否固定预训练模型参数。对于规模较小的预训练模型,推荐更新预训练模型参数。
-- ``freeze_dropout`` : 在训练时是否固定预训练模型参数并关闭 ``dropout`` 。 当 ``freeze_dropout=True`` ,``freeze_plm`` 也为 ``True`` 。
+- ``freeze_plm`` : 在训练时固定预训练模型参数,默认为 `False`。对于轻量级预训练模型,推荐使用默认值。
+- ``freeze_dropout`` : 在训练时固定预训练模型参数并关闭 ``dropout`` 。 当 ``freeze_dropout=True`` ,``freeze_plm`` 也为 ``True`` 。
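+
+以上模型与前文定义的模板、标签词映射组合后,即可对编码样本做前向计算。以下是一个简化的前向示意(batch 组装方式仅供参考,假设沿用上文创建的 `tokenizer`、`template`、`prompt_model`):
+
+```python
+from paddlenlp.prompt import PromptDataCollatorWithPadding
+
+# 用模板编码单条样本,并通过数据整理器组成 batch
+collator = PromptDataCollatorWithPadding(tokenizer, padding=True, return_tensors="pd")
+batch = collator([template({"text_a": "你晚上吃了什么", "text_b": "你晚上吃啥了", "labels": 1})])
+labels = batch.pop("labels")  # 标签仅用于计算损失,前向计算前先取出
+logits = prompt_model(**batch)
+```
+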
### 使用PromptTrainer训练
@@ -497,6 +558,8 @@ if training_args.do_train:
- WARP: Word-level Adversarial ReProgramming. [[PDF]](https://aclanthology.org/2021.acl-long.381/)
- RGL: A Simple yet Effective Relation Graph Augmented Prompt-based Tuning Approach for Few-Shot Learning. [[PDF]](https://aclanthology.org/2022.findings-naacl.81/)
- R-Drop: Regularized Dropout for Neural Networks. [[PDF]](https://arxiv.org/abs/2106.14448)
+- OpenPrompt: An Open-source Framework for Prompt-learning. [[PDF]](https://arxiv.org/abs/2111.01998)
+
### 附录
@@ -506,9 +569,9 @@ if training_args.do_train:
| 参数 | 类型 | 默认值 | 含义 |
| ---------------- | ------ | ------- | ------------------------------------------------------- |
-| max_seq_length | int | 512 | 模型输入的最大长度,包括模板部分 |
-| freeze_plm | bool | False | 是否在训练时固定预训练模型的参数 |
-| freeze_dropout | bool | False | 是否在训练时固定预训练模型的参数,同时关闭 dropout |
+| max_seq_length | int | 512 | 模型输入的最大长度,包括模板部分 |
+| freeze_plm | bool | False | 是否在训练时固定预训练模型的参数 |
+| freeze_dropout | bool | False | 是否在训练时固定预训练模型的参数,同时关闭 dropout |
| use_rdrop | bool | False | 是否使用 RDrop 策略,详见 [RDrop 论文](https://arxiv.org/abs/2106.14448) |
| alpha_rdrop | float | 5.0 | RDrop Loss 的权重 |
| use_rgl | bool | False | 是否使用 RGL 策略,详见 [RGL 论文](https://aclanthology.org/2022.findings-naacl.81/) |
diff --git a/paddlenlp/prompt/prompt_args.py b/paddlenlp/prompt/prompt_args.py
index 7c8ca507c82f..fcab68b63754 100644
--- a/paddlenlp/prompt/prompt_args.py
+++ b/paddlenlp/prompt/prompt_args.py
@@ -91,7 +91,7 @@ class PromptTuningArguments(TrainingArguments):
metadata={"help": "Epsilon for the AdamW optimizer of prompt."})
def __post_init__(self):
- super().__post_init__()
+ super(PromptTuningArguments, self).__post_init__()
if self.use_rgl and self.alpha_rgl == 0.0:
logger.warning("Ignore `use_rgl` because `alpha_rgl` = 0. Please "\
"set `alpha_rgl` a positive float to use RGL loss.")
diff --git a/paddlenlp/prompt/prompt_tokenizer.py b/paddlenlp/prompt/prompt_tokenizer.py
index efb2e80bce40..46af802586e1 100644
--- a/paddlenlp/prompt/prompt_tokenizer.py
+++ b/paddlenlp/prompt/prompt_tokenizer.py
@@ -16,122 +16,194 @@
import warnings
from functools import partial
from collections import defaultdict
+from typing import Any, Dict, List, Union
import numpy as np
-from .prompt_utils import InputFeatures
-
__all__ = ["MLMPromptTokenizer"]
class MLMPromptTokenizer(object):
- def __init__(self, tokenizer, max_seq_length, **kwargs):
- self._tokenizer = tokenizer
- self._max_seq_len = max_seq_length
- self._num_special_tokens = self._tokenizer.num_special_tokens_to_add()
- self._special_map = {
-            "<cls>": "cls_token",
-            "<sep>": "sep_token",
-            "<pad>": "pad_token",
-            "<unk>": "unk_token",
-            "<mask>": "mask_token"
- }
- self.mask_token_id = self._tokenizer.mask_token_id
- self.pad_token_id = self._tokenizer.pad_token_id
- self.soft_token_id = self._tokenizer.unk_token_id
-
- def __call__(self, input_list):
- encoded_input = defaultdict(list)
-
- for input_dict in input_list:
- # Format text and special tokens, then convert them to ids.
- if input_dict["mask_ids"] == 1:
- text = [self.mask_token_id]
-
- if input_dict["text"] in self._special_map:
- special_token = getattr(self._tokenizer,
- self._special_map[input_dict["text"]])
- input_dict["text"] = special_token
-
- soft_ids = input_dict.get("soft_token_ids", None)
- if soft_ids is not None and soft_ids == 1:
- text = [self.soft_token_id]
- else:
- text = self._tokenizer.encode(
- input_dict["text"],
+ omask_token = "[O-MASK]"
+
+ def __init__(self, tokenizer, max_length):
+ self.tokenizer = tokenizer
+ self.max_length = max_length
+
+ def __call__(self, inputs: List[Dict[str, Any]]):
+ part_text = [part["text"] for part in inputs]
+ part_do_truncate = [part["do_truncate"] for part in inputs]
+ max_lengths = self._create_max_lengths_from_do_truncate(
+ part_text, part_do_truncate)
+
+ encoded_inputs = defaultdict(list)
+ option_length = None
+ last_position = 1 # Id 0 denotes special token '[CLS]'.
+ last_token_type = 0
+ for index, part in enumerate(inputs):
+ # Create input_ids.
+ soft_token_ids = part.get("soft_tokens", None)
+ if soft_token_ids is None or len(
+ soft_token_ids) == 1 and soft_token_ids[0] == 0:
+ input_ids = self.tokenizer.encode(
+ part["text"],
add_special_tokens=False,
- return_token_type_ids=False)["input_ids"]
- encoded_input["input_ids"].append(text)
-
- # Extend other features as the same length of input ids.
- for key in input_dict:
- if key != "text":
- encoded_input[key].append([input_dict[key]] * len(text))
-
- max_seq_len = self._max_seq_len - self._num_special_tokens
- encoded_input = self.truncate(encoded_input, max_seq_len)
- encoded_input.pop("shortenable_ids")
- encoded_input = self.join(encoded_input)
-
- encoded_input = self.add_special_tokens(encoded_input)
- encoded_input = self.pad(encoded_input, self._max_seq_len,
- self.pad_token_id)
- return encoded_input
-
- def add_special_tokens(self, input_dict):
+ return_token_type_ids=False,
+ truncation=True,
+ max_length=max_lengths[index])["input_ids"]
+ encoded_inputs["soft_token_ids"].append([0] * len(input_ids))
+ else:
+ input_ids = soft_token_ids
+ encoded_inputs["soft_token_ids"].append(soft_token_ids)
+ encoded_inputs["input_ids"].append(input_ids)
+ part_length = len(input_ids)
+
+ # Create position_ids.
+ position_ids, last_position = self._create_position_ids_from_part(
+ input_ids, part, last_position)
+ encoded_inputs["position_ids"].append(position_ids)
+
+ # Create token_type_ids.
+ if "token_types" in part:
+ last_token_type = part["token_types"]
+ encoded_inputs["token_type_ids"].append([last_token_type] *
+ part_length)
+
+ # Create other features like encoder_ids.
+ for name in part:
+ if name not in [
+ "text", "soft_tokens", "positions", "token_types"
+ ]:
+ encoded_inputs[name].append([part[name]] * part_length)
+
+ # Record the length of options if exists.
+ if self.omask_token in part["text"]:
+ if option_length is not None:
+ raise ValueError(
+ "There are more than one sequence of options, which "
+ "will cause wrong attention masks.")
+ option_length = len(input_ids)
+
+ encoded_inputs.pop("do_truncate")
+ encoded_inputs = self.join(encoded_inputs)
+ encoded_inputs = self.add_special_tokens(encoded_inputs)
+ attention_mask = self._create_attention_mask(
+ encoded_inputs["input_ids"], option_length)
+ if attention_mask is not None:
+ encoded_inputs["attention_mask"] = attention_mask
+ masked_positions = self._create_masked_positions(
+ encoded_inputs["input_ids"], encoded_inputs["soft_token_ids"])
+ if masked_positions is not None:
+ encoded_inputs["masked_positions"] = masked_positions
+ return encoded_inputs
+
+ def _create_position_ids_from_part(self, input_ids: List[int],
+ part: Dict[str,
+ Any], last_position: int):
+ """
+ Create position ids from prompt for each part.
+ """
+ part_length = len(input_ids)
+ if "positions" in part and part["positions"] > 0:
+ last_position = part["positions"]
+ if self.omask_token in part["text"]:
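+            # Restart position ids for every option span so that all options share the same position range.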
+ omask_id = self.tokenizer.convert_tokens_to_ids(self.omask_token)
+ omask_index = [
+ x for x in range(part_length) if input_ids[x] == omask_id
+ ]
+ omask_index = [0] + omask_index
+ position_ids = []
+ max_index = 0
+ for start_id, end_id in zip(omask_index[:-1], omask_index[1:]):
+ position_ids.extend(
+ list(range(last_position,
+ last_position + end_id - start_id)))
+ max_index = max(end_id - start_id, max_index)
+ if len(position_ids) < part_length:
+ difference = part_length - len(position_ids)
+ position_ids.extend(
+ range(last_position, last_position + difference))
+ max_index = max(difference, max_index)
+ last_position += max_index
+ else:
+ position_ids = list(
+ range(last_position, last_position + part_length))
+ last_position += part_length
+ return position_ids, last_position
+
+ def _create_max_lengths_from_do_truncate(self, part_text: List[str],
+ part_do_truncate: List[bool]):
+ """
+ Create the max sequence length of each part.
+ """
+ text_length = sum([len(x) for x in part_text])
+ if text_length < self.max_length:
+ return [None] * len(part_text)
+
+ num_special_token = self.tokenizer.num_special_tokens_to_add()
+ cut_length = text_length - self.max_length + num_special_token
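+        # Spread the length to cut over the truncatable parts, starting from the tokenizer's truncation side.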
+ max_lengths = []
+ if self.tokenizer.truncation_side == "right":
+ for index, part in enumerate(part_text[::-1]):
+ if part_do_truncate[-1 - index] and cut_length > 0:
+ max_lengths.append(max(len(part) - cut_length, 0))
+ cut_length = cut_length - len(part)
+ else:
+ max_lengths.append(None)
+ max_lengths = max_lengths[::-1]
+ else:
+            for index, part in enumerate(part_text):
+ if part_do_truncate[index] and cut_length > 0:
+ max_lengths.append(max(len(part) - cut_length, 0))
+ cut_length = cut_length - len(part)
+ else:
+ max_lengths.append(None)
+ return max_lengths
+
+ def _create_attention_mask(self, input_ids: List[int],
+ option_length: Union[int, None]):
+ if option_length is None:
+ return None
+ omask_id = self.tokenizer.convert_tokens_to_ids(self.omask_token)
+ input_ids = np.array(input_ids)
+ attention_mask = np.zeros([len(input_ids), len(input_ids)])
+ pad_index = np.where(input_ids == self.tokenizer.pad_token_id)[0]
+ attention_mask[:, pad_index] = 1
+ attention_mask[pad_index, :] = 1
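+        # Allow attention inside each option span while masking attention across different options.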
+ omask_index = np.where(input_ids == omask_id)[0].tolist()
+ opt_begin, opt_end = omask_index[0], omask_index[0] + option_length
+ attention_mask[opt_begin:opt_end, opt_begin:opt_end] = 1
+ omask_index.append(opt_end)
+ for opt_begin, opt_end in zip(omask_index[:-1], omask_index[1:]):
+ attention_mask[opt_begin:opt_end, opt_begin:opt_end] = 0
+ attention_mask = (1 - attention_mask) * -1e4
+ return attention_mask
+
+ def _create_masked_positions(self, input_ids: List[int],
+ soft_token_ids: List[int]):
+ non_soft_ids = np.array(input_ids) * (np.array(soft_token_ids) == 0)
+ mask_id = self.tokenizer.mask_token_id
+
+ masked_positions = np.where(non_soft_ids == mask_id)[0]
+ if masked_positions.shape[0] == 0:
+ return None
+ return masked_positions.tolist()
+
+ def add_special_tokens(self, input_dict: Dict[str, Any]):
for key in input_dict:
- new_inputs = self._tokenizer.build_inputs_with_special_tokens(
+ new_inputs = self.tokenizer.build_inputs_with_special_tokens(
input_dict[key])
if key != "input_ids":
special_mask = np.array(
- self._tokenizer.get_special_tokens_mask(input_dict[key]))
+ self.tokenizer.get_special_tokens_mask(input_dict[key]))
new_inputs = np.array(new_inputs)
+ # TODO (Huijuan): Use different ids according to specific keyword.
new_inputs[special_mask == 1] = 0
new_inputs = new_inputs.tolist()
input_dict[key] = new_inputs
return input_dict
- @staticmethod
- def truncate(input_dict, max_seq_len):
- total_tokens = sum([len(text) for text in input_dict["input_ids"]])
- trunc_length = total_tokens - max_seq_len
- if trunc_length > 0:
- truncated_dict = defaultdict(list)
- trunc_mask = input_dict["shortenable_ids"]
- for key in input_dict:
- content = input_dict[key]
- count = trunc_length
- for idx, text in enumerate(content[::-1]):
- index = -idx - 1
- if len(text) == 0 or trunc_mask[index][0] == 0:
- continue
- if count < len(text):
- content[index] = text[:-count]
- else:
- content[index] = []
- count -= len(text)
- if count <= 0:
- break
- truncated_dict[key] = content
- return truncated_dict
- else:
- return input_dict
-
- @staticmethod
- def pad(input_dict, max_seq_len, pad_id, other_pad_id=0):
- for key, content in input_dict.items():
- if len(content) > max_seq_len:
- raise ValueError(
- f"Truncated length of {key} is still longer than "
- f"{max_seq_len}, please use a shorter prompt.")
- if key == "input_ids":
- pad_seq = [pad_id] * (max_seq_len - len(content))
- else:
- pad_seq = [other_pad_id] * (max_seq_len - len(content))
- input_dict[key].extend(pad_seq)
- return input_dict
-
@staticmethod
def join(input_dict):
for key in input_dict:
diff --git a/paddlenlp/prompt/prompt_trainer.py b/paddlenlp/prompt/prompt_trainer.py
index 2ffd2d9fd58e..e46d8f810ed4 100644
--- a/paddlenlp/prompt/prompt_trainer.py
+++ b/paddlenlp/prompt/prompt_trainer.py
@@ -18,25 +18,22 @@
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
-from paddle.static import InputSpec
from ..datasets import MapDataset
from ..utils.log import logger
from ..trainer import Trainer, TrainerCallback
-from ..trainer.trainer_utils import EvalPrediction
+from ..trainer.trainer_utils import EvalPrediction, get_scheduler
from ..data import DataCollator
from ..losses import RDropLoss
from ..transformers import PretrainedTokenizer, export_model
from .template import AutoTemplate
from .verbalizer import SoftVerbalizer
-from .prompt_utils import InputFeatures, signature
+from .prompt_utils import signature, PromptDataCollatorWithPadding
from .prompt_args import PromptTuningArguments
__all__ = ["PromptTrainer", "PromptModelForSequenceClassification"]
-PROMPT_NAME = "prompt.pdparams"
-
class PromptTrainer(Trainer):
"""
@@ -67,18 +64,20 @@ def __init__(self,
args = PromptTuningArguments(output_dir=output_dir)
if data_collator is None:
- data_collator = InputFeatures.collate_fn
-
- super().__init__(model=model,
- criterion=criterion,
- args=args,
- data_collator=data_collator,
- train_dataset=train_dataset,
- eval_dataset=eval_dataset,
- tokenizer=tokenizer,
- compute_metrics=compute_metrics,
- callbacks=callbacks,
- optimizers=optimizers)
+ data_collator = PromptDataCollatorWithPadding(tokenizer,
+ padding=True,
+ return_tensors='pd')
+
+ super(PromptTrainer, self).__init__(model=model,
+ criterion=criterion,
+ args=args,
+ data_collator=data_collator,
+ train_dataset=train_dataset,
+ eval_dataset=eval_dataset,
+ tokenizer=tokenizer,
+ compute_metrics=compute_metrics,
+ callbacks=callbacks,
+ optimizers=optimizers)
self.load_state_dict_from_checkpoint(args.resume_from_checkpoint)
@@ -88,86 +87,74 @@ def __init__(self,
if self.args.use_rdrop:
self.rdrop_criterion = RDropLoss()
+ def _get_model(self):
+ model = self.model
+ if isinstance(model, paddle.DataParallel):
+ model = model._layers
+ return model
+
@property
def template(self):
- return self.model.template
+ return self._get_model().template
@template.setter
def template(self, template):
- self.model.template = template
+ self._get_model().template = template
@property
def verbalizer(self):
- return getattr(self.model, "verbalizer", None)
+ return self._get_model().verbalizer
@verbalizer.setter
def verbalizer(self, verbalizer):
- setattr(self.model, "verbalizer", verbalizer)
+ self._get_model().verbalizer = verbalizer
@property
- def plm(self):
- return self.model.plm
+ def pretrained_model(self):
+        return self._get_model().plm
- @plm.setter
- def plm(self, model):
- self.model.plm = model
+ @pretrained_model.setter
+ def pretrained_model(self, model):
+ self._set_model_attributes(self.model, "plm", model)
- def _map_dataset(self, dataset):
+ def _map_dataset(self, dataset: MapDataset):
if dataset is None:
return None
if not isinstance(dataset, MapDataset):
raise ValueError("Expected `MapDataset` but received {}.".format(
type(dataset)))
- return dataset.map(self._convert_example)
- def _convert_example(self, example):
- encoded_inputs = self.template.wrap_one_example(example)
- return encoded_inputs
+ def encode_with_template(example):
+ return self.template(example)
- def _prepare_input(self, inputs: InputFeatures):
+ return dataset.map(encode_with_template)
+
+ def _prepare_input(self, inputs: Dict):
return inputs
- def _save(self, output_dir: Optional[str] = None, state_dict=None):
- super()._save(output_dir, state_dict)
+ def _save(self,
+ output_dir: Optional[str] = None,
+ state_dict: Dict[str, Any] = None):
+ super(PromptTrainer, self)._save(output_dir, state_dict)
output_dir = output_dir if output_dir is not None else self.args.output_dir
if self.template:
- self.template.save_to(output_dir)
- if self.verbalizer:
- self.verbalizer.save_to(output_dir)
+ self.template.save(output_dir)
+ if self.verbalizer is not None:
+ self.verbalizer.save(output_dir)
- def load_state_dict_from_checkpoint(self, resume_from_checkpoint=None):
+ def load_state_dict_from_checkpoint(
+ self, resume_from_checkpoint: os.PathLike = None):
if resume_from_checkpoint is not None:
- self.template = AutoTemplate.load_from(
- resume_from_checkpoint, self.tokenizer,
- self.args.max_seq_length, self.plm,
- getattr(self.template, "_prompt_encoder", None),
- getattr(self.template, "encoder_hidden_size", None))
-
- super().load_state_dict_from_checkpoint(resume_from_checkpoint)
-
- def get_eval_dataloader(self, eval_dataset=None):
- """
- Return the evaluation [`~paddle.io.DataLoader`].
-
- Args:
- eval_dataset (`paddlenlp.datasets.MapDataset`):
- Created by `paddlenlp.prompt.load_dataset`,
- where every item is an InputExample object.
- """
- eval_dataset = self._map_dataset(eval_dataset)
- return super().get_eval_dataloader(eval_dataset)
+ self.template = AutoTemplate.load_from(resume_from_checkpoint,
+ self.tokenizer,
+ self.args.max_seq_length,
+ self._get_model())
+ super(PromptTrainer,
+ self).load_state_dict_from_checkpoint(resume_from_checkpoint)
def get_test_dataloader(self, test_dataset):
- """
- Return the test [`~paddle.io.DataLoader`].
-
- Args:
- test_dataset (`paddlenlp.datasets.MapDataset`):
- The test dataset created by `paddlenlp.prompt.load_dataset`,
- where every item is an InputExample object.
- """
test_dataset = self._map_dataset(test_dataset)
- return super().get_test_dataloader(test_dataset)
+ return super(PromptTrainer, self).get_test_dataloader(test_dataset)
def create_optimizer(self, lr_scheduler=None):
"""
@@ -178,20 +165,21 @@ def create_optimizer(self, lr_scheduler=None):
self.args)
plm_parameters = []
- if not self.model.freeze_plm:
+ if not self.args.freeze_plm:
plm_parameters.extend([
- p for p in self.model.plm.parameters()
+ p for p in self._get_model().plm.parameters()
if not p.stop_gradient
])
ppt_parameters = []
if self.template is not None:
ppt_parameters.extend([
- x for x in self.template.parameters() if not x.stop_gradient
+ x for n, x in self.template.named_parameters()
+ if not x.stop_gradient
])
if self.verbalizer is not None:
if isinstance(self.verbalizer, SoftVerbalizer):
- if not self.model.freeze_plm:
+ if not self.args.freeze_plm:
plm_parameters.extend([
p for n, p in self.verbalizer.non_head_parameters()
if not p.stop_gradient
@@ -203,7 +191,7 @@ def create_optimizer(self, lr_scheduler=None):
[p for n, p in self.verbalizer.parameters()])
decay_parameters = [
- p.name for n, p in self.model.named_parameters()
+ p.name for n, p in self._get_model().named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
]
apply_decay_param_fun = lambda x: x in decay_parameters
@@ -225,7 +213,16 @@ def create_optimizer(self, lr_scheduler=None):
else:
params = plm_parameters
else:
- lr = self.args.ppt_learning_rate
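+            # Prompt-only parameters use a dedicated scheduler driven by `ppt_learning_rate`.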
+ args = self.init_num_steps(self.args, len(self.train_dataset))
+ warmup = args.warmup_steps if args.warmup_steps > 0 else int(
+ args.warmup_ratio * args.num_training_steps)
+ self.lr_scheduler = get_scheduler(
+ args.lr_scheduler_type,
+ learning_rate=self.args.ppt_learning_rate,
+ num_warmup_steps=warmup,
+ num_training_steps=args.num_training_steps,
+ )
+ lr = self.lr_scheduler
params = ppt_parameters
self.optimizer = optim_cls(
@@ -243,20 +240,20 @@ def compute_loss(self, model, inputs, return_outputs=False):
Compute the total loss for every batch.
"""
if "labels" not in inputs:
- raise ValueError("Fail to compute loss as there are no labels "\
- "in {}.".format(inputs))
+ raise ValueError(
+ "Fail to compute loss as `labels` not in {}.".format(inputs))
labels = inputs["labels"]
- soft_token_ids = inputs.get("soft_token_ids", None)
- outputs, hidden_states = model(inputs["input_ids"],
- inputs["mask_ids"],
- soft_token_ids,
- return_hidden_states=True)
+ input_dict = inputs.copy()
+ input_dict["return_hidden_states"] = True
+ outputs, hidden_states = model(**input_dict)
+
if self.criterion is not None:
loss = self.criterion(outputs, labels)
if self.args.use_rdrop:
- loss = self._compute_rdrop_loss(model, inputs, outputs, loss)
+ loss = self._compute_rdrop_loss(model, input_dict, outputs,
+ loss)
if self.args.use_rgl:
loss += self._compute_rgl_loss(hidden_states, labels)
@@ -267,10 +264,10 @@ def compute_loss(self, model, inputs, return_outputs=False):
return (loss, outputs) if return_outputs else loss
- def _compute_rdrop_loss(self, model, inputs, outputs, loss):
- re_outputs = model(inputs["input_ids"], inputs["mask_ids"],
- inputs.get("soft_token_ids", None))
- ce_loss = (self.criterion(re_outputs, inputs["labels"]) + loss) * 0.5
+ def _compute_rdrop_loss(self, model, input_dict, outputs, loss):
+ re_outputs, _ = model(**input_dict)
+ labels = input_dict["labels"]
+ ce_loss = (self.criterion(re_outputs, labels) + loss) * 0.5
kl_loss = self.rdrop_criterion(outputs, re_outputs)
loss = ce_loss + self.args.alpha_rdrop * kl_loss
return loss
@@ -312,14 +309,11 @@ def _raw_equal(x, y):
return loss
- def export_model(self, export_path, input_spec=None, export_type="paddle"):
+ def export_model(self, export_path, input_spec, export_type="paddle"):
os.makedirs(export_path, exist_ok=True)
- self.template.save_to(export_path)
- self.verbalizer.save_to(export_path)
- if input_spec is None and hasattr(self.model, "get_input_spec"):
- input_spec = self.model.get_input_spec()
- if input_spec is None:
- raise ValueError("Please define input_spec to export model.")
+ self.template.save(export_path)
+ if self.verbalizer is not None:
+ self.verbalizer.save(export_path)
export_model(self.model, input_spec, export_path, export_type)
@@ -334,65 +328,62 @@ def __init__(self,
verbalizer=None,
freeze_plm: bool = False,
freeze_dropout: bool = False):
- super().__init__()
+ super(PromptModelForSequenceClassification, self).__init__()
self.plm = model
self.template = template
self.verbalizer = verbalizer
self.freeze_plm = freeze_plm
self.freeze_dropout = freeze_dropout
- if self.verbalizer is not None and hasattr(verbalizer, "process_model"):
- self.plm = self.verbalizer.process_model(self.plm)
if self.freeze_plm:
for param in self.plm.parameters():
param.stop_gradient = True
- if self.freeze_dropout:
- self.plm.eval()
+ if self.freeze_dropout:
+ self.plm.eval()
self.forward_keys = signature(self.plm.forward)
self._mask_token_id = self.template.tokenizer.mask_token_id
self._pad_token_id = self.template.tokenizer.pad_token_id
def forward(self,
- input_ids=None,
- mask_ids=None,
+ input_ids,
+ token_type_ids=None,
+ position_ids=None,
+ attention_mask=None,
+ masked_positions=None,
soft_token_ids=None,
+ encoder_ids=None,
**kwargs):
- return_hidden_states = kwargs.pop('return_hidden_states', False)
- if self.freeze_dropout:
- self.plm.eval()
- attention_mask = (input_ids != self._pad_token_id).astype("int64")
- inputs = InputFeatures(input_ids=input_ids,
- mask_ids=mask_ids,
- attention_mask=attention_mask,
- soft_token_ids=soft_token_ids)
- if hasattr(self.template, "process_batch"):
- inputs = self.template.process_batch(inputs)
+ input_dict = {
+ "input_ids": input_ids,
+ "token_type_ids": token_type_ids,
+ "position_ids": position_ids,
+ "masked_positions": masked_positions,
+ "soft_token_ids": soft_token_ids,
+ "attention_mask": attention_mask,
+ "encoder_ids": encoder_ids
+ }
+ input_dict = self.template.process_batch(input_dict)
model_inputs = {
- k: inputs[k]
- for k in inputs.keys(keep_none=True) if k in self.forward_keys
+ k: input_dict[k]
+ for k in input_dict if k in self.forward_keys
}
+ model_inputs["masked_positions"] = None
outputs = self.plm(**model_inputs)
- hidden_states = outputs
- if hasattr(self.template, "post_process_batch"):
- outputs = self.template.post_process_batch(outputs)
- if self.verbalizer and hasattr(self.verbalizer, "process_outputs"):
- outputs = self.verbalizer.process_outputs(outputs, inputs=inputs)
-
- if return_hidden_states:
- return outputs, hidden_states
+ if self.verbalizer is not None:
+ label_outputs = self.verbalizer.process_outputs(
+ outputs, input_dict["masked_positions"])
+ else:
+ label_outputs = outputs
+
+ if kwargs.pop('return_hidden_states', False):
+ return label_outputs, outputs
else:
- return outputs
+ return label_outputs
def prompt_parameters(self):
"""
Get the parameters of template and verbalizer.
"""
- return [p for p in self.template.parameters()
- ] + [p for p in self.verbalizer.parameters()]
-
- def get_input_spec(self):
- input_spec = [
- InputSpec(shape=[None, None], dtype="int64"), # input_ids
- InputSpec(shape=[None, None], dtype="int64"), # mask_ids
- InputSpec(shape=[None, None], dtype="int64") # soft_token_ids
- ]
- return input_spec
+ params = [p for p in self.template.parameters()]
+ if self.verbalizer is not None:
+ params += [p for p in self.verbalizer.parameters()]
+ return params
diff --git a/paddlenlp/prompt/prompt_utils.py b/paddlenlp/prompt/prompt_utils.py
index f51d64552ade..cbf00cf29bc7 100644
--- a/paddlenlp/prompt/prompt_utils.py
+++ b/paddlenlp/prompt/prompt_utils.py
@@ -1,217 +1,103 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from collections import defaultdict
-import json
+"""
+Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+This module defines the intermediate data structure of inputs.
+"""
+
import inspect
-from dataclasses import dataclass, field
+from typing import Any, Dict, List, Union, Optional
+from dataclasses import dataclass
import numpy as np
import paddle
-from ..utils.log import logger
-
-__all__ = ["InputExample", "InputFeatures"]
+from ..transformers.tokenizer_utils_base import (PretrainedTokenizerBase,
+ PaddingStrategy)
-@dataclass
-class InputExample(object):
- """Data structure of every example in datasets."""
- uid: str = field(default=None,
- metadata={'help': 'A unique identifier of the example.'})
- text_a: str = field(
- default=None,
- metadata={'help': 'The first text sequence in each example.'})
- text_b: str = field(
- default=None,
- metadata={'help': 'The other text sequences in each example.'})
- labels: int = field(default=None,
- metadata={'help': 'The label in each example.'})
- meta: dict = field(
- default=None,
- metadata={
- 'help': 'An optional dictionary of other data for each example.'
- })
-
- def __repr__(self):
- content = {k: v for k, v in self.__dict__.items() if v is not None}
- content = json.dumps(content, indent=2, sort_keys=True) + '\n'
- return str(content)
-
- def keys(self, keep_none=False):
- return [
- key for key in self.__dict__.keys()
- if getattr(self, key) is not None
- ]
-
-
-class InputFeatures(dict):
- """
- Data structure of every wrapped example or a batch of examples as the input of model.
-
- Args:
- input_ids (paddle.Tensor):
- The token ids.
- attention_mask (paddle.Tensor):
- The mask ids.
- token_type_ids (paddle.Tensor, optional):
- The token type ids.
- inputs_embeds (paddle.Tensor, optional):
- The embeddings of soft tokens.
- mask_ids (paddle.Tensor, optional):
- The mask ids where 1 denotes that a token is a mask, 0 denotes it is not a mask.
- labels (list, optional):
- The labels of classification task.
- uid (list, optional):
- The unique id(s) for example(s).
- """
- input_keys = [
- 'input_ids', 'attention_mask', 'token_type_ids', 'inputs_embeds', 'uid',
- 'labels', 'mask_ids', 'soft_token_ids'
- ]
- tensorable = [
- 'input_ids', 'attention_mask', 'token_type_ids', 'inputs_embeds',
- 'labels', 'mask_ids', 'soft_token_ids'
- ]
-
- def __init__(self,
- input_ids=None,
- attention_mask=None,
- token_type_ids=None,
- inputs_embeds=None,
- mask_ids=None,
- labels=None,
- uid=None,
- soft_token_ids=None):
- self.input_ids = input_ids
- self.attention_mask = attention_mask
- self.token_type_ids = token_type_ids
- self.inputs_embeds = inputs_embeds
- self.labels = labels
- self.mask_ids = mask_ids
- self.uid = uid
- self.soft_token_ids = soft_token_ids
-
- @classmethod
- def add_keys(cls, *args):
- cls.input_keys.extend(args)
-
- def keys(self, keep_none=False):
- if keep_none:
- return self.input_keys
- else:
- return [
- key for key in self.input_keys if getattr(self, key) is not None
- ]
-
- @property
- def tensorable_keys(self, keep_none=False):
- if keep_none:
- return self.tensorable
- else:
- return [
- key for key in self.tensorable if getattr(self, key) is not None
- ]
-
- @tensorable_keys.setter
- def tensorable_keys(self, keys):
- diff_keys = set(keys) - set(self.input_keys)
- if len(diff_keys) > 0:
- raise ValueError("{} not in predefined keys.".format(
- ["`%s`" % k for k in diff_keys].join(", ")))
- self.tensorable = keys
-
- def values(self, keep_none=False):
- return [getattr(self, key) for key in self.keys(keep_none=keep_none)]
-
- def items(self):
- return [(key, getattr(self, key)) for key in self.keys()]
-
- def __len__(self):
- return len(self.keys())
-
- def __repr__(self):
- content = {}
- for key, value in self.items():
- if isinstance(value, paddle.Tensor):
- value = value.numpy().tolist()
- elif isinstance(value, paddle.static.Variable):
- value = value.to_string(True)
- content[key] = value
- return str(json.dumps(content))
-
- def __getitem__(self, key):
- return getattr(self, key)
-
- def __iter__(self):
- return iter(self.keys())
-
- def __contains__(self, key, keep_none):
- return key in self.keys(keep_none)
-
- def __setitem__(self, key, value):
- if key not in self.input_keys:
- logger.warning(
- "`{}` is not a predefined key in InputFeatures. Perhaps it "\
- "brings unexpected results.".format(key))
- self.add_keys(key)
- setattr(self, key, value)
-
- def __eq__(self, other):
- if not isinstance(other, InputFeatures):
- return False
- if self.keys() != other.keys():
- return False
- for key in self.keys():
- value = getattr(self, key)
- other_value = getattr(other, key)
- if type(value) != type(other_value):
- return False
- if isinstance(value, paddle.Tensor):
- value = value.numpy()
- other_value = other_value.numpy()
- if isinstance(value, list):
- value = np.array(value)
- other_value = np.array(other_value)
- if not (value == other_value).all():
- return False
- return True
-
- def __hash__(self):
- return hash(self.__repr__())
-
- @classmethod
- def collate_fn(cls, batch):
- """Collate batch data in form of InputFeatures."""
- new_batch = {}
- for key in batch[0]:
- values = [b[key] for b in batch]
- if key in cls.tensorable:
- new_batch[key] = paddle.to_tensor(values)
- else:
- new_batch[key] = values
-
- return InputFeatures(**new_batch)
-
-
-def signature(fn):
+def signature(function):
"""
Obtain the input arguments of the given function.
"""
- sig = inspect.signature(fn)
+ sig = inspect.signature(function)
args = [
p.name for p in sig.parameters.values()
if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
]
return args
+
+
+@dataclass
+class PromptDataCollatorWithPadding:
+ """
+ Data collator that will group inputs by keywords and dynamically
+ pad the inputs to the longest sequence in the batch.
+
+ Args:
+        tokenizer (`paddlenlp.transformers.PretrainedTokenizer`):
+ The tokenizer used for encoding the data from PromptTokenizer.
+ """
+
+ tokenizer: PretrainedTokenizerBase
+ padding: Union[bool, str, PaddingStrategy] = True
+ max_length: Optional[int] = None
+ pad_to_multiple_of: Optional[int] = None
+ return_tensors: str = "pd"
+ return_attention_mask: Optional[bool] = None
+ default_model_input_names: List = ("input_ids", "token_type_ids",
+ "special_tokens_mask", "offset_mapping",
+ "position_ids")
+
+ def _convert_to_tensors(self, data):
+ if self.return_tensors == "np":
+ return np.array(data)
+ else:
+ return paddle.to_tensor(data)
+
+ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
+ batch = {}
+ for key in features[0]:
+ if key in self.default_model_input_names:
+ batch[key] = [b[key] for b in features]
+
+ batch = self.tokenizer.pad(
+ batch,
+ padding=self.padding,
+ max_length=self.max_length,
+ pad_to_multiple_of=self.pad_to_multiple_of,
+ return_tensors=self.return_tensors,
+ return_attention_mask=self.return_attention_mask)
+ max_length = batch["input_ids"].shape[1]
+ for key in features[0]:
+ if key not in self.default_model_input_names:
+ values = [b[key] for b in features]
+ if key == "masked_positions":
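+                    # Offset each sample's mask positions by its index so they remain valid in the flattened batch.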
+ new_values = []
+ for index, value in enumerate(values):
+ value = np.array(value) + index * max_length
+ new_values.extend(value.tolist())
+ values = new_values
+ elif key == "attention_mask":
+ new_values = np.zeros(
+ [len(values), 1, max_length, max_length])
+ for index, value in enumerate(values):
+ length = len(value)
+ new_values[index][0, :length, :length] = value
+ values = new_values
+ elif key != "labels":
+ for index, value in enumerate(values):
+ values[index] = value + [0] * (max_length - len(value))
+ batch[key] = self._convert_to_tensors(values)
+ return batch
diff --git a/paddlenlp/prompt/template.py b/paddlenlp/prompt/template.py
index 400101082d0c..b9b98addd436 100644
--- a/paddlenlp/prompt/template.py
+++ b/paddlenlp/prompt/template.py
@@ -1,463 +1,814 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+"""
+Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+This module provides prompt definition methods.
+"""
-from abc import abstractmethod
import os
import re
import json
+import traceback
+from abc import abstractmethod
+from typing import Any, Dict, List, Tuple, Optional, Union
+
+import numpy as np
import paddle
import paddle.nn as nn
+from paddle import Tensor
+from paddlenlp.utils.log import logger
+from paddlenlp.transformers import PretrainedTokenizer, PretrainedModel
-from .prompt_utils import InputExample, InputFeatures
from .prompt_tokenizer import MLMPromptTokenizer
-from ..utils.log import logger
-
-__all__ = ["Template", "ManualTemplate", "SoftTemplate", "AutoTemplate"]
-
-TEMPLATE_FILE = "template.json"
-
-
-def parse_template(inputs: str, part_start="{", part_end="}"):
- """ Parse items from the input template text. """
- parsed = []
- i_start = 0
- while i_start < len(inputs):
- space = ' ' if (i_start > 0 and inputs[i_start - 1] == ' ') else ''
- p = {"add_prefix_space": space}
- while i_start < len(inputs) and inputs[i_start] == ' ':
- p["add_prefix_space"] = ' '
- i_start += 1
- if i_start == len(inputs): break
-
- if inputs[i_start] == part_start:
- i_end = i_start + 1
- count_part = 1
- while i_end < len(inputs):
- if inputs[i_end] == part_end:
- count_part -= 1
- if count_part == 0: break
- elif inputs[i_end] == part_start:
- count_part += 1
- i_end += 1
- if i_end == len(inputs):
- raise ValueError(
- '{} at position {} has no corresponding {}'.format(
- part_start, i_start, part_end))
- try:
- part = eval('{%s}' % inputs[i_start + 1:i_end])
- if isinstance(part, set):
- part = {k: None for k in part}
- p.update(part)
- except:
- import traceback
- logger.error(traceback.format_exc())
- logger.error(
- 'syntax error in {}'.format(f"{inputs[i_start + 1:i_end]}"))
- exit()
- i_start = i_end + 1
- else:
- i_end = i_start + 1
- while i_end < len(inputs):
- if inputs[i_end] == part_start:
- break
- i_end += 1
- p['hard'] = inputs[i_start:i_end].rstrip(' ')
- i_start = i_end
- parsed.append(p)
- return parsed
+__all__ = [
+ "Template", "ManualTemplate", "SoftTemplate", "PrefixTemplate",
+ "AutoTemplate"
+]
+
+# Template used to be saved in a file.
+TEMPLATE_CONFIG_FILE = "template_config.json"
+TEMPLATE_PARAMETER_FILE = "template_state.pdparams"
+
+# Default values for some template attributes.
+DEFAULT_MAX_OPTIONS = 10
class Template(nn.Layer):
"""
- Base template class used to preprocess the inputs of model.
+ Base class for [`Template`].
Args:
- tokenizer (paddlenlp.transformers.PretrainedTokenizer):
- The tokenizer of pretrained models.
-
+ prompt (`str`):
+ A template string which defines how to combine text and prompt.
+ tokenizer (`PretrainedTokenizer`):
+ An instance of PretrainedTokenizer used for tokenization.
+ max_length (`int`):
+ If set to a number, it will limit the total sequence returned so
+ that it has a maximum length, including prompts.
"""
- registered_input_names = ['mask_ids', 'shortenable_ids']
- registered_text_keys = ['text_a', 'text_b']
-
- def __init__(self, tokenizer, max_seq_length):
- super().__init__()
+ template_special_tokens = [
+ "text", "hard", "soft", "soft_id", "prefix", "sep", "mask", "options"
+ ]
+ template_attributes = [
+ "length", "encoder", "position", "token_type", "hidden_size",
+ "add_omask", "add_prompt", "add_space"
+ ]
+ input_feature_names = ["do_truncate", "token_types", "positions"]
+ opt_token = "[OPT]"
+ omask_token = "[O-MASK]"
+
+ def __init__(self, prompt: str, tokenizer: PretrainedTokenizer,
+ max_length: int, **kwargs):
+ super(Template, self).__init__()
+ for key, value in kwargs.items():
+ setattr(self, key, value)
self.tokenizer = tokenizer
- self.wrapped_tokenizer = MLMPromptTokenizer(tokenizer, max_seq_length)
+ self.prompt_tokenizer = MLMPromptTokenizer(tokenizer, max_length)
+ self.prompt = prompt
@property
- def template(self):
- if not hasattr(self, '_template'):
- raise RuntimeError(
- 'Property template has not been set before used.')
- return self._template
-
- @template.setter
- def template(self, template):
- if template is None:
- return
- self._template = template
- self._process_template()
+ def prompt(self):
+ return self._prompt
+
+ @prompt.setter
+ def prompt(self, prompt: str):
+ if prompt is not None:
+ if isinstance(prompt, str):
+ self._prompt = self.parse_template_string(prompt)
+ else:
+ self._prompt = prompt
+ self._check_template_special_tokens()
+ self.example_keys = self.create_example_keys_from_prompt()
+ self.token_types = self.create_token_type_sequence_from_prompt()
+ self.do_truncate = self.create_truncation_sequence_from_prompt()
+ self.positions = self.create_position_sequence_from_prompt()
+ self.create_prompt_parameters()
@abstractmethod
- def _process_template(self):
- """ A hook to process template text when it is set. """
+ def create_prompt_parameters(self):
raise NotImplementedError
- def parse_inputs(self, inputs):
- return parse_template(inputs)
+ def _check_template_special_tokens(self):
+ valid_attr = self.template_special_tokens + self.template_attributes
+ prompt_attr = []
+ for part in self._prompt:
+ prompt_attr.extend(list(part.keys()))
+ if "add_prompt" in part:
+ opt_prompt = part["add_prompt"]
+ if self.opt_token not in opt_prompt:
+ raise ValueError("'{}' not found in option prompt.".format(
+ self.opt_token))
+ if "add_omask" in part:
+ self._check_omask_token()
+ diff_attr = set(prompt_attr) - set(valid_attr)
+ if len(diff_attr) > 0:
+ raise ValueError(
+ "Invalid attributes found in template: {}.".format(diff_attr))
+ return True
+
+ def _check_example_name(self, name: str, example: Dict[str, Any]):
+ if name not in example:
+ raise ValueError(
+ "Unexpected value in template. Can not find keyword {} in example: {}"
+ .format(name, example))
+ return True
+
+ def _check_omask_token(self):
+ omask_example = """
+ Add '[O-MASK]' to tokenizer to use `add_omask`.
+
+ Examples:
+
+ ```python
+ omask_dict = {"additional_special_tokens": ["[O-MASK]"]}
+ tokenizer.add_special_tokens(omask_dict)
+ model.resize_token_embeddings(len(tokenizer))
+ ```"""
+ if self.omask_token not in self.tokenizer.additional_special_tokens:
+ self.tokenizer.add_special_tokens(
+ {"additional_special_tokens": [self.omask_token]})
+ return True
+ raise ValueError(
+ "'{}' not found in tokenizer.".format(self.omask_token) +
+ omask_example)
+ return True
+
+ def build_inputs_with_prompt(
+ self,
+ example: Dict[str, Any],
+ prompt: Optional[List[Dict[str, Any]]] = None) -> List[str]:
+ """
+ Build input text sequences according to both prompt and example.
+
+ Args:
+ example (`Dict[str, Any]`):
+ A data sample with corresponding keys as `prompt`.
+ prompt (`Optional[List[Dict[str, Any]]]`):
+ A sequence of dictionary which defines positions of prompt,
+ input text and special tokens.
+ """
+ inputs = self._prompt.copy() if prompt is None else prompt.copy()
+
+ for index, part in enumerate(inputs):
+ if "text" in part:
+ self._check_example_name(part["text"], example)
+ inputs[index] = str(example[part["text"]])
+ elif "mask" in part:
+ if "length" not in part:
+ part["length"] = 1
+ inputs[index] = self.tokenizer.mask_token * part["length"]
+ elif "sep" in part:
+ inputs[index] = self.tokenizer.sep_token
+ elif "hard" in part:
+ inputs[index] = part["hard"]
+ elif "options" in part:
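+                # Expand candidate labels into text; `add_prompt` wraps each label into the given prompt and `add_omask` prepends an [O-MASK] token.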
+ if not isinstance(part["options"], list):
+ self._check_example_name(part["options"], example)
+ labels = example[part["options"]]
+ labels = [labels] if isinstance(labels, str) else labels
+ else:
+ labels = part["options"]
+ if "add_prompt" in part:
+ opt_prompt = part["add_prompt"]
+ labels = [
+ opt_prompt.replace(self.opt_token, x) for x in labels
+ ]
+ if "add_omask" in part:
+ labels = [self.omask_token + x for x in labels]
+ inputs[index] = "".join(labels)
+ else:
+ inputs[index] = part
- def get_default_mask_ids(self):
- """ List to denote whether an item in template is a mask token. """
- return [1 if 'mask' in p else 0 for p in self.template]
+ if "add_space" in part:
+ inputs[index] = " " + inputs[index]
+ return inputs
- def get_default_shortenable_ids(self):
- """ List to denote whther an item in template can be truncated. """
- idx = []
- for p in self.template:
- if 'shortenable' in p:
- idx.append(1 if p['shortenable'] else 0)
+ def create_token_type_sequence_from_prompt(
+ self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]:
+ prompt = self._prompt if prompt is None else prompt
+ last_token_type = 0
+ token_type_ids = []
+ for part in prompt:
+ if "token_type" in part:
+ last_token_type = part["token_type"]
+ token_type_ids.append(last_token_type)
+ return token_type_ids
+
+ def create_position_sequence_from_prompt(
+ self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]:
+ prompt = self._prompt if prompt is None else prompt
+ position_ids = []
+ for part in prompt:
+ if "position" in part:
+ position_ids.append(part["position"])
else:
- idx.append(1 if 'text' in p else 0)
- return idx
-
- def incorporate_template_text(self, example, template=None):
- """ Replace each item in template with real text. """
- inputs = template.copy(
- ) if self.template is None else self.template.copy()
-
- for i, p in enumerate(inputs):
- if 'text' in p:
- inputs[i] = p['add_prefix_space'] + getattr(example, p['text'])
- elif 'mask' in p:
- inputs[i] = self.tokenizer.mask_token
- elif 'hard' in p:
- inputs[i] = p['add_prefix_space'] + p['hard']
- elif 'sep' in p:
- inputs[i] = self.tokenizer.sep_token
+ position_ids.append(-1)
+ return position_ids
+
+ def create_truncation_sequence_from_prompt(
+ self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]:
+ prompt = self._prompt.copy() if prompt is None else prompt.copy()
+ do_truncate = []
+ for part in prompt:
+ if "truncate" in part:
+                do_truncate.append(part["truncate"])
+                prompt_tokens = set(part.keys()) - set(["text"])
+                if len(prompt_tokens) > 0 and part["truncate"]:
+ logger.warning("{} in template will be truncated, ".format(
+ prompt_tokens) + "which might degrade performance.")
+ elif "text" in part:
+ do_truncate.append(True)
else:
- raise ValueError('Can not parse {}'.format(p))
+ do_truncate.append(False)
+ return do_truncate
+
+ def create_example_keys_from_prompt(self):
+ example_keys = set()
+ for part in self._prompt:
+ if "text" in part:
+ example_keys.add(part["text"])
+ if "options" in part and isinstance(part["options"], list):
+ example_keys.update(set(part["options"]))
+ return example_keys
+
+ def encode(self, example: Dict[str, Any]):
+ input_text = self.build_inputs_with_prompt(example)
+ input_names, input_values = ["text"], [input_text]
+ for name in self.input_feature_names:
+ input_names.append(name)
+ input_values.append(getattr(self, name, None))
- return inputs
+ inputs = []
+ for value in list(zip(*input_values)):
+ inputs.append(dict(zip(input_names, value)))
- def wrap_one_example(self, example):
- """ Process InputExample according to the predefined template. """
- if self.template is None:
- raise ValueError('The template has not been initialized.')
- if isinstance(example, InputExample):
- text = self.incorporate_template_text(example)
-
- non_empty_keys = example.keys()
- for key in self.registered_text_keys:
- if key in non_empty_keys:
- non_empty_keys.remove(key)
-
- keys, values = ['text'], [text]
- for name in self.registered_input_names:
- keys.append(name)
- v = None
- if hasattr(self, name) and getattr(self, name) is not None:
- v = getattr(self, name)
- elif hasattr(self, 'get_default_' + name):
- v = getattr(self, 'get_default_' + name)()
- setattr(self, name, v)
- else:
- raise ValueError("""
- Template's part attribute '{}' is registered but not
- initialized. Try using template.{} = [...] to
- initialize or create a get_default_{}(self)
- method in your template.""".format(name, name, name))
- values.append(v)
-
- wrapped_parts_to_tokenize = []
- for value in list(zip(*values)):
- wrapped_parts_to_tokenize.append(dict(zip(keys, value)))
-
- wrapped_parts_not_to_tokenize = {
- key: getattr(example, key)
- for key in non_empty_keys
- }
- wrapped_parts_to_tokenize = self.wrapped_tokenizer(
- wrapped_parts_to_tokenize)
-
- return InputFeatures(**wrapped_parts_to_tokenize,
- **wrapped_parts_not_to_tokenize)
- else:
- raise TypeError('InputExample')
+ input_dict = self.prompt_tokenizer(inputs)
+ unused_example = {
+ k: v
+ for k, v in example.items() if k not in self.example_keys
+ }
- def process_batch(self, batch):
- return batch
+ return {**input_dict, **unused_example}
- def save_to(self, data_dir):
- with open(os.path.join(data_dir, TEMPLATE_FILE), "w") as f:
- json.dump(self.template, f)
+ def __call__(self, example: Dict[str, Any]):
+ return self.encode(example=example)
+ @abstractmethod
+ def process_batch(self, input_dict):
+ raise NotImplementedError
-class ManualTemplate(Template):
- """
- ManualTemplate for hard prompt methods, such as PET, EFL.
+ def save(self, save_path):
+ if not os.path.exists(save_path):
+ os.makedirs(save_path, exist_ok=True)
+ template_config_file = os.path.join(save_path, TEMPLATE_CONFIG_FILE)
+ with open(template_config_file, "w", encoding="utf-8") as fp:
+ fp.write(json.dumps(self._prompt, ensure_ascii=False))
+ template_param_file = os.path.join(save_path, TEMPLATE_PARAMETER_FILE)
+ template_state_dict = self.state_dict()
+ if len(template_state_dict) > 0:
+ paddle.save(template_state_dict, template_param_file)
+
+ @staticmethod
+ def extract_template_keywords(prompt: List[Dict[str, Any]]):
+ keywords = set()
+ for part in prompt:
+ keywords.update(part.keys())
+ return keywords
+
+ @staticmethod
+ def parse_template_string(prompt: str,
+ left_token: Optional[str] = "{",
+ right_token: Optional[str] = "}"):
+ """
+ Parse the defined string as a sequence of dictionaries.
- Args:
- tokenizer (paddlenlp.transformers.PretrainedTokenizer):
- The same as `Template`.
- template (str | list):
- It describes how to combine text and prompts. For example,
- `str`: "{'text':'text_a'} It is {'mask'}." or a corresponding
- list of dictionary/set parsed by `parse_template` method.
- """
+ Args:
+ prompt: A string comprised of nestable {}, [], integers and strings.
- def __init__(self, tokenizer, max_seq_length, template=None):
- super().__init__(tokenizer=tokenizer, max_seq_length=max_seq_length)
- self.template = template
+ Returns:
+ A list of dictionaries corresponding to the input string.
+
+ For example, if we define `prompt` as
- def _process_template(self):
- if isinstance(self._template, str):
- self._template = self.parse_inputs(self._template)
+ "{'text': 'hypothesis'}基于这一假设{'mask'}推断出{'options': 'label.txt'}",
+ then this function returns
-class SoftTemplate(Template):
+ [{"text": "hypothesis"}, {"hard": "基于这一假设"}, {"mask": null},
+ {"hard": "推断出"}, {"options": ["正确", "错误"]}].
+
+ Raises:
+ ValueError: A error occurred parsing an string with unmatched punctuations.
+ """
+ left_stack = []
+ parsed = []
+ index = 0
+ while index < len(prompt):
+ # Delete extra spaces.
+ part = {"add_space": " "} if prompt[index] == " " else {}
+ while index < len(prompt) and prompt[index] == " ":
+ index += 1
+ if index == len(prompt):
+ break
+ # Parse blocks with paired tokens like "{ }".
+ if prompt[index] == left_token:
+ left_index = index
+ while index < len(prompt):
+ if prompt[index] == left_token:
+ left_stack.append(index)
+ elif prompt[index] == right_token:
+ left_stack.pop()
+ if len(left_stack) == 0:
+ break
+ index += 1
+ if index == len(prompt) and len(left_stack) > 0:
+ raise ValueError(
+ "{} at position {} has no corresponding {}".format(
+ left_token, left_index, right_token))
+ try:
+ part_dict = eval(prompt[left_index:index + 1])
+ if isinstance(part_dict, set):
+ part_dict = {k: None for k in part_dict}
+ part.update(part_dict)
+ except SyntaxError as error:
+ logger.error(traceback.format_exc())
+ exit()
+ index += 1
+ # Parse simplified discrete prompts.
+ else:
+ left_index = index
+ while index < len(prompt) and prompt[index] != left_token:
+ index += 1
+ part["hard"] = prompt[left_index:index].rstrip(" ")
+
+ if "options" in part:
+ if os.path.isfile(part["options"]):
+ with open(part["options"], "r") as fp:
+ labels = [x.strip() for x in fp]
+ part["options"] = labels
+ part["length"] = len(labels)
+ elif "length" not in "options":
+ part["length"] = DEFAULT_MAX_OPTIONS
+ logger.warning(
+ "[options]: The maximum number of options not defined,"
+ " set as {} by default.".format(DEFAULT_MAX_OPTIONS))
+ if "length" in part:
+ assert part["length"] > 0
+ if "hard" in part:
+ logger.warning(
+ "Ignore `length` attribute for keyword `hard`.")
+ if "position" in part:
+ assert part["position"] >= 0
+ if "token_type" in part:
+ assert part["token_type"] in (0, 1)
+ parsed.append(part)
+ return parsed
+
+
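For reference, a minimal sketch of what the parser above produces; the prompt string is made up, and the expected result follows the docstring of `parse_template_string`:

```python
# Hypothetical sketch of Template.parse_template_string (not part of the patch).
from paddlenlp.prompt.template import Template

prompt = "{'text': 'text_a'}这句话表达了{'mask'}的情感。"
parts = Template.parse_template_string(prompt)
# Expected, per the docstring above:
# [{'text': 'text_a'}, {'hard': '这句话表达了'}, {'mask': None}, {'hard': '的情感。'}]
```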
+class ManualTemplate(Template):
"""
- SoftTemplate on the input layer for soft prompt methods, such as p-tuning.
+ ManualTemplate for discrete prompt methods, such as PET, EFL.
Args:
- tokenizer (paddlenlp.transformers.PretrainedTokenizer):
- The same as `Template`.
- template (str | list):
- It describes how to combine text with both manual and soft prompts.
- prompt_encoder (str):
- The encoder to project soft embeddings. Support `lstm` and 'mlp'.
- Use soft embeddings directly when prompt_encoder is `None`.
+ prompt (`str`):
+ A template string which defines how to combine text and prompt.
+ tokenizer (`PretrainedTokenizer`):
+ An instance of PretrainedTokenizer used for tokenization.
+ max_length (`int`):
+ If set to a number, it will limit the total sequence returned so
+ that it has a maximum length, including prompts.
"""
- registered_input_names = ['soft_token_ids', 'mask_ids', 'shortenable_ids']
+ template_special_tokens = ["text", "hard", "sep", "mask", "options"]
+ template_attributes = [
+ "length", "position", "token_type", "add_prompt", "add_space",
+ "add_omask"
+ ]
- def __init__(self,
- tokenizer,
- max_seq_length,
- model=None,
- template=None,
- prompt_encoder=None,
- encoder_hidden_size=None):
- super().__init__(tokenizer=tokenizer, max_seq_length=max_seq_length)
- if model is None:
- self.token_embeddings = None
- logger.warning(
- "SoftTemplate: The pretrained model is not given. It would "
- "lead to error unless it is initialized for deployment.")
- else:
- if type(model).__name__.endswith('Model'):
- self.token_embeddings = model.embeddings.word_embeddings
- else:
- for module in model.children():
- if type(module).__name__.endswith('Model'):
- self.token_embeddings = module.embeddings.word_embeddings
- break
- self.token_embeddings.weight.stop_gradient = True
- self.embedding_size = self.token_embeddings.weight.shape[-1]
- self.encoder_hidden_size = encoder_hidden_size
- if self.encoder_hidden_size is not None and prompt_encoder is None:
- logger.warning("`prompt_encoder` is not set yet. Use MLP for "
- "soft embeddings' projection by default.")
- prompt_encoder = "mlp"
- self.prompt_encoder = prompt_encoder
- self.template = template
-
- def _process_template(self):
- if isinstance(self._template, str):
- self._template = self.parse_inputs(self._template)
- self.parse_soft_tokens()
- self.generate_parameters()
+ def __init__(self, prompt: str, tokenizer: PretrainedTokenizer,
+ max_length: int):
+ super(ManualTemplate, self).__init__(prompt, tokenizer, max_length)
- @property
- def prompt_encoder(self):
- return self._prompt_encoder
+ def create_prompt_parameters(self):
+ return None
- @prompt_encoder.setter
- def prompt_encoder(self, prompt_encoder):
+ def process_batch(self, input_dict):
+ return input_dict
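A usage sketch for the discrete case (text and model name are illustrative); `AutoTemplate.create_from`, defined later in this file, returns a `ManualTemplate` when the prompt contains no `soft` or `prefix` parts:

```python
# Hypothetical usage sketch for ManualTemplate (not part of the patch).
from paddlenlp.prompt import AutoTemplate
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
template = AutoTemplate.create_from(
    prompt="{'text': 'text_a'}这句话表达了{'mask'}的情感。",
    tokenizer=tokenizer,
    max_length=128)  # no 'soft'/'prefix' keywords, so a ManualTemplate is returned
encoded = template({"text_a": "这家店的服务很贴心", "labels": 0})
# `encoded` holds the tokenized features plus the keys the template does not
# consume (e.g. "labels"), as implemented in `Template.encode` above.
```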
- if prompt_encoder is None:
- return None
- if getattr(self, "_prompt_encoder", None) is not None:
- logger.warning(
- f"Encoder has already set as {self._prompt_encoder}, change " +
- "`prompt_encoder` will reset parameters.")
+class SoftLSTM(nn.Layer):
+ """
+ LSTM encoder for soft token embeddings.
+ """
- self._prompt_encoder = prompt_encoder
+ def __init__(self, input_size, hidden_size, output_size, activation):
+ super(SoftLSTM, self).__init__()
+ self.lstm = nn.LSTM(input_size=input_size,
+ hidden_size=hidden_size,
+ num_layers=2,
+ direction='bidirect',
+ time_major=False)
+ self.mlp = nn.Sequential(nn.Linear(2 * hidden_size,
+ hidden_size), activation,
+ nn.Linear(hidden_size, output_size))
- if self.encoder_hidden_size is None:
- hidden_size = self.embedding_size
- else:
- hidden_size = self.encoder_hidden_size
- if prompt_encoder == 'lstm':
- self.lstm_head = nn.LSTM(input_size=self.embedding_size,
- hidden_size=hidden_size,
- num_layers=2,
- direction='bidirect',
- time_major=False)
- self.mlp_head = nn.Sequential(
- nn.Linear(2 * hidden_size, hidden_size), nn.ReLU(),
- nn.Linear(hidden_size, self.embedding_size))
- elif prompt_encoder == 'mlp':
- self.mlp_head = nn.Sequential(
- nn.Linear(self.embedding_size, hidden_size), nn.ReLU(),
- nn.Linear(hidden_size, self.embedding_size))
- if hasattr(self, "lstm_head"):
- delattr(self, "lstm_head")
- else:
- raise ValueError(
- "Unsupported soft token encoder: {}".format(prompt_encoder))
-
- def incorporate_template_text(self, example, template=None):
- """ Replace each item in template with real text. """
- inputs = template.copy(
- ) if self.template is None else self.template.copy()
-
- for i, p in enumerate(inputs):
- if 'text' in p:
- inputs[i] = p['add_prefix_space'] + getattr(example, p['text'])
- elif 'mask' in p:
- inputs[i] = self.tokenizer.mask_token
- elif 'hard' in p:
- inputs[i] = p['add_prefix_space'] + p['hard']
- elif 'soft' in p:
- inputs[i] = p['add_prefix_space'] + p['soft']
- elif 'sep' in p:
- inputs[i] = self.tokenizer.sep_token
- else:
- raise ValueError('can not parse {}'.format(p))
+ def forward(self, embeds):
+ hidden_states, _ = self.lstm(embeds)
+ return self.mlp(hidden_states)
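A quick shape check for this encoder (sizes are illustrative; 768 is the hidden size of ernie-3.0-base-zh):

```python
# Hypothetical shape sketch for SoftLSTM (not part of the patch).
import paddle
import paddle.nn as nn
from paddlenlp.prompt.template import SoftLSTM

encoder = SoftLSTM(input_size=768, hidden_size=768, output_size=768,
                   activation=nn.ReLU())
soft_embeds = paddle.randn([2, 5, 768])  # [batch_size, num_soft_tokens, embed_size]
print(encoder(soft_embeds).shape)        # [2, 5, 768]
```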
- return inputs
- def parse_soft_tokens(self):
- inputs = []
- soft_token_ids = []
- num_soft_token = 0
- soft2word_init = {}
- soft_id_reindex = {}
+class SoftTemplate(Template):
+ """
+ SoftTemplate for continuous prompt methods on the input layer.
- for part in self._template:
- if 'soft' not in part and 'soft_id' not in part:
- soft_token_ids.append(0)
- inputs.append(part)
- continue
+ Args:
+ prompt (`str`):
+ A template string which defines how to combine text and prompt.
+ tokenizer (`PretrainedTokenizer`):
+ An instance of PretrainedTokenizer used for tokenization.
+ max_length (`int`):
+ If set to a number, it will limit the total sequence returned so
+ that it has a maximum length, including prompts.
+ word_embeddings (`Tensor`):
+ The word embeddings of pretrained models, which can be obtained by
+ calling `model.get_input_embeddings().weight`.
+ soft_embeddings (`Tensor`):
+ The embeddings of soft tokens, which overwrites `word_embeddings`
+ as initial weights when defined.
+ """
+ template_special_tokens = [
+ "text", "hard", "soft", "soft_id", "sep", "mask", "options"
+ ]
+ input_feature_names = [
+ "do_truncate", "token_types", "positions", "soft_tokens", "encoder_ids"
+ ]
- if 'soft' in part and part['soft'] is not None:
- if 'duplicate' in part:
- logger.warning(
- 'Ignore ``duplicate``. It is '
- 'incompatible with ``soft`` with text values.')
+ def __init__(self,
+ prompt: str,
+ tokenizer: PretrainedTokenizer,
+ max_length: int,
+ word_embeddings: Tensor,
+ soft_embeddings: Tensor = None):
+ super(SoftTemplate, self).__init__(prompt,
+ tokenizer,
+ max_length,
+ word_embeddings=word_embeddings,
+ soft_embeddings=soft_embeddings)
+
+ def named_parameters(self):
+ named_params = [(n, p)
+ for n, p in self.soft_embeddings.named_parameters()]
+ named_params.extend([(n, p)
+ for n, p in self.encoder_list.named_parameters()])
+ return named_params
+
+ def parameters(self):
+ return [p for n, p in self.named_parameters()]
+
+ def create_prompt_parameters(self):
+ self._prompt, soft_token_config = self.parse_soft_prompt()
+ self.embed_size = self.word_embeddings.weight.shape[1]
+ soft2word, self.soft_tokens, self.num_soft_token = soft_token_config
+ self._init_soft_parameters(soft2word)
+ self.encoder_ids, self.encoder_list = self._create_soft_encoders()
+
+ def process_batch(self, input_dict: Dict[str, Tensor]) -> Dict[str, Tensor]:
+ """
+ Convert input_ids to inputs_embeds.
+
+        Soft tokens are looked up in `soft_embeddings` and passed through the
+        predefined encoders; all other tokens use the word embeddings of the
+        pretrained model.
+ """
+ word_embeds = self.word_embeddings(input_dict["input_ids"])
+ if "attention_mask" not in input_dict or input_dict[
+ "attention_mask"] is None:
+ pad_token_id = self.tokenizer.pad_token_id
+ attention_mask = paddle.unsqueeze(
+ (input_dict["input_ids"] == pad_token_id).astype("float32") *
+ -1e4,
+ axis=[1, 2])
+ input_dict["attention_mask"] = attention_mask
+ input_dict["input_ids"] = None
+ soft_embeds = self.soft_embeddings(input_dict["soft_token_ids"])
+ soft_shape = soft_embeds.shape
+ soft_embeds = soft_embeds.reshape([-1, soft_shape[-1]])
+ for encoder_id in range(1, len(self.encoder_list)):
+ to_encode = paddle.where(input_dict["encoder_ids"] == encoder_id)
+ to_encode = to_encode[0] * soft_shape[1] + to_encode[1]
+ to_encode = to_encode.squeeze(1)
+ to_encode_embeds = soft_embeds[to_encode]
+ to_encode_embeds = to_encode_embeds.reshape(
+ [soft_shape[0], -1, soft_shape[-1]])
+ encoder = self.encoder_list[encoder_id]
+ encoded = encoder(to_encode_embeds)
+ encoded = encoded.reshape([-1, soft_shape[-1]])
+ soft_embeds = paddle.scatter(soft_embeds, to_encode, encoded)
+ soft_embeds = soft_embeds.reshape([soft_shape[0], -1, soft_shape[-1]])
+ soft_token_ids = input_dict["soft_token_ids"].unsqueeze(2)
+ input_dict["inputs_embeds"] = paddle.where(soft_token_ids > 0,
+ soft_embeds, word_embeds)
+ return input_dict
+
+ def parse_soft_prompt(self):
+ """
+ Unify the form of continuous prompts as {"soft": "xxx"} and create
+        the continuous token id sequence for each part of the template.
+
+ Returns:
+            `List[Dict[str, str]]`: Template with continuous prompt formatted as {"soft": "xxx"}.
+ `Tuple[Dict[int, int], List[List[int]], int]`:
+ - Mapping from continuous ids to word ids for initialization.
+            - Continuous ids for each part. Id 0 denotes a non-continuous part.
+            - Number of unique continuous tokens.
+ """
+ prompt = self._prompt.copy()
+ num_soft_token = 1
+ soft_prompt = []
+ soft_token_ids = []
+ soft2word = {}
+ soft_id_reindex = {}
- # Get word tokens and ids for soft token initialization.
- init_token_ids = self.tokenizer(
- part['add_prefix_space'] + part['soft'],
+ for part in prompt:
+ part_prompt = None
+ # Copy non-continuous prompt part.
+ if "soft" not in part and "soft_id" not in part:
+ soft_prompt.append(part)
+ soft_token_ids.append(None)
+
+ # Deal with continuous prompt with specific initialization.
+ elif "soft" in part and part["soft"] is not None:
+
+ # Get word tokens for initialization.
+ if "add_space" in part:
+ part["soft"] = part["add_space"] + part["soft"]
+ word_token_ids = self.tokenizer(
+ part["soft"],
add_special_tokens=False,
- return_token_type_ids=False)['input_ids']
- init_tokens = self.tokenizer.convert_ids_to_tokens(
- init_token_ids)
- assert len(init_tokens) == len(init_token_ids)
-
- # Create soft ids and corresponding ``soft`` part in template.
- next_num_soft = num_soft_token + 1
- num_soft_token += len(init_tokens)
- id_list = list(range(next_num_soft, num_soft_token + 1))
-
- soft_token_ids.extend(id_list)
- inputs.extend([{
- 'add_prefix_space': part['add_prefix_space'],
- 'soft': token
- } for token in init_tokens])
- for soft_id, word_id in zip(id_list, init_token_ids):
- soft2word_init[soft_id] = word_id
-
- # Check the ids of ``soft`` and ``soft_id``.
- if 'soft_id' in part:
- if part['soft_id'] in soft_id_reindex:
- assert id_list == soft_id_reindex[part['soft_id']]
+ return_token_type_ids=False)["input_ids"]
+
+ # Create continuous token ids.
+ soft_id_list = list(
+ range(num_soft_token, num_soft_token + len(word_token_ids)))
+ num_soft_token += len(word_token_ids)
+
+ for soft_id, word_id in zip(soft_id_list, word_token_ids):
+ soft2word[soft_id] = word_id
+
+ # Check `length` if exists.
+ if "length" in part:
+ if part["length"] < len(word_token_ids):
+ logger.warning("Ignore `length` because it is less than"
+ " the length of defined word sequence.")
+ elif part["length"] > len(word_token_ids):
+ length = part["length"] - len(word_token_ids)
+ soft_id_list += list(
+ range(num_soft_token, num_soft_token + length))
+ num_soft_token += length
+ part["soft"] += self.tokenizer.unk_token * length
+
+ soft_token_ids.append(soft_id_list)
+ part_prompt = {"soft": part["soft"]}
+
+ # Check or record `soft_id` if exists.
+ if "soft_id" in part:
+ if part["soft_id"] in soft_id_reindex:
+ assert soft_id_list == soft_id_reindex[part["soft_id"]]
else:
- soft_id_reindex[part['soft_id']] = id_list
- continue
-
- if 'soft_id' in part and part['soft_id'] in soft_id_reindex:
- if 'duplicate' in part:
- logger.warnings('Ignore ``duplicate``. Initialize '
- '``soft`` by ``soft_id`` directly.')
- id_list = soft_id_reindex[part['soft_id']]
-
- elif 'duplicate' in part:
- assert isinstance(part['duplicate'], int)
- if 'same' in part:
- num_soft_token += 1
- id_list = [num_soft_token for _ in range(part['duplicate'])]
- else:
- next_num_soft = num_soft_token + 1
- num_soft_token += part['duplicate']
- id_list = list(range(next_num_soft, num_soft_token + 1))
+ soft_id_reindex[part["soft_id"]] = soft_id_list
+
+            # Deal with continuous prompt defined by `soft_id`.
+ elif "soft_id" in part and part["soft_id"] in soft_id_reindex:
+ soft_id_list = soft_id_reindex[part["soft_id"]]
+ if "length" in part:
+ logger.warning("Ignore `length` because it is incompatible"
+ " with existing `soft_id`.")
+ soft_token_ids.append(soft_id_list)
+ part_prompt = {
+ "soft": [self.tokenizer.unk_token] * len(soft_id_list)
+ }
+
+            # Deal with continuous prompt with random initialization.
else:
- num_soft_token += 1
- id_list = [num_soft_token]
-
- if 'soft_id' in part:
- soft_id_reindex[part['soft_id']] = id_list
+ if "length" not in part:
+ part["length"] = 1
+ soft_id_list = list(
+ range(num_soft_token, num_soft_token + part["length"]))
+ num_soft_token += part["length"]
+ soft_token_ids.append(soft_id_list)
+ if "soft_id" in part:
+ soft_id_reindex[part["soft_id"]] = soft_id_list
+ part_prompt = {
+ "soft": [self.tokenizer.unk_token] * len(soft_id_list)
+ }
+ if part_prompt is not None:
+ for key in part:
+ if key not in ["soft", "soft_id", "length", "add_space"]:
+ part_prompt[key] = part[key]
+ soft_prompt.append(part_prompt)
+
+ if num_soft_token == 1:
+ raise ValueError("Soft prompt expected for SoftTemplate, but"
+ " get {}.".format(self._prompt))
+
+ soft_token_config = (soft2word, soft_token_ids, num_soft_token)
+
+ return soft_prompt, soft_token_config
+
+ def _init_soft_parameters(self, soft2word: Dict[int, int]):
+ if self.soft_embeddings is not None:
+ if self.soft_embeddings.weight.shape[0] != self.num_soft_token:
+ raise ValueError(
+ "Given soft embeddings are incompatible with those "
+ "defined in template \"{}\"".format(self._prompt))
+ else:
+ self.soft_embeddings = nn.Embedding(self.num_soft_token,
+ self.embed_size)
+ weight = self.soft_embeddings.weight.clone().detach()
+ for soft_id, word_id in soft2word.items():
+ word_id = paddle.to_tensor(word_id)
+ weight[soft_id] = self.word_embeddings(word_id)
+ self.soft_embeddings.weight.set_value(weight)
+
+ def _create_soft_encoders(self,
+ output_size: int = None,
+ activation: nn.Layer = None):
+ encoder_list = [nn.Identity()]
+ encoder2id = {}
+ encoder_ids = []
+ output_size = self.embed_size if output_size is None else output_size
+ activation = nn.ReLU() if activation is None else activation
+ for part in self._prompt:
+ if "encoder" not in part or part["encoder"] is None:
+ encoder_ids.append(0)
+ else:
+ if part["encoder"] not in encoder2id:
+ encoder2id[part["encoder"]] = len(encoder_list)
+ encoder_ids.append(len(encoder_list))
+ if "hidden_size" in part:
+ hidden_size = part["hidden_size"]
+ else:
+ hidden_size = self.embed_size
+ if part["encoder"] == "lstm":
+ encoder_list.append(
+ SoftLSTM(self.embed_size, hidden_size, output_size,
+ activation))
+ elif part["encoder"] == "mlp":
+ encoder_list.append(
+ nn.Sequential(
+ nn.Linear(self.embed_size,
+ hidden_size), activation,
+ nn.Linear(hidden_size, output_size)))
+ else:
+ raise ValueError("Encoder {} not supported.".format(
+ part["encoder"]))
+ else:
+ encoder_ids.append(encoder2id[part["encoder"]])
+ encoder_list = nn.LayerList(encoder_list)
+ return encoder_ids, encoder_list
+
+ def build_inputs_with_prompt(
+ self,
+ example: Dict[str, Any],
+ prompt: Optional[List[Dict[str, Any]]] = None) -> List[str]:
+ inputs = super(SoftTemplate,
+ self).build_inputs_with_prompt(example, prompt)
+ for index, part in enumerate(inputs):
+ if isinstance(part, dict) and "soft" in part:
+ inputs[index] = part["soft"]
+ return inputs
- soft_token_ids.extend(id_list)
- inputs.extend([{
- 'add_prefix_space': part['add_prefix_space'],
- 'soft': self.tokenizer.cls_token
- } for _ in range(len(id_list))])
+ def save(self, save_path):
+ super(SoftTemplate, self).save(save_path)
+ template_param_file = os.path.join(save_path, TEMPLATE_PARAMETER_FILE)
+ paddle.save(self.state_dict(), template_param_file)
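A usage sketch for the continuous case (values are illustrative); `AutoTemplate.create_from` below builds a `SoftTemplate` from the model's input embeddings whenever the prompt contains `soft` parts:

```python
# Hypothetical usage sketch for SoftTemplate (not part of the patch).
from paddlenlp.prompt import AutoTemplate
from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")

# "情感分析:" initializes the soft embeddings from the corresponding word
# embeddings; a part without text, e.g. {'soft': None, 'length': 4,
# 'encoder': 'lstm'}, would be randomly initialized and passed through an
# LSTM encoder instead.
template = AutoTemplate.create_from(
    prompt="{'soft': '情感分析:'}{'text': 'text_a'}{'mask'}",
    tokenizer=tokenizer,
    max_length=128,
    model=model)
```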
- self._template = inputs
- self.soft_token_ids = soft_token_ids
- self.num_soft_token = num_soft_token
- self.soft2word_init = soft2word_init
- if self.num_soft_token == 0:
- logger.warning('No soft tokens in template. '\
- 'Use ManualTemplate for better performance.')
+class PrefixTemplate(SoftTemplate):
+ """
+ PrefixTemplate for continuous prompt methods on every layer.
- def generate_parameters(self):
- """
- Generate parameters for soft tokens.
- """
- if self.num_soft_token == 0 or self.token_embeddings is None:
- return None
- self.soft_embeddings = nn.Embedding(self.num_soft_token + 1,
- self.embedding_size)
-
- weight = self.soft_embeddings.weight.clone().detach()
- for soft_id, word_id in self.soft2word_init.items():
- weight[soft_id] = self.token_embeddings(paddle.to_tensor(word_id))
- self.soft_embeddings.weight.set_value(weight)
-
- def process_batch(self, batch):
- word_embeds = self.token_embeddings(batch["input_ids"])
- batch["input_ids"] = None
- if not hasattr(self,
- "soft_embeddings") or batch["soft_token_ids"] is None:
- batch["inputs_embeds"] = word_embeds
- else:
- soft_embeds = self.soft_embeddings(batch["soft_token_ids"])
- if hasattr(self, "lstm_head"):
- soft_embeds = self.lstm_head(soft_embeds)[0]
- if hasattr(self, "mlp_head"):
- soft_embeds = self.mlp_head(soft_embeds)
+ Args:
+ prompt (`str`):
+ A template string which defines how to combine text and prompt.
+ tokenizer (`PretrainedTokenizer`):
+ An instance of PretrainedTokenizer used for tokenization.
+ max_length (`int`):
+ If set to a number, it will limit the total sequence returned so
+ that it has a maximum length, including prompts.
+ model (`PretrainedModel`):
+ An instance of PretrainedModel.
+ """
+ template_special_tokens = [
+ "text", "hard", "prefix", "soft", "sep", "mask", "options"
+ ]
+ input_feature_names = [
+ "do_truncate", "token_types", "positions", "soft_tokens", "encoder_ids"
+ ]
- inputs_embeds = paddle.where(
- (batch["soft_token_ids"] > 0).unsqueeze(-1), soft_embeds,
- word_embeds)
- batch["inputs_embeds"] = inputs_embeds
- return batch
+ def __init__(self,
+ prompt: str,
+ tokenizer: PretrainedTokenizer,
+ max_length: int,
+ model: PretrainedModel,
+ prefix_dropout: float = 0.1):
+ self.n_layer, self.n_heads = self._get_config(model)
+        super(PrefixTemplate, self).__init__(prompt, tokenizer, max_length,
+                                             model.get_input_embeddings())
+ self.dropout = nn.Dropout(p=prefix_dropout)
+
+ @staticmethod
+ def _get_config(model):
+ names = [n for n, p in model.named_parameters() if "layers" in n]
+ pattern = re.compile(r".*?\.(\d+)\..*?")
+ indices = []
+ for name in names:
+ result = pattern.match(name)
+ if result is not None:
+ indices.append(int(result.group(1)))
+ num_layer = max(indices) + 1
+ layer_names = names[0].split(".")[:-2]
+ layer = model
+ for name in layer_names:
+ layer = getattr(layer, name)
+ num_heads = layer.num_heads
+
+ return num_layer, num_heads
+
+ def parse_soft_prompt(self):
+ prompt = self._prompt.copy()
+
+ for index, part in enumerate(prompt):
+ if "soft" in part:
+ raise ValueError("Keyward `soft` should not be used in "
+ "PrefixTemplate.")
+ if "prefix" not in part:
+ continue
+ if index != 0:
+ raise ValueError("Keyword `prefix` should locate at the "
+ "beginning of template.")
+ part["soft"] = part["prefix"]
+ part.pop("prefix")
+ prompt[index] = part
+
+ self._prompt = prompt
+ return super(PrefixTemplate, self).parse_soft_prompt()
+
+ def process_batch(self, input_dict: Dict[str, Tensor]) -> Dict[str, Tensor]:
+ word_embeds = self.word_embeddings(input_dict["input_ids"])
+ if "attention_mask" not in input_dict or input_dict[
+ "attention_mask"] is None:
+ pad_token_id = self.tokenizer.pad_token_id
+ attention_mask = paddle.unsqueeze(
+ (input_dict["input_ids"] == pad_token_id).astype("float32") *
+ -1e4,
+ axis=[1, 2])
+ input_dict["attention_mask"] = attention_mask
+ input_dict["input_ids"] = None
+
+ batch_size, _ = input_dict["soft_token_ids"].shape
+ soft_token_ids = paddle.masked_select(input_dict["soft_token_ids"],
+ input_dict["soft_token_ids"] > 0)
+ soft_token_ids = soft_token_ids.reshape([batch_size, -1])
+ _, soft_len = soft_token_ids.shape
+
+ input_dict["inputs_embeds"] = word_embeds[:, soft_len:, :]
+
+ soft_embeds = self.soft_embeddings(soft_token_ids)
+ for encoder_id in range(1, len(self.encoder_list)):
+ to_encode = paddle.where(input_dict["encoder_ids"] == encoder_id)
+ encoded = self.encoder_list[encoder_id](to_encode)
+ soft_embeds = paddle.where(input_dict["encoder_ids"] == encoder_id,
+ encoded, soft_embeds)
+ soft_embeds = soft_embeds.reshape([
+ batch_size, soft_len, self.n_layer * 2, self.n_heads,
+ self.embed_size // self.n_heads
+ ])
+ soft_embeds = self.dropout(soft_embeds)
+ soft_embeds = paddle.transpose(soft_embeds, perm=[2, 0, 3, 1, 4])
+ soft_embeds = paddle.split(soft_embeds, num_or_sections=self.n_layer)
+ soft_embeds = [paddle.split(emb, 2) for emb in soft_embeds]
+ soft_embeds = [[x.squeeze(0) for x in emb] for emb in soft_embeds]
+ input_dict["past_key_values"] = tuple(
+ [tuple(emb) for emb in soft_embeds])
+ return input_dict
+
+ def _create_soft_encoders(self):
+ output_size = self.embed_size * self.n_layer * 2
+ activation = nn.Tanh()
+ return super(PrefixTemplate,
+ self)._create_soft_encoders(output_size, activation)
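And analogously for prefix-tuning (illustrative values); `prefix` must be the first part of the template, and the model is required so that the number of layers and attention heads can be inferred:

```python
# Hypothetical usage sketch for PrefixTemplate (not part of the patch).
from paddlenlp.prompt import AutoTemplate
from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")

template = AutoTemplate.create_from(
    prompt="{'prefix': '新闻分类', 'encoder': 'mlp'}{'text': 'text_a'}{'mask'}",
    tokenizer=tokenizer,
    max_length=128,
    model=model)
# The encoded prefix embeddings are injected as `past_key_values` for every
# layer in `process_batch` above.
```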
class AutoTemplate(object):
@@ -465,83 +816,70 @@ class AutoTemplate(object):
AutoTemplate can help you automatically create the relevant Template
given the provided prompt.
"""
- registered_text_keys = ['text_a', 'text_b']
+ default_text_keyword = "text_a"
def __init__(self, *args, **kwargs):
raise EnvironmentError(
'{} is designed to be instantiated using {}.create_from('\
- 'template, tokenizer, text_list, ...)'.format(
+ 'prompt, tokenizer, max_length, ...)'.format(
self.__class__.__name__, self.__class__.__name__))
- @classmethod
- def parse_inputs(cls, inputs):
- return parse_template(inputs)
-
@classmethod
def create_from(cls,
- template,
- tokenizer,
- max_seq_length,
- model=None,
- prompt_encoder=None,
- encoder_hidden_size=None):
- if template is None:
- template = "{'soft'}"
- if isinstance(template, str):
- template = cls.parse_inputs(template)
- template_keys = cls._extract_template_keys(template)
- if 'text' not in template_keys:
- soft_template = []
- for item in template:
- if 'hard' in item:
- soft_template.append({
- 'add_prefix_space': '',
- 'soft': item['hard']
- })
- else:
- soft_template.append(item)
- text_item = [{
- 'add_prefix_space': ' ',
- 'text': cls.registered_text_keys[0]
- }]
- template = text_item + soft_template
- template_keys = cls._extract_template_keys(template)
-
- if 'mask' not in template_keys:
- template.append({'add_prefix_space': ' ', 'mask': None})
-
- if 'soft' in template_keys:
- return SoftTemplate(tokenizer=tokenizer,
- template=template,
- max_seq_length=max_seq_length,
- model=model,
- prompt_encoder=prompt_encoder,
- encoder_hidden_size=encoder_hidden_size)
+ prompt: str,
+ tokenizer: PretrainedTokenizer,
+ max_length: int = 512,
+ model: PretrainedModel = None,
+ soft_embeddings: Tensor = None):
+ # Default template if not defined.
+ if prompt is None:
+ prompt = "{'soft'}{'text': 'text_a'}{'mask'}"
+
+ if isinstance(prompt, str):
+ prompt = Template.parse_template_string(prompt)
+ template_keywords = Template.extract_template_keywords(prompt)
+
+ # Complement simplified template as ManualTemplate-style in form.
+ if "text" not in template_keywords:
+ prompt = [{"text": cls.default_text_keyword}] + prompt
+ if "mask" not in template_keywords:
+ prompt = prompt + [{"mask": None}]
+
+ # Choose Template according to template keywords.
+ if "prefix" in template_keywords:
+ return PrefixTemplate(prompt=prompt,
+ tokenizer=tokenizer,
+ max_length=max_length,
+ model=model)
+ elif "soft" in template_keywords or "soft_id" in template_keywords:
+ word_embeddings = model.get_input_embeddings()
+ return SoftTemplate(prompt=prompt,
+ tokenizer=tokenizer,
+ max_length=max_length,
+ word_embeddings=word_embeddings)
else:
- return ManualTemplate(tokenizer=tokenizer,
- max_seq_length=max_seq_length,
- template=template)
+ return ManualTemplate(prompt=prompt,
+ tokenizer=tokenizer,
+ max_length=max_length)
@classmethod
def load_from(cls,
- data_dir,
- tokenizer,
- max_seq_length,
- model=None,
- prompt_encoder=None,
- encoder_hidden_size=None):
- with open(os.path.join(data_dir, TEMPLATE_FILE), "r") as f:
- template = json.load(f)
- return cls.create_from(template, tokenizer, max_seq_length, model,
- prompt_encoder, encoder_hidden_size)
-
- @classmethod
- def _extract_template_keys(cls, inputs: list):
- template_keys = set()
- for item_dict in inputs:
- for key, value in item_dict.items():
- template_keys.add(key)
- if key == 'text':
- assert value in cls.registered_text_keys, 'No ``{}`` attribute '\
- 'in InputExample.'.format(value)
- return template_keys
+ data_path: os.PathLike,
+ tokenizer: PretrainedTokenizer,
+ max_length: int,
+ model: PretrainedModel = None):
+ template_config_file = os.path.join(data_path, TEMPLATE_CONFIG_FILE)
+ if not os.path.isfile(template_config_file):
+ raise ValueError("{} not found under {}".format(
+ TEMPLATE_CONFIG_FILE, data_path))
+ with open(template_config_file, "r") as fp:
+ prompt = json.loads(fp.readline().strip())
+ # TODO (Huijuan): Load all configs from data_path.
+ template = cls.create_from(prompt=prompt,
+ tokenizer=tokenizer,
+ max_length=max_length,
+ model=model)
+ template_param_file = os.path.join(data_path, TEMPLATE_PARAMETER_FILE)
+ if os.path.isfile(template_param_file):
+ template.set_state_dict(paddle.load(template_param_file))
+ return template
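The templates above can be persisted and restored through the new config and parameter files (a sketch; the path and prompt are illustrative):

```python
# Hypothetical save/load round trip (not part of the patch).
from paddlenlp.prompt import AutoTemplate
from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")
template = AutoTemplate.create_from("{'soft'}{'text': 'text_a'}{'mask'}",
                                    tokenizer, max_length=128, model=model)

template.save("./checkpoints/template")   # TEMPLATE_CONFIG_FILE (+ parameters, if any)
template = AutoTemplate.load_from("./checkpoints/template",
                                  tokenizer=tokenizer,
                                  max_length=128,
                                  model=model)
```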
diff --git a/paddlenlp/prompt/verbalizer.py b/paddlenlp/prompt/verbalizer.py
index ac64c861cc64..bb412259d78b 100644
--- a/paddlenlp/prompt/verbalizer.py
+++ b/paddlenlp/prompt/verbalizer.py
@@ -12,312 +12,337 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from abc import abstractmethod
-from collections import defaultdict
import os
import copy
import json
+from abc import abstractmethod
+from typing import Any, Dict, List
import numpy as np
-from typing import List, Dict, Union
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
+from paddle import Tensor
+from paddlenlp.transformers import PretrainedTokenizer, PretrainedModel
+from paddlenlp.utils.log import logger
-from ..utils.log import logger
-
-__all__ = [
- "Verbalizer", "MultiMaskVerbalizer", "ManualVerbalizer", "SoftVerbalizer"
-]
+__all__ = ["Verbalizer", "ManualVerbalizer", "SoftVerbalizer"]
-VERBALIZER_FILE = "verbalizer.json"
+# Verbalizer used to be saved in a file.
+VERBALIZER_CONFIG_FILE = "verbalizer_config.json"
+VERBALIZER_PARAMETER_FILE = "verbalizer_state.pdparams"
class Verbalizer(nn.Layer):
"""
- Base verbalizer class used to process the outputs and labels.
+    Base class for verbalizers, which define the mapping from labels to words.
Args:
- labels (list):
- The sequence of labels in task.
-
+ label_words (`dict`):
+ Define the mapping from labels to a single or multiple words.
+ tokenizer (`PretrainedTokenizer`):
+ An instance of PretrainedTokenizer for label word tokenization.
"""
- def __init__(self, labels):
- super().__init__()
- self.labels = labels
+ def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer,
+ **kwargs):
+ super(Verbalizer, self).__init__()
+ for key, value in kwargs.items():
+ setattr(self, key, value)
+ self.tokenizer = tokenizer
+ self.token_aggregate_type = kwargs.get("token_aggregate_type", "mean")
+ self.word_aggregate_type = kwargs.get("word_aggregate_type", "mean")
+ self.mask_aggregate_type = kwargs.get("mask_aggregate_type", "product")
+ self.post_log_softmax = kwargs.get("post_log_sigmoid", True)
+ self.label_token_weight = kwargs.get("label_token_weight", None)
+ if self.label_token_weight is not None:
+ self.label_token_weight = self.normalize(
+ self.project(self.label_token_weight.unsqueeze(0)))
+ self.label_words = label_words
@property
def labels(self):
- labels = getattr(self, "_labels", None)
- if labels is None:
- raise RuntimeError("`labels` is not set yet.")
- return labels
+ if not hasattr(self, "_labels"):
+ raise RuntimeError("Attribute `labels` is not set yet.")
+ return self._labels
@labels.setter
def labels(self, labels):
if labels is not None:
self._labels = sorted(labels)
- else:
- self._labels = None
@property
def label_words(self):
- label_words = getattr(self, "_label_words", None)
- if label_words is None:
- raise RuntimeError("`label_words not set yet.")
- return label_words
+ if not hasattr(self, "_label_words"):
+ raise RuntimeError("Mapping from labels to words is not set yet.")
+ return self._label_words
@label_words.setter
- def label_words(self, label_words: Union[List, Dict]):
+ def label_words(self, label_words: Dict):
if label_words is None:
return None
- if isinstance(label_words, dict):
- new_labels = sorted(list(label_words.keys()))
- if self._labels is None:
- self._labels = new_labels
- elif new_labels != self.labels:
- raise ValueError(
- f"The given `label_words` {new_labels} does not match " +
- f"predefined labels {self.labels}.")
- self._label_words = [label_words[k] for k in self.labels]
- elif isinstance(label_words, list):
- if self._labels is None:
- raise ValueError(
- "`labels` should be set as the given `label_words` is "
- "a list. Make sure that the order is compatible.")
- if len(self.labels) != len(label_words):
- raise ValueError(
- "The length of given `label_words` and predefined " +
- "labels do not match.")
- self._label_words = label_words
- else:
- raise TypeError('Unsupported type {} for label_words'.format(
- type(label_words)))
- self.process_label_words()
-
- @property
- def labels_to_ids(self):
- if not hasattr(self, 'labels'):
- raise RuntimeError(
- 'Property labels_to_ids has not been set before used.')
- return {k: i for i, k in enumerate(self.labels)}
-
- @property
- def ids_to_labels(self):
- if not hasattr(self, 'labels'):
- raise RuntimeError(
- 'Property ids_to_labels has not been set before used.')
- return {i: k for i, k in enumerate(self.labels)}
-
- @staticmethod
- def add_prefix(label_words, prefix):
- """ Add prefix to get expected token ids. """
- if isinstance(label_words[0], str):
- label_words = [[word] for word in label_words]
-
- new_label_words = []
- for words_per_label in label_words:
- new_words_per_label = []
- for word in words_per_label:
- new_words_per_label.append(prefix + word)
- new_label_words.append(new_words_per_label)
- return new_label_words
+ self.labels = list(label_words.keys())
+ self.labels_to_ids = {
+ label: idx
+ for idx, label in enumerate(self._labels)
+ }
+ self._words = []
+ for label in self._labels:
+ words = label_words[label]
+ if isinstance(words, str):
+ words = [words]
+ self._words.append(words)
+ self._label_words = {
+ label: word
+ for label, word in zip(self._labels, self._words)
+ }
+ self.preprocess_label_words()
+ self.create_parameters()
@abstractmethod
- def process_label_words(self, ):
- """ A hook to process verbalizer when it is set. """
- raise NotImplementedError
-
- @abstractmethod
- def project(self, logits, **kwargs):
- """
- Project the logits with shape ```[..., vocab_size]``` into
- label_word_logits with shape ```[..., label_words]```.
+ def create_parameters(self):
+ """
+ A hook to create parameters for mapping from labels to words.
"""
raise NotImplementedError
- @staticmethod
- def aggregate(embeddings, mask=None, atype='mean', ndim=2):
- """
- Aggregate embeddings at the last dimension according to `atype`
- if its number of dimensions is greater than `ndim`.
- Used to handle multiple tokens for words and multiple words
- for labels.
+ def preprocess_label_words(self):
+ label_token_ids = []
+ for label_word in self._words:
+ word_token_ids = []
+ for word in label_word:
+ token_ids = self.tokenizer.encode(word,
+ add_special_tokens=False,
+ return_token_type_ids=False)
+ word_token_ids.append(token_ids["input_ids"])
+ label_token_ids.append(word_token_ids)
+
+ max_num_words = max([len(words) for words in self._words])
+ max_num_tokens = max([
+ max([len(token_ids) for token_ids in word_token_ids])
+ for word_token_ids in label_token_ids
+ ])
+ token_ids_shape = [len(self.labels), max_num_words, max_num_tokens]
+ token_ids = np.zeros(token_ids_shape)
+ word_mask = np.zeros(token_ids_shape[:-1])
+ token_mask = np.zeros(token_ids_shape)
+ for label_id, word_token_ids in enumerate(label_token_ids):
+ word_mask[label_id][:len(word_token_ids)] = 1
+ for word_id, tokens in enumerate(word_token_ids):
+ token_ids[label_id][word_id][:len(tokens)] = tokens
+ token_mask[label_id][word_id][:len(tokens)] = 1
+ self.token_ids = paddle.to_tensor(token_ids,
+ dtype="int64",
+ stop_gradient=True)
+ self.word_mask = paddle.to_tensor(word_mask,
+ dtype="float32",
+ stop_gradient=True)
+ self.token_mask = paddle.to_tensor(token_mask,
+ dtype="int64",
+ stop_gradient=True)
- Args:
- embeddings (paddle.Tensor):
- The original embeddings.
- atype (str):
- The aggregation strategy, including mean and first.
- ndim (str):
- The aggregated embeddings' number of dimensions.
+ def convert_labels_to_ids(self, label: str):
+ assert isinstance(label, str)
+ return self.labels_to_ids[label]
- """
- if embeddings.ndim > ndim and atype is not None:
- if atype == 'mean':
- if mask is None:
- return embeddings.mean(axis=-1)
- return (embeddings * mask.unsqueeze(0)).sum(
- axis=-1) / (mask.unsqueeze(0).sum(axis=-1) + 1e-10)
- elif atype == 'max':
- if mask is None:
- return embeddings.max(axis=-1)
- return (embeddings - 1e4 * (1 - mask.unsqueeze(0))).max(axis=-1)
- elif atype == 'first':
- return embeddings[..., 0]
- else:
- raise ValueError('Unsupported aggregate type {}'.format(atype))
- return embeddings
+ def convert_ids_to_labels(self, index: int):
+ assert isinstance(index, int)
+ return self.labels[index]
- def normalize(self, logits):
- """ Normalize the logits of every example. """
- new_logits = F.softmax(logits.reshape(logits.shape[0], -1), axis=-1)
- return new_logits.reshape(*logits.shape)
+ def project(self, outputs):
+ """
+ Fetch label word predictions from outputs over vocabulary.
+ """
+ token_ids = self.token_ids.reshape([-1])
+ label_token_outputs = outputs.index_select(index=token_ids, axis=-1)
+ label_shape = [*outputs.shape[:-1], *self.token_ids.shape]
+ label_token_outputs = label_token_outputs.reshape(label_shape)
+ label_word_outputs = self.aggregate(label_token_outputs,
+ self.token_mask,
+ self.token_aggregate_type)
+ label_word_outputs -= 1e4 * (1 - self.word_mask)
+ return label_word_outputs
+
+ def process_outputs(self, outputs, masked_positions: Tensor = None):
+ """
+ Process outputs of `PretrainedModelForMaskedLM` over vocabulary.
+ """
+ if masked_positions is None:
+ return outputs
+ batch_size, _, num_pred = outputs.shape
+ outputs = outputs.reshape([-1, num_pred])
+ outputs = paddle.gather(outputs, masked_positions)
+ outputs = outputs.reshape([batch_size, -1, num_pred])
+ return outputs
+
+ def aggregate(self, outputs: Tensor, mask: Tensor, atype: str):
+ """
+ Aggregate multiple tokens/words for each word/label.
+ """
+ mask = mask.unsqueeze(0)
+ if atype == "mean":
+ outputs = outputs * mask
+ outputs = outputs.sum(axis=-1) / (mask.sum(axis=-1) + 1e-15)
+ elif atype == "max":
+ outputs = (outputs - 1e4 * (1 - mask)).max(axis=-1)
+ elif atype == "first":
+ index = paddle.to_tensor([0])
+ outputs = paddle.index_select(outputs, index, axis=-1)
+ else:
+ raise ValueError(
+ "Strategy {} is not supported to aggregate multiple "
+ "tokens.".format(atype))
+ return outputs
- def from_file(self, path):
+ def normalize(self, outputs: Tensor):
"""
- Load labels and corresponding words from files.
+ Normalize the outputs over the whole vocabulary.
"""
- raise NotImplementedError
+ batch_size = outputs.shape[0]
+ outputs = F.softmax(outputs.reshape([batch_size, -1]),
+ axis=-1).reshape(outputs.shape)
+ return outputs
- def save_to(self, path):
- label_state = [self.labels, self.token_ids.numpy().tolist()]
- with open(os.path.join(path, VERBALIZER_FILE), "w") as f:
- json.dump(label_state, f)
+ def calibrate(self, label_word_outputs: Tensor):
+ """
+ Calibrate predictions with pre-defined weights over the whole vocabulary.
+ """
+ if self.label_token_weight.dim() != 1:
+ raise ValueError("Weights of label tokens should be a 1-D tensor.")
+ weight_shape = self.label_token_weight.shape
+ output_shape = label_word_outputs.shape
+ if weight_shape[1:] != output_shape[1:] or weight_shape[0] != 1:
+ raise ValueError(
+ "Shapes of label token weights and predictions do not match, "
+ "got {} and {}.".format(weight_shape, output_shape))
+ label_word_outputs /= (self.label_token_weight + 1e-15)
+        batch_size = label_word_outputs.shape[0]
+ label_word_outputs = paddle.mean(
+ label_word_outputs.reshape([batch_size, -1])).reshape(output_shape)
+
+ return label_word_outputs
+
+ def save(self, save_path):
+ if not os.path.exists(save_path):
+ os.makedirs(save_path, exist_ok=True)
+ verb_config_file = os.path.join(save_path, VERBALIZER_CONFIG_FILE)
+ with open(verb_config_file, "w", encoding="utf-8") as fp:
+ json.dump(self.label_words, fp, ensure_ascii=False)
+ verb_params_file = os.path.join(save_path, VERBALIZER_PARAMETER_FILE)
+ verb_state_dict = self.state_dict()
+ if len(verb_state_dict) > 0:
+            paddle.save(verb_state_dict, verb_params_file)
@classmethod
- def load_from(cls, path):
- with open(os.path.join(path, VERBALIZER_FILE), "r") as f:
- label_state = json.load(f)
- return label_state
+ def load_from(cls, data_path: os.PathLike, tokenizer: PretrainedTokenizer):
+ verb_config_file = os.path.join(data_path, VERBALIZER_CONFIG_FILE)
+ if not os.path.isfile(verb_config_file):
+ raise ValueError("{} not found under {}".format(
+ VERBALIZER_CONFIG_FILE, data_path))
+ with open(verb_config_file, "r") as fp:
+ label_words = json.load(fp)
+
+ verbalizer = cls(label_words, tokenizer)
+ verb_state_file = os.path.join(data_path, VERBALIZER_PARAMETER_FILE)
+ if os.path.isfile(verb_state_file):
+ verbalizer.set_state_dict(paddle.load(verb_state_file))
+ logger.info(
+ "Loading verbalizer state dict from {}".format(verb_state_file))
+ return verbalizer
class ManualVerbalizer(Verbalizer):
"""
- Manual Verbalizer to map labels to words for hard prompt methods.
+ ManualVerbalizer defines mapping from labels to words manually.
Args:
- tokenizer (paddlenlp.transformers.PretrainedTokenizer):
- The tokenizer of pretrained models.
- labels (list):
- The sequence of all labels.
- label_words (dict or list):
- The dictionary or corresponding list to map labels to words.
- prefix (str):
- The prefix string of words, used in PLMs like RoBERTa, which is sensitive to the prefix.
+ label_words (`dict`):
+ Define the mapping from labels to a single or multiple words.
+ tokenizer (`PretrainedTokenizer`):
+ An instance of PretrainedTokenizer for label word tokenization.
"""
- def __init__(self, tokenizer, labels=None, label_words=None, prefix=None):
- super().__init__(labels=labels)
- self.tokenizer = tokenizer
- self.prefix = prefix
- self.label_words = label_words
-
- def process_label_words(self):
- """ Create the label-word-token array and its corresponding mask. """
- if self.prefix is not None:
- self._label_words = self.add_prefix(self.label_words, self.prefix)
-
- all_ids = []
- for words_per_label in self.label_words:
- word_ids = []
- for word in words_per_label:
- word_ids.append(
- self.tokenizer.encode(
- word,
- add_special_tokens=False,
- return_token_type_ids=False)["input_ids"])
- all_ids.append(word_ids)
-
- max_num_words = max([len(words) for words in self.label_words])
- max_num_tokens = max([
- max([len(token_ids) for token_ids in word_ids])
- for word_ids in all_ids
- ])
- token_ids_shape = [len(self.labels), max_num_words, max_num_tokens]
- token_ids = np.zeros(shape=token_ids_shape)
- token_mask = np.zeros(shape=token_ids_shape)
- word_mask = np.zeros(shape=[len(self.labels), max_num_words])
- for label_i, ids_per_label in enumerate(all_ids):
- word_mask[label_i][:len(ids_per_label)] = 1
- for word_i, ids_per_word in enumerate(ids_per_label):
- token_ids[label_i][word_i][:len(ids_per_word)] = ids_per_word
- token_mask[label_i][word_i][:len(ids_per_word)] = 1
- self.token_ids = paddle.to_tensor(token_ids,
- dtype="int64",
- stop_gradient=True)
- self.token_ids_mask = paddle.to_tensor(token_mask,
- dtype="int64",
- stop_gradient=True)
- self.word_ids_mask = paddle.to_tensor(word_mask,
- dtype="float32",
- stop_gradient=True)
-
- def project(self, logits):
- word_shape = [*logits.shape[:-1], *self.token_ids.shape]
- token_logits = logits.index_select(index=self.token_ids.reshape([-1]),
- axis=-1).reshape(word_shape)
- word_logits = self.aggregate(token_logits, self.token_ids_mask)
- return word_logits
-
- def process_outputs(self, logits, inputs, **kwargs):
- mask_ids = inputs["mask_ids"].unsqueeze(2)
- real_len = logits.shape[1]
- mask_ids = mask_ids[:, -real_len:]
- logits = paddle.where(mask_ids == 1, logits, paddle.zeros_like(logits))
- logits = logits.sum(axis=1) / mask_ids.sum(axis=1)
-
- word_logits = self.project(logits)
- label_logits = self.aggregate(word_logits, self.word_ids_mask)
- return label_logits
+ def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer):
+ super(ManualVerbalizer, self).__init__(label_words=label_words,
+ tokenizer=tokenizer)
+
+ def create_parameters(self):
+ return None
+
+ def aggregate_multiple_mask(self, outputs: Tensor, atype: str = None):
+ if atype is None:
+ return outputs
+ assert outputs.ndim == 3
+ if atype == "mean":
+ outputs = outputs.sum(axis=1)
+ elif atype == "max":
+ outputs = outputs.max(axis=1)
+ elif atype == "first":
+ index = paddle.to_tensor([0])
+ outputs = paddle.index_select(outputs, index, axis=1)
+ elif atype == "product":
+ new_outputs = outputs[:, 0, :]
+ for index in range(1, outputs.shape[1]):
+ new_outputs *= outputs[:, index, :]
+ outputs = new_outputs
+ else:
+ raise ValueError(
+ "Strategy {} is not supported to aggregate multiple "
+ "tokens.".format(atype))
+ return outputs
+
+ def process_outputs(self,
+ outputs: Tensor,
+ masked_positions: Tensor = None,
+ **kwargs):
+ """
+ Process outputs over the vocabulary, including the following steps:
- @classmethod
- def from_file(cls, tokenizer, label_file, prefix=None, delimiter="=="):
- with open(label_file, "r", encoding="utf-8") as fp:
- label_words = defaultdict(list)
- for line in fp:
- data = line.strip().split(delimiter)
- word = data[1] if len(data) > 1 else data[0].split("##")[-1]
- label_words[data[0]].append(word)
- return cls(tokenizer,
- labels=set(label_words.keys()),
- label_words=dict(label_words),
- prefix=prefix)
+ (1) Project outputs into the outputs of corresponding word.
+ If self.post_log_softmax is True:
+
+ (2) Normalize over all label words.
-class MultiMaskVerbalizer(ManualVerbalizer):
+ (3) Calibrate (optional)
- def __init__(self, tokenizer, labels=None, label_words=None, prefix=None):
- super().__init__(tokenizer, labels, label_words, prefix)
+ (4) Aggregate multiple words for each label.
- def process_outputs(self, logits, inputs, **kwargs):
- """
- Process logits according to mask ids and label words.
Args:
- logits (paddle.Tensor):
- The output of ForMaskedLM model with shape
- [batch_size, max_seq_length, vocab_size].
- inputs (InputFeatures):
- The input features of model, including mask_ids.
+ outputs (`Tensor`):
+                The outputs of `PretrainedModel` whose class name ends with
+ `ForMaskedLM`.
+ Returns:
+ The prediction outputs over labels (`Tensor`).
"""
- batch_size, seq_len, vocab_size = logits.shape
- batch_ids, word_ids = paddle.where(inputs["mask_ids"] == 1)
- mask_ids = batch_ids * seq_len + word_ids
- mask_logits = logits.reshape([-1, vocab_size])[mask_ids]
- mask_logits = mask_logits.reshape([batch_size, -1, vocab_size])
- return mask_logits
+ outputs = super(ManualVerbalizer,
+ self).process_outputs(outputs, masked_positions)
+ label_word_outputs = self.project(outputs)
+
+ if self.post_log_softmax:
+ label_word_outputs = self.normalize(label_word_outputs)
+
+ if self.label_token_weight is not None:
+ label_word_outputs = self.calibrate(label_word_outputs)
+
+ label_word_outputs = paddle.log(label_word_outputs + 1e-15)
+ label_outputs = self.aggregate(label_word_outputs, self.word_mask,
+ self.word_aggregate_type)
+ label_outputs = self.aggregate_multiple_mask(label_outputs,
+ self.mask_aggregate_type)
+ return label_outputs
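A minimal sketch of the new verbalizer interface (label words and paths are illustrative):

```python
# Hypothetical usage sketch for ManualVerbalizer (not part of the patch).
from paddlenlp.prompt import ManualVerbalizer
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
verbalizer = ManualVerbalizer(
    label_words={"负向": "不满意", "正向": ["满意", "喜欢"]},  # one or more words per label
    tokenizer=tokenizer)
# verbalizer.process_outputs(mlm_outputs, masked_positions) projects the
# vocabulary outputs onto the label words and aggregates them per label.

verbalizer.save("./checkpoints/verbalizer")   # VERBALIZER_CONFIG_FILE (+ parameters, if any)
verbalizer = ManualVerbalizer.load_from("./checkpoints/verbalizer", tokenizer)
```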
-class Identity(nn.Layer):
+
+class MaskedLMIdentity(nn.Layer):
"""
- Identity layer to replace the last linear layer in MLM model, which
- outputs the input `sequence_output` directly.
+ Identity layer with the same arguments as the last linear layer in
+ `PretrainedModel` whose name ends with `ForMaskedLM`.
"""
def __init__(self):
- super().__init__()
+ super(MaskedLMIdentity, self).__init__()
def forward(self, sequence_output, masked_positions=None):
return sequence_output
@@ -325,59 +350,44 @@ def forward(self, sequence_output, masked_positions=None):
class SoftVerbalizer(Verbalizer):
"""
- Soft Verbalizer to encode labels as embeddings.
+ SoftVerbalizer for the WARP method.
Args:
- tokenizer (paddlenlp.transformers.PretrainedTokenizer):
- The tokenizer of pretrained models.
- model (paddlenlp.transformers.PretrainedModel):
- The pretrained language model.
- labels (list):
- The sequence of all labels.
- label_words (dict or list):
- The dictionary or corresponding list to map labels to words.
- prefix (str):
- The prefix string of words, used in PLMs like RoBERTa, which is sensitive to the prefix.
+ label_words (`dict`):
+ Define the mapping from labels to a single or multiple words.
+ tokenizer (`PretrainedTokenizer`):
+ An instance of PretrainedTokenizer for label word tokenization.
+ model (`PretrainedModel`):
+            An instance of PretrainedModel whose class name ends with `ForMaskedLM`.
"""
-
LAST_WEIGHT = ["ErnieForMaskedLM", "BertForMaskedLM"]
LAST_LINEAR = ["AlbertForMaskedLM", "RobertaForMaskedLM"]
- def __init__(self, tokenizer, model, labels, label_words=None, prefix=''):
- super().__init__(labels=labels)
- self.tokenizer = tokenizer
- self.labels = labels
- self.prefix = prefix
- self.label_words = label_words
-
- self._extract_head(model)
-
- def process_label_words(self):
- """ Create the label-token array and its corresponding mask. """
- if self.prefix is not None:
- self._label_words = self.add_prefix(self.label_words, self.prefix)
-
- all_ids = []
- for words_per_label in self.label_words:
- if len(words_per_label) > 1:
- logger.warning("Only the first word used for every label.")
- all_ids.append(
- self.tokenizer.encode(words_per_label[0],
- add_special_tokens=False,
- return_token_type_ids=False)["input_ids"])
-
- max_num_tokens = max([len(tokens) for tokens in all_ids])
- token_ids = np.zeros(shape=[len(self.labels), max_num_tokens])
- token_mask = np.zeros(shape=[len(self.labels), max_num_tokens])
- for label_i, ids_per_label in enumerate(all_ids):
- token_ids[label_i][:len(ids_per_label)] = ids_per_label
- token_mask[label_i][:len(ids_per_label)] = 1
- self.token_ids = paddle.to_tensor(token_ids,
- dtype="int64",
- stop_gradient=True)
- self.token_ids_mask = paddle.to_tensor(token_mask,
- dtype="int64",
- stop_gradient=True)
+ def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer,
+ model: PretrainedModel):
+ super(SoftVerbalizer, self).__init__(label_words=label_words,
+ tokenizer=tokenizer,
+ model=model)
+ del self.model
+ setattr(model, self.head_name[0], MaskedLMIdentity())
+
+ def create_parameters(self):
+ # Only the first word used for initialization.
+ if self.token_ids.shape[1] != 1:
+ logger.warning("Only the first word for each label is used for"
+ " initialization.")
+ index = paddle.to_tensor([0])
+ self.token_ids = paddle.index_select(self.token_ids, index, axis=1)
+ self.token_mask = paddle.index_select(self.token_mask,
+ index,
+ axis=1)
+ self.word_mask = paddle.ones([len(self.labels), 1])
+ self._extract_head(self.model)
+
+ def process_outputs(self, outputs: Tensor, masked_positions: Tensor = None):
+ outputs = super(SoftVerbalizer,
+ self).process_outputs(outputs, masked_positions)
+ return self.head(outputs).squeeze(1)
def head_parameters(self):
if isinstance(self.head, nn.Linear):
@@ -393,31 +403,6 @@ def non_head_parameters(self):
return [(n, p) for n, p in self.head.named_parameters()
if self.head_name[1] not in n]
- def process_model(self, model):
- setattr(model, self.head_name[0], Identity())
- return model
-
- def process_outputs(self, logits, inputs=None, **kwargs):
- mask_ids = inputs["mask_ids"].unsqueeze(2)
- real_len = logits.shape[1]
- mask_ids = mask_ids[:, -real_len:]
- logits = (logits * mask_ids).sum(axis=1) / mask_ids.sum(axis=1)
- return self.head(logits)
-
- @classmethod
- def from_file(cls, tokenizer, model, label_file, prefix=None):
- with open(label_file, "r", encoding="utf-8") as fp:
- label_words = defaultdict(list)
- for line in fp:
- data = line.strip().split("==")
- word = data[1] if len(data) > 1 else data[0]
- label_words[data[0]].append(word)
- return cls(tokenizer,
- model,
- labels=set(label_words.keys()),
- label_words=dict(label_words),
- prefix=prefix)
-
def _extract_head(self, model):
model_type = model.__class__.__name__
if model_type in self.LAST_LINEAR:
@@ -439,45 +424,51 @@ def _extract_head(self, model):
self.head_name.append(name)
break
elif model_type in self.LAST_WEIGHT:
- # OnlyMLMHead
last_name = [n for n, p in model.named_children()][-1]
head = getattr(model, last_name)
self.head_name = [last_name]
- # LMPredictionHead
- last_name = [n for n, p in head.named_children()][-1]
- self.head = copy.deepcopy(getattr(head, last_name))
- self.head_name.append("decoder")
+ # OnlyMLMHead
+ if model_type in ["ErnieForMaskedLM", "BertForMaskedLM"]:
+ last_name = [n for n, p in head.named_children()][-1]
+ self.head = copy.deepcopy(getattr(head, last_name))
+ self.head_name.append("decoder")
+ else:
+ self.head = copy.deepcopy(head)
+ # LMPredictionHead
module = paddle.to_tensor(getattr(self.head, "decoder_weight"))
- bias = paddle.to_tensor(getattr(self.head, "decoder_bias"))
new_head = nn.Linear(len(self.labels),
module.shape[1],
bias_attr=False)
new_head.weight.set_value(self._create_init_weight(module.T).T)
setattr(self.head, "decoder_weight", new_head.weight)
getattr(self.head, "decoder_weight").stop_gradient = False
- setattr(
- self.head, "decoder_bias",
- self.head.create_parameter(shape=[len(self.labels)],
- dtype=new_head.weight.dtype,
- is_bias=True))
- getattr(self.head, "decoder_bias").stop_gradient = False
+ if hasattr(self.head, "decoder_bias"):
+ bias = paddle.to_tensor(getattr(self.head, "decoder_bias"))
+ setattr(
+ self.head, "decoder_bias",
+ self.head.create_parameter(shape=[len(self.labels)],
+ dtype=new_head.weight.dtype,
+ is_bias=True))
+ getattr(self.head, "decoder_bias").stop_gradient = False
else:
raise NotImplementedError(
f"Please open an issue to request for support of {model_type}" +
f" or contribute to PaddleNLP.")
def _create_init_weight(self, weight, is_bias=False):
+ token_ids = self.token_ids.squeeze(1)
+ token_mask = self.token_mask.squeeze(1)
+ aggr_type = self.token_aggregate_type
if is_bias:
- bias = paddle.index_select(weight,
- self.token_ids.reshape([-1]),
- axis=0).reshape(self.token_ids.shape)
- bias = self.aggregate(bias, self.token_ids_mask)
+ bias = paddle.index_select(weight, token_ids.reshape([-1]),
+ axis=0).reshape(token_ids.shape)
+ bias = self.aggregate(bias, token_mask, aggr_type)
return bias
else:
- word_shape = [weight.shape[0], *self.token_ids.shape]
+ word_shape = [weight.shape[0], *token_ids.shape]
weight = paddle.index_select(weight,
- self.token_ids.reshape([-1]),
+ token_ids.reshape([-1]),
axis=1).reshape(word_shape)
- weight = self.aggregate(weight, self.token_ids_mask)
+ weight = self.aggregate(weight, token_mask, aggr_type)
return weight
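Since `SoftVerbalizer` replaces the model's MLM decoder with `MaskedLMIdentity` and keeps a trainable copy of the head, the copied head is usually optimized separately from the backbone, as in the WARP method (a sketch; hyperparameters are illustrative):

```python
# Hypothetical usage sketch for SoftVerbalizer (not part of the patch).
import paddle
from paddlenlp.prompt import SoftVerbalizer
from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
model = AutoModelForMaskedLM.from_pretrained("ernie-3.0-base-zh")
verbalizer = SoftVerbalizer(
    label_words={"负向": "不满意", "正向": "满意"},
    tokenizer=tokenizer,
    model=model)  # the model's MLM head is replaced by MaskedLMIdentity

# Optimize the copied head with its own optimizer; `non_head_parameters()`
# would be grouped with the backbone parameters instead.
head_optimizer = paddle.optimizer.AdamW(
    learning_rate=3e-4,
    parameters=[p for _, p in verbalizer.head_parameters()])
```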