merge develop branch
wj-Mcat committed Nov 16, 2022
2 parents 1b77e4e + e198d1b commit f663930
Showing 25 changed files with 2,063 additions and 1,564 deletions.
27 changes: 14 additions & 13 deletions applications/text_classification/hierarchical/few-shot/README.md
@@ -65,9 +65,9 @@

 Memory: 630 GB

-3. PaddlePaddle version: 2.3.1
+3. PaddlePaddle version: 2.4rc

-4. PaddleNLP version: 2.3.5 (develop)
+4. PaddleNLP version: 2.4.3

 5. Evaluation setup

@@ -91,7 +91,7 @@
 | model_name | Training method | Micro F1 | Macro F1 |
 | ---------- | --------------- | -------- | -------- |
 | ernie-3.0-base-zh | Fine-tuning | 0.7172 | 0.3821 |
-| ernie-3.0-base-zh | Prompt learning | 0.8855 | 0.8443 |
+| ernie-3.0-base-zh | Prompt learning | 0.8945 | 0.8516 |
<a name="定制训练"></a>
@@ -102,10 +102,10 @@
<a name="运行环境"></a>
 ### 3.1 Runtime Environment
-- python >= 3.6
-- paddlepaddle > 2.3 (before the 2.4 release, installing the [develop build](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html) is recommended)
-- paddlenlp >= 2.3.5
-- paddle2onnx >= 1.0.0rc3
+- python >= 3.7
+- paddlepaddle >= 2.4rc
+- paddlenlp >= 2.4.3
+- paddle2onnx >= 1.0.3
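
A quick sanity check is to print the installed versions and compare them against the floors above; a minimal sketch, assuming each package is installed and exposes a `__version__` attribute:

```python
# Print installed versions to compare against the requirements above.
import paddle        # the paddlepaddle distribution imports as `paddle`
import paddlenlp
import paddle2onnx   # assumption: paddle2onnx exposes __version__ like the others

print("paddlepaddle:", paddle.__version__)
print("paddlenlp:", paddlenlp.__version__)
print("paddle2onnx:", paddle2onnx.__version__)
```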
<a name="代码结构"></a>
 ### 3.2 Code Structure
@@ -222,12 +222,12 @@ python train.py \
 --do_export \
 --num_train_epochs 100 \
 --logging_steps 5 \
+--save_total_limit 1 \
 --per_device_eval_batch_size 32 \
 --per_device_train_batch_size 8 \
 --metric_for_best_model macro_f1_score \
 --load_best_model_at_end \
---evaluation_strategy epoch \
---save_strategy epoch
+--eval_steps 100
```
**Multi-GPU training**

@@ -247,12 +247,12 @@ python -u -m paddle.distributed.launch --gpus 0,1,2,3 train.py \
 --do_export \
 --num_train_epochs 100 \
 --logging_steps 5 \
+--save_total_limit 1 \
 --per_device_eval_batch_size 32 \
 --per_device_train_batch_size 8 \
 --metric_for_best_model macro_f1_score \
 --load_best_model_at_end \
---evaluation_strategy epoch \
---save_strategy epoch
+--eval_steps 100
```

Configurable parameters:
@@ -273,6 +273,7 @@
 - `do_export`: whether to export the model as a static graph when the run finishes; it is saved under `output_dir/export`.
 - `num_train_epochs`: maximum number of training epochs.
 - `max_steps`: maximum number of training steps; this setting overrides `num_train_epochs`.
+- `save_total_limit`: number of model checkpoints to keep.
 - `device`: device to use; defaults to `gpu`.
 - `eval_steps`: number of steps between model evaluations.
 - `logging_steps`: number of steps between log prints.
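
Under the hood these flags are parsed into trainer dataclasses. A minimal sketch of that plumbing, assuming PaddleNLP's `PdArgumentParser` and `PromptTuningArguments`; the `DataArguments` fields mirror the `train.py` diff further down and are illustrative only:

```python
from dataclasses import dataclass, field

from paddlenlp.prompt import PromptTuningArguments
from paddlenlp.trainer import PdArgumentParser


@dataclass
class DataArguments:
    data_dir: str = field(default="./data", metadata={"help": "Dataset directory."})
    prompt: str = field(default=None, metadata={"help": "Prompt for tuning."})


# PromptTuningArguments carries the shared trainer flags such as
# --eval_steps, --save_total_limit and --metric_for_best_model.
# Run e.g.: python sketch.py --output_dir ./ckpt --eval_steps 100 --save_total_limit 1
parser = PdArgumentParser((DataArguments, PromptTuningArguments))
data_args, training_args = parser.parse_args_into_dataclasses()
print(training_args.eval_steps, training_args.save_total_limit)
```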
@@ -352,9 +353,9 @@ python infer.py --model_path_prefix checkpoints/export/model --data_dir ./data -
Configurable parameters:

 - `model_path_prefix`: path and file prefix of the exported static-graph model.
-- `model_name_or_path`: name of a built-in pretrained model, or a directory of model parameters, used to load the tokenizer. Defaults to `ernie-3.0-base-zh`.
+- `model_name`: name of a built-in pretrained model, used to load the tokenizer. Defaults to `ernie-3.0-base-zh`.
 - `data_dir`: directory containing the data to predict on; the data must live in a `data.txt` file under this directory.
-- `max_seq_length`: maximum sequence length; longer text is truncated and shorter text is padded. The prompt text itself is never truncated.
+- `max_length`: maximum sequence length; longer text is truncated and shorter text is padded. The prompt text itself is never truncated.
 - `batch_size`: number of samples per prediction batch.
 - `device`: inference device, `cpu` or `gpu`. Defaults to `gpu`.
 - `device_id`: ID of the GPU device to use.
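
For reference, `data.txt` is plain text with one unlabeled example per line. A minimal sketch of preparing it; the sample sentences are placeholders:

```python
# Write prediction inputs in the layout infer.py expects under --data_dir.
from pathlib import Path

samples = [  # placeholder texts
    "这家餐厅的服务很好",
    "手机电池续航太短",
]
data_dir = Path("./data")
data_dir.mkdir(parents=True, exist_ok=True)
(data_dir / "data.txt").write_text("\n".join(samples), encoding="utf-8")
```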
113 changes: 68 additions & 45 deletions applications/text_classification/hierarchical/few-shot/infer.py
@@ -14,23 +14,24 @@

 import os
-import six
+import json
 import psutil
 import argparse

 import numpy as np

 from paddlenlp.utils.log import logger
-from paddlenlp.prompt import AutoTemplate, Verbalizer, InputExample
-from paddlenlp.transformers import AutoTokenizer
+from paddlenlp.prompt import AutoTemplate, PromptDataCollatorWithPadding
+from paddlenlp.transformers import AutoTokenizer, AutoModelForMaskedLM
 import paddle2onnx
 import onnxruntime as ort

 # yapf: disable
 parser = argparse.ArgumentParser()
 parser.add_argument("--model_path_prefix", type=str, required=True, help="The path prefix of inference model to be used.")
-parser.add_argument("--model_name_or_path", default="ernie-3.0-base-zh", type=str, help="The directory or name of model.")
+parser.add_argument("--model_name", default="ernie-3.0-base-zh", type=str, help="The name of pretrained model.")
 parser.add_argument("--data_dir", default=None, type=str, help="The path to the prediction data, including label.txt and data.txt.")
-parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization.")
+parser.add_argument("--max_length", default=128, type=int, help="The maximum total input sequence length after tokenization.")
 parser.add_argument("--use_fp16", action='store_true', help="Whether to use fp16 inference, only takes effect when deploying on gpu.")
 parser.add_argument("--batch_size", default=200, type=int, help="Batch size per GPU/CPU for predicting.")
 parser.add_argument("--num_threads", default=psutil.cpu_count(logical=False), type=int, help="num_threads for cpu.")
@@ -103,12 +104,6 @@ def __init__(self,
                                                   'device_id':
                                                   device_id
                                               }])
-        self.input_handles = [
-            self.predictor.get_inputs()[0].name,
-            self.predictor.get_inputs()[1].name,
-            self.predictor.get_inputs()[2].name
-        ]

         if device == "gpu":
             try:
                 assert 'CUDAExecutionProvider' in self.predictor.get_providers()
@@ -122,27 +117,52 @@
logger.info(">>> [InferBackend] Engine Created ...")

     def infer(self, input_dict: dict):
-        input_dict = {
-            k: v
-            for k, v in input_dict.items() if k in self.input_handles
-        }
         result = self.predictor.run(None, input_dict)
         return result


 class HierachicalPredictor(object):

-    def __init__(self, args, label_list):
-        self._label_list = label_list
-        self._tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-        self._max_seq_length = args.max_seq_length
-        self._batch_size = args.batch_size
-        self.inference_backend = InferBackend(args.model_path_prefix,
-                                              args.device, args.device_id,
-                                              args.use_fp16, args.num_threads)
-        self._template = AutoTemplate.load_from(
-            os.path.dirname(args.model_path_prefix), self._tokenizer,
-            args.max_seq_length)
+    def __init__(self, args):
+        self.args = args
+        self.tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+        self.model = AutoModelForMaskedLM.from_pretrained(args.model_name)
+        self.template, self.labels, self.input_handles = self.post_init()
+        self.collate_fn = PromptDataCollatorWithPadding(
+            self.tokenizer,
+            padding=True,
+            return_tensors="np",
+            return_attention_mask=True)
+
+        self.inference_backend = InferBackend(self.args.model_path_prefix,
+                                              self.args.device,
+                                              self.args.device_id,
+                                              self.args.use_fp16,
+                                              self.args.num_threads)
+
+    def post_init(self):
+        export_path = os.path.dirname(self.args.model_path_prefix)
+        template_path = os.path.join(export_path, "template_config.json")
+        with open(template_path, "r") as fp:
+            prompt = json.load(fp)
+        template = AutoTemplate.create_from(prompt, self.tokenizer,
+                                            self.args.max_length,
+                                            self.model)
+        keywords = template.extract_template_keywords(template.prompt)
+        inputs = [
+            "input_ids", "token_type_ids", "position_ids", "attention_mask",
+            "masked_positions"
+        ]
+        if "soft" in keywords:
+            inputs.append("soft_token_ids")
+        if "encoder" in keywords:
+            inputs.append("encoder_ids")
+        verbalizer_path = os.path.join(export_path, "verbalizer_config.json")
+        with open(verbalizer_path, "r") as fp:
+            label_words = json.load(fp)
+        labels = sorted(list(label_words.keys()))
+
+        return template, labels, inputs

     def predict(self, input_data: list):
         encoded_inputs = self.preprocess(input_data)
@@ -155,14 +175,27 @@ def _infer(self, input_dict):
         infer_data = self.inference_backend.infer(input_dict)
         return infer_data

-    def infer_batch(self, encoded_inputs):
-        num_sample = len(encoded_inputs["input_ids"])
+    def infer_batch(self, inputs):
+        num_sample = len(inputs)
         infer_data = None
         num_infer_data = None
-        for idx in range(0, num_sample, self._batch_size):
-            l, r = idx, idx + self._batch_size
-            keys = encoded_inputs.keys()
-            input_dict = {k: encoded_inputs[k][l:r] for k in keys}
+        for index in range(0, num_sample, self.args.batch_size):
+            left, right = index, index + self.args.batch_size
+            batch_dict = self.collate_fn(inputs[left:right])
+            input_dict = {}
+            for key in self.input_handles:
+                value = batch_dict[key]
+                if key == "attention_mask":
+                    if value.ndim == 2:
+                        value = (1 - value[:, np.newaxis, np.newaxis, :]) * -1e4
+                    elif value.ndim != 4:
+                        raise ValueError(
+                            "Expect attention mask with ndim=2 or 4, but get ndim={}"
+                            .format(value.ndim))
+                    value = value.astype("float32")
+                else:
+                    value = value.astype("int64")
+                input_dict[key] = value
             results = self._infer(input_dict)
             if infer_data is None:
                 infer_data = [[x] for x in results]
@@ -175,16 +208,8 @@ def infer_batch(self, encoded_inputs):
         return infer_data
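
Annotation: the `attention_mask` branch above is the one subtlety in the new `infer_batch`; the collator returns a 2D padding mask, while the exported graph consumes an additive 4D mask. A standalone sketch of the same conversion:

```python
import numpy as np

# 2D padding mask: 1 = real token, 0 = padding (hypothetical batch of 2).
mask_2d = np.array([[1, 1, 1, 0],
                    [1, 1, 0, 0]])

# (batch, 1, 1, seq_len) additive mask: 0 keeps a position, -1e4 drives
# padded positions to near-zero attention weight after softmax.
mask_4d = ((1 - mask_2d[:, np.newaxis, np.newaxis, :]) * -1e4).astype("float32")
print(mask_4d.shape)  # (2, 1, 1, 4)
```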

     def preprocess(self, input_data: list):
-        text = [InputExample(text_a=x) for x in input_data]
-        inputs = [self._template.wrap_one_example(x) for x in text]
-        inputs = {
-            "input_ids":
-            np.array([x["input_ids"] for x in inputs], dtype="int64"),
-            "mask_ids":
-            np.array([x["mask_ids"] for x in inputs], dtype="int64"),
-            "soft_token_ids":
-            np.array([x["soft_token_ids"] for x in inputs], dtype="int64")
-        }
+        text = [{"text_a": x} for x in input_data]
+        inputs = [self.template(x) for x in text]
         return inputs

     @staticmethod
@@ -197,7 +222,7 @@ def postprocess(self, infer_data):
         label_ids = np.argwhere(probs > threshold)
         labels = [[] for _ in range(probs.shape[0])]
         for idx, label_id in label_ids:
-            labels[idx].append(self._label_list[label_id])
+            labels[idx].append(self.labels[label_id])
         return {"label": labels}

     def printer(self, result, input_data):
@@ -212,12 +237,10 @@ def printer(self, result, input_data):
     for arg_name, arg_value in vars(args).items():
         logger.info("{:20}: {}".format(arg_name, arg_value))

-    export_path = os.path.dirname(args.model_path_prefix)
-    labels, _ = Verbalizer.load_from(export_path)
+    predictor = HierachicalPredictor(args)

     text_dir = os.path.join(args.data_dir, "data.txt")
     with open(text_dir, "r", encoding="utf-8") as f:
         text_list = [x.strip() for x in f.readlines()]

-    predictor = HierachicalPredictor(args, labels)
     predictor.predict(text_list)
@@ -0,0 +1,5 @@
+psutil
+paddlepaddle>=2.4rc
+paddlenlp>=2.4.3
+paddle2onnx>=1.0.3
+onnxruntime
@@ -0,0 +1,7 @@
+psutil
+paddlepaddle-gpu>=2.4rc
+paddlenlp>=2.4.3
+paddle2onnx>=1.0.3
+onnxruntime-gpu
+onnx
+onnxconverter-common
44 changes: 29 additions & 15 deletions applications/text_classification/hierarchical/few-shot/train.py
@@ -15,6 +15,7 @@
 from dataclasses import dataclass, field
 import os
 import sys
+from collections import defaultdict

 import paddle
 import paddle.nn.functional as F
@@ -41,9 +42,6 @@
 class DataArguments:
     data_dir: str = field(default="./data", metadata={"help": "The dataset dictionary includes train.txt, dev.txt, test.txt, label.txt and data.txt (optional) files."})
     prompt: str = field(default=None, metadata={"help": "The input prompt for tuning."})
-    soft_encoder: str = field(default="lstm", metadata={"help": "The encoder type of soft template, `lstm`, `mlp` or None."})
-    encoder_hidden_size: int = field(default=200, metadata={"help": "The dimension of soft embeddings."})


 @dataclass
 class ModelArguments:
@@ -67,17 +65,20 @@ def main():
     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

     # Define the template for preprocess and the verbalizer for postprocess.
-    template = AutoTemplate.create_from(
-        data_args.prompt,
-        tokenizer,
-        training_args.max_seq_length,
-        model=model,
-        prompt_encoder=data_args.soft_encoder,
-        encoder_hidden_size=data_args.encoder_hidden_size)
-    logger.info("Using template: {}".format(template.template))
+    template = AutoTemplate.create_from(data_args.prompt,
+                                        tokenizer,
+                                        training_args.max_seq_length,
+                                        model=model)
+    logger.info("Using template: {}".format(template.prompt))

     label_file = os.path.join(data_args.data_dir, "label.txt")
-    verbalizer = SoftVerbalizer.from_file(tokenizer, model, label_file)
+    with open(label_file, "r", encoding="utf-8") as fp:
+        label_words = defaultdict(list)
+        for line in fp:
+            data = line.strip().split("==")
+            word = data[1] if len(data) > 1 else data[0].split("##")[-1]
+            label_words[data[0]].append(word)
+    verbalizer = SoftVerbalizer(label_words, tokenizer, model)
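
The `label.txt` parsing added above maps each hierarchical label to a label word: an explicit `label==word` pair wins, otherwise the last `##`-separated segment of the label is used. A sketch of the behavior on hypothetical lines:

```python
from collections import defaultdict

# Hypothetical label.txt lines: "label==word" or a bare hierarchical label.
lines = ["组织关系##裁员==裁员", "组织关系##加班"]

label_words = defaultdict(list)
for line in lines:
    data = line.strip().split("==")
    word = data[1] if len(data) > 1 else data[0].split("##")[-1]
    label_words[data[0]].append(word)

print(dict(label_words))
# {'组织关系##裁员': ['裁员'], '组织关系##加班': ['加班']}
```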

     # Load the few-shot datasets.
     train_ds, dev_ds, test_ds = load_local_dataset(
@@ -139,11 +140,24 @@ def compute_metrics(eval_preds):

     # Export static model.
     if training_args.do_export:
+        template = prompt_model.template
+        template_keywords = template.extract_template_keywords(template.prompt)
         input_spec = [
-            InputSpec(shape=[None, None], dtype="int64"),  # input_ids
-            InputSpec(shape=[None, None], dtype="int64"),  # mask_ids
-            InputSpec(shape=[None, None], dtype="int64"),  # soft_token_ids
+            InputSpec(shape=[None, None], dtype="int64"),  # input_ids
+            InputSpec(shape=[None, None], dtype="int64"),  # token_type_ids
+            InputSpec(shape=[None, None], dtype="int64"),  # position_ids
+            InputSpec(shape=[None, None, None, None],
+                      dtype="float32")  # attention_mask
         ]
+        if "mask" in template_keywords:
+            input_spec.append(InputSpec(shape=[None],
+                                        dtype="int64"))  # masked_positions
+        if "soft" in template_keywords:
+            input_spec.append(InputSpec(shape=[None, None],
+                                        dtype="int64"))  # soft_token_ids
+        if "encoder" in template_keywords:
+            input_spec.append(InputSpec(shape=[None, None],
+                                        dtype="int64"))  # encoder_ids
         export_path = os.path.join(training_args.output_dir, 'export')
         trainer.export_model(export_path,
                              input_spec=input_spec,
@@ -15,7 +15,6 @@
 import os

 from paddlenlp.datasets import load_dataset
-from paddlenlp.prompt import InputExample


 def load_local_dataset(data_path, splits, label_list):
@@ -39,14 +38,14 @@ def _reader(data_file, label_list):
             for idx, line in enumerate(fp):
                 data = line.strip().split("\t")
                 if len(data) == 1:
-                    yield InputExample(text_a=data[0])
+                    yield {"text_a": data[0]}
                 else:
                     text, label = data
                     label = label.strip().split(",")
                     label = [
                         float(1) if x in label else float(0) for x in label_list
                     ]
-                    yield InputExample(text_a=text, labels=label)
+                    yield {"text_a": text, "labels": label}

     split_map = {"train": "train.txt", "dev": "dev.txt", "test": "test.txt"}
     datasets = []
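
Given the multi-hot construction in `_reader` above, a hypothetical tab-separated training line maps to a dict like this:

```python
# Hypothetical: three known labels and one "text<TAB>labels" training line.
label_list = ["A", "A##B", "C"]
line = "some text\tA,A##B"

text, label = line.strip().split("\t")
label = label.strip().split(",")
# One float per entry in label_list: 1.0 if that label applies, else 0.0.
label = [float(1) if x in label else float(0) for x in label_list]
print({"text_a": text, "labels": label})
# {'text_a': 'some text', 'labels': [1.0, 1.0, 0.0]}
```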
