-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_dataset.py
104 lines (87 loc) · 4.08 KB
/
build_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import logging
import os
from dataclasses import dataclass
from typing import Dict, Sequence, Union, List
import datasets
import torch
from datasets import load_dataset, concatenate_datasets
import transformers
IGNORE_INDEX = -100
logger = logging.getLogger('__name__')
PROMPT_TEMPLATE = (
"[INST] <<SYS>>\n"
"You are a helpful assistant. 你是一个乐于助人的助手。\n"
"<</SYS>>\n\n{instruction} [/INST]"
)
def build_instruction_dataset(data_path: Union[List[str],str],
tokenizer: transformers.PreTrainedTokenizer,
max_seq_length: int, data_cache_dir = None,
preprocessing_num_workers = None,
):
def tokenization(examples):
sources = []
targets = []
prompt = PROMPT_TEMPLATE
for instruction, input, output in zip(examples['instruction'],examples['input'],examples['output']):
if input is not None and input !="":
instruction = instruction+'\n'+input
source = prompt.format_map({'instruction':instruction})
target = f"{output}{tokenizer.eos_token}"
sources.append(source)
targets.append(target)
tokenized_sources = tokenizer(sources,return_attention_mask=False)
tokenized_targets = tokenizer(targets,return_attention_mask=False,add_special_tokens=False)
all_input_ids = []
all_labels = []
for s,t in zip(tokenized_sources['input_ids'],tokenized_targets['input_ids']):
input_ids = torch.LongTensor(s + t)[:max_seq_length]
labels = torch.LongTensor([IGNORE_INDEX] * len(s) + t)[:max_seq_length]
assert len(input_ids) == len(labels)
all_input_ids.append(input_ids)
all_labels.append(labels)
results = {'input_ids':all_input_ids, 'labels': all_labels}
return results
logging.warning("building dataset...")
all_datasets = []
if not isinstance(data_path,(list,tuple)):
data_path = [data_path]
for file in data_path:
if data_cache_dir is None:
data_cache_dir = str(os.path.dirname(file)) # 在数据相同目录下生成cache目录
cache_path = os.path.join(data_cache_dir,os.path.basename(file).split('.')[0]+f"_{max_seq_length}") # e.g. data_fin/json/insft
os.makedirs(cache_path, exist_ok=True)
try:
processed_dataset = datasets.load_from_disk(cache_path)
logger.info(f'training datasets-{file} has been loaded from disk')
except Exception:
raw_dataset = load_dataset("json", data_files=file, cache_dir=cache_path)
tokenization_func = tokenization
tokenized_dataset = raw_dataset.map(
tokenization_func,
batched=True,
num_proc=preprocessing_num_workers,
remove_columns=["instruction","input","output"],
keep_in_memory=False,
desc="preprocessing on dataset",
)
processed_dataset = tokenized_dataset
processed_dataset.save_to_disk(cache_path)
processed_dataset.set_format('torch')
all_datasets.append(processed_dataset['train'])
all_datasets = concatenate_datasets(all_datasets)
return all_datasets
@dataclass
class DataCollatorForSupervisedDataset(object):
"""Collate examples for supervised fine-tuning."""
tokenizer: transformers.PreTrainedTokenizer
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id # input_ids要求是list of tenosr。在rocessed_dataset.set_format('torch')这里已经转了
)
labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
return dict(
input_ids=input_ids,
labels=labels,
attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
)