run.py
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser
from helpers import prepare_dataset_nli, prepare_train_dataset_qa, \
    prepare_validation_dataset_qa, QuestionAnsweringTrainer, compute_accuracy
import os
import json

NUM_PREPROCESSING_WORKERS = 2

def main():
    argp = HfArgumentParser(TrainingArguments)
    # The HfArgumentParser object collects command-line arguments into an object (and provides default values for unspecified arguments).
    # In particular, TrainingArguments has several keys that you'll need/want to specify (when you call run.py from the command line):
    # --do_train
    #     When included, this argument tells the script to train a model.
    #     See docstrings for "--task" and "--dataset" for how the training dataset is selected.
    # --do_eval
    #     When included, this argument tells the script to evaluate the trained/loaded model on the validation split of the selected dataset.
    # --per_device_train_batch_size <int, default=8>
    #     This is the training batch size.
    #     If you're running on GPU, you should try to make this as large as you can without getting CUDA out-of-memory errors.
    #     For reference, with --max_length=128 and the default ELECTRA-small model, a batch size of 32 should fit in 4GB of GPU memory.
    # --num_train_epochs <float, default=3.0>
    #     How many passes to do through the training data.
    # --output_dir <path>
    #     Where to put the trained model checkpoint(s) and any eval predictions.
    #     *This argument is required*.
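    #
    # For reference, a typical invocation might look something like the following (an illustrative
    # sketch only; adjust the flags and paths to your own setup):
    #     python run.py --do_train --do_eval --task nli --output_dir ./trained_model/ \
    #         --per_device_train_batch_size 32 --num_train_epochs 3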
    argp.add_argument('--model', type=str,
                      default='google/electra-small-discriminator',
                      help="""This argument specifies the base model to fine-tune.
        This should either be a HuggingFace model ID (see https://huggingface.co/models)
        or a path to a saved model checkpoint (a folder containing config.json and pytorch_model.bin).""")
    argp.add_argument('--task', type=str, choices=['nli', 'qa'], required=True,
                      help="""This argument specifies which task to train/evaluate on.
        Pass "nli" for natural language inference or "qa" for question answering.
        By default, "nli" will use the SNLI dataset, and "qa" will use the SQuAD dataset.""")
    argp.add_argument('--dataset', type=str, default=None,
                      help="""This argument overrides the default dataset used for the specified task.""")
    argp.add_argument('--max_length', type=int, default=128,
                      help="""This argument limits the maximum sequence length used during training/evaluation.
        Shorter sequence lengths need less memory and computation time, but some examples may end up getting truncated.""")
    argp.add_argument('--max_train_samples', type=int, default=None,
                      help='Limit the number of examples to train on.')
    argp.add_argument('--max_eval_samples', type=int, default=None,
                      help='Limit the number of examples to evaluate on.')

    training_args, args = argp.parse_args_into_dataclasses()

    # Dataset selection
    # Guard against args.dataset being None (the default) before checking for a local file extension.
    if args.dataset is not None and (args.dataset.endswith('.json') or args.dataset.endswith('.jsonl')):
        dataset_id = None
        # Load from local json/jsonl file
        dataset = datasets.load_dataset('json', data_files=args.dataset)
        # By default, the "json" dataset loader places all examples in the train split,
        # so if we want to use a jsonl file for evaluation we need to get the "train" split
        # from the loaded dataset
        eval_split = 'train'
    else:
        default_datasets = {'qa': ('squad',), 'nli': ('snli',)}
        dataset_id = tuple(args.dataset.split(':')) if args.dataset is not None else \
            default_datasets[args.task]
        # MNLI has two validation splits (one with matched domains and one with mismatched domains).
        # Most datasets just have one "validation" split.
        eval_split = 'validation_matched' if dataset_id == ('glue', 'mnli') else 'validation'
        # Load the raw data
        dataset = datasets.load_dataset(*dataset_id)
    # NLI models need to have the output label count specified (label 0 is "entailed", 1 is "neutral", and 2 is "contradiction")
    task_kwargs = {'num_labels': 3} if args.task == 'nli' else {}

    # Here we select the right model fine-tuning head
    model_classes = {'qa': AutoModelForQuestionAnswering,
                     'nli': AutoModelForSequenceClassification}
    model_class = model_classes[args.task]
    # Initialize the model and tokenizer from the specified pretrained model/checkpoint
    model = model_class.from_pretrained(args.model, **task_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=True)

    # Select the dataset preprocessing function (these functions are defined in helpers.py)
    if args.task == 'qa':
        prepare_train_dataset = lambda exs: prepare_train_dataset_qa(exs, tokenizer)
        prepare_eval_dataset = lambda exs: prepare_validation_dataset_qa(exs, tokenizer)
    elif args.task == 'nli':
        prepare_train_dataset = prepare_eval_dataset = \
            lambda exs: prepare_dataset_nli(exs, tokenizer, args.max_length)
        # prepare_eval_dataset = prepare_dataset_nli
    else:
        raise ValueError('Unrecognized task name: {}'.format(args.task))

    print("Preprocessing data... (this takes a little bit, should only happen once per dataset)")
    if dataset_id == ('snli',):
        # remove SNLI examples with no label
        dataset = dataset.filter(lambda ex: ex['label'] != -1)

    train_dataset = None
    eval_dataset = None
    train_dataset_featurized = None
    eval_dataset_featurized = None
    if training_args.do_train:
        train_dataset = dataset['train']
        if args.max_train_samples:
            train_dataset = train_dataset.select(range(args.max_train_samples))
        train_dataset_featurized = train_dataset.map(
            prepare_train_dataset,
            batched=True,
            num_proc=NUM_PREPROCESSING_WORKERS,
            remove_columns=train_dataset.column_names
        )
    if training_args.do_eval:
        eval_dataset = dataset[eval_split]
        if args.max_eval_samples:
            eval_dataset = eval_dataset.select(range(args.max_eval_samples))
        eval_dataset_featurized = eval_dataset.map(
            prepare_eval_dataset,
            batched=True,
            num_proc=NUM_PREPROCESSING_WORKERS,
            remove_columns=eval_dataset.column_names
        )

    # Select the training configuration
    trainer_class = Trainer
    eval_kwargs = {}
    # If you want to use custom metrics, you should define your own "compute_metrics" function.
    # For an example of a valid compute_metrics function, see compute_accuracy in helpers.py.
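    # As a rough sketch (hypothetical, not used by this script), a compute_metrics function receives
    # an EvalPrediction (with .predictions and .label_ids) and returns a dict of named metric values:
    #     def compute_my_accuracy(eval_preds):
    #         preds = eval_preds.predictions.argmax(-1)
    #         return {'my_accuracy': float((preds == eval_preds.label_ids).mean())}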
    compute_metrics = None
    if args.task == 'qa':
        # For QA, we need to use a tweaked version of the Trainer (defined in helpers.py)
        # to enable the question-answering specific evaluation metrics
        trainer_class = QuestionAnsweringTrainer
        eval_kwargs['eval_examples'] = eval_dataset
        metric = datasets.load_metric('squad')
        compute_metrics = lambda eval_preds: metric.compute(
            predictions=eval_preds.predictions, references=eval_preds.label_ids)
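        # (For reference, the "squad" metric reports exact-match and F1 scores computed over the
        # predicted answer strings.)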
    elif args.task == 'nli':
        compute_metrics = compute_accuracy

    # This function wraps the compute_metrics function, storing the model's predictions
    # so that they can be dumped along with the computed metrics
    eval_predictions = None
    def compute_metrics_and_store_predictions(eval_preds):
        nonlocal eval_predictions
        eval_predictions = eval_preds
        return compute_metrics(eval_preds)

    # Initialize the Trainer object with the specified arguments and the model and dataset we loaded above
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=train_dataset_featurized,
        eval_dataset=eval_dataset_featurized,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_and_store_predictions
    )

    # Train and/or evaluate
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        # If you want to customize the way the loss is computed, you should subclass Trainer and override the "compute_loss"
        # method (see https://huggingface.co/transformers/_modules/transformers/trainer.html#Trainer.compute_loss).
        #
        # You can also add training hooks using Trainer.add_callback:
        # See https://huggingface.co/transformers/main_classes/trainer.html#transformers.Trainer.add_callback
        # and https://huggingface.co/transformers/main_classes/callback.html#transformers.TrainerCallback
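        #
        # A minimal sketch of both hooks (hypothetical, not used by this script):
        #     class LossTweakedTrainer(Trainer):
        #         def compute_loss(self, model, inputs, return_outputs=False):
        #             # start from the default loss, then reweight or modify it as needed
        #             return super().compute_loss(model, inputs, return_outputs=return_outputs)
        #
        #     trainer.add_callback(MyCallback())  # where MyCallback subclasses transformers.TrainerCallback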
    if training_args.do_eval:
        results = trainer.evaluate(**eval_kwargs)

        # To add custom metrics, you should replace the "compute_metrics" function (see comments above).
        #
        # If you want to change how predictions are computed, you should subclass Trainer and override the "prediction_step"
        # method (see https://huggingface.co/transformers/_modules/transformers/trainer.html#Trainer.prediction_step).
        # If you do this, your custom prediction_step should probably start by calling super().prediction_step and modifying
        # the values that it returns.
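        #
        # A minimal sketch of such an override (hypothetical, not used by this script):
        #     class CustomPredictionTrainer(Trainer):
        #         def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        #             loss, logits, labels = super().prediction_step(
        #                 model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
        #             # ...modify logits/labels here before returning them...
        #             return loss, logits, labels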
        print('Evaluation results:')
        print(results)

        os.makedirs(training_args.output_dir, exist_ok=True)

        with open(os.path.join(training_args.output_dir, 'eval_metrics.json'), encoding='utf-8', mode='w') as f:
            json.dump(results, f)

        with open(os.path.join(training_args.output_dir, 'eval_predictions.jsonl'), encoding='utf-8', mode='w') as f:
            if args.task == 'qa':
                predictions_by_id = {pred['id']: pred['prediction_text'] for pred in eval_predictions.predictions}
                for example in eval_dataset:
                    example_with_prediction = dict(example)
                    example_with_prediction['predicted_answer'] = predictions_by_id[example['id']]
                    f.write(json.dumps(example_with_prediction))
                    f.write('\n')
            else:
                for i, example in enumerate(eval_dataset):
                    example_with_prediction = dict(example)
                    example_with_prediction['predicted_scores'] = eval_predictions.predictions[i].tolist()
                    example_with_prediction['predicted_label'] = int(eval_predictions.predictions[i].argmax())
                    f.write(json.dumps(example_with_prediction))
                    f.write('\n')


if __name__ == "__main__":
    main()