-
Notifications
You must be signed in to change notification settings - Fork 0
/
cohan_app.py
165 lines (148 loc) · 7.55 KB
/
cohan_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from os import listdir
import pandas as pd
import numpy as np
import random, torch, cohan_models, transformers, time, data_manager
from datetime import datetime
def evaluate_model(train_params):
    """
    Trains and evaluates a model once per random seed, averages the scores by
    epoch and saves a text report through save_report().

    Arguments:
        train_params : A dictionary with the experiment settings. Keys read here:
                       'model_reference', 'chunk_layout' ('Cohan' or 'VanBerg'),
                       'dataset', 'encoder_id', 'max_seq_len', 'max_sent_len',
                       'max_sent_per_block', 'n_iterations', 'window_len' (only
                       for the 'VanBerg' layout) and, optionally, 'n_documents'.
                       The keys 'time_tag', 'n_classes' and 'sep_token_id' are
                       written into the dictionary by this function.
    Raises:
        ValueError : if train_params['chunk_layout'] is not a supported layout.
    """
    # time tag
    model_reference = train_params['model_reference']
    chunk_layout = train_params['chunk_layout']
    time_tag = f'{chunk_layout}_{model_reference}_{datetime.now().strftime("%Y-%m-%d-%Hh%Mm%Ss")}'
    train_params['time_tag'] = time_tag

    # loading dataset
    dataset_name = train_params['dataset']
    data_loader = data_manager.get_data_manager(dataset_name)

    # setting labels
    labels_to_idx = data_loader.get_labels_to_idx()
    labels = data_loader.get_valid_labels(labels_to_idx)
    train_params['n_classes'] = len(labels)

    # tokenizer
    encoder_id = train_params['encoder_id']
    tokenizer = transformers.AutoTokenizer.from_pretrained(encoder_id)
    train_params['sep_token_id'] = tokenizer.sep_token_id

    # loading data (the middle element returned by get_data() is not used here)
    dic_docs_train, _, dic_docs_test = data_loader.get_data()

    # dataset objects
    max_seq_len = train_params['max_seq_len']
    max_sent_len = train_params['max_sent_len']
    max_sent_per_block = train_params['max_sent_per_block']
    n_documents = train_params.get('n_documents')
    if n_documents is not None:  # used in tests to speed up the train procedure
        # keys are sorted so that the same documents are kept on every run
        dic_docs_train = {k: dic_docs_train[k] for k in sorted(dic_docs_train)[:n_documents]}
        dic_docs_test = {k: dic_docs_test[k] for k in sorted(dic_docs_test)[:n_documents]}
    if chunk_layout == 'Cohan':
        ds_train = cohan_models.CohanDataset(dic_docs_train, labels_to_idx, tokenizer, max_sent_len, max_sent_per_block, max_seq_len)
        ds_test = cohan_models.CohanDataset(dic_docs_test, labels_to_idx, tokenizer, max_sent_len, max_sent_per_block, max_seq_len)
    elif chunk_layout == 'VanBerg':
        window_len = train_params['window_len']
        ds_train = cohan_models.OverlapedChunksDataset(dic_docs_train, tokenizer, max_sent_len, max_sent_per_block, max_seq_len, window_len, labels_to_idx)
        ds_test = cohan_models.OverlapedChunksDataset(dic_docs_test, tokenizer, max_sent_len, max_sent_per_block, max_seq_len, window_len, labels_to_idx)
    else:
        # single formatted message (two positional args would render as a tuple)
        raise ValueError(f'Not supported chunk layout: {chunk_layout}')

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    raw_metrics = {}  # key: epoch, value: numpy tensor of shape (n_iterations, 5)
    confusion_matrices = {}  # key: iteration_id, value: dictionary (key: epoch, value: confusion matrix)
    cv_start = time.perf_counter()
    seeds = [(42 + i * 10) for i in range(train_params['n_iterations'])]
    for i, seed_val in enumerate(seeds):
        print(f'==== Started iteration {i + 1} ====')
        # seeding every random number generator in use for this iteration
        random.seed(seed_val)
        np.random.seed(seed_val)
        torch.manual_seed(seed_val)
        torch.cuda.manual_seed_all(seed_val)
        # train
        iteration_metrics, cm, train_time = cohan_models.fit(train_params, ds_train, ds_test, labels_to_idx, device)
        confusion_matrices[i] = cm
        # stacking this iteration's scores as a new row of each epoch's matrix
        for epoch, scores in iteration_metrics.items():
            epoch_metrics = raw_metrics.get(epoch)
            if epoch_metrics is None:
                raw_metrics[epoch] = scores.reshape(1, -1)
            else:
                raw_metrics[epoch] = np.vstack((epoch_metrics, scores))
        print(' Iteration time: ', train_time)

    metrics = _average_metrics(raw_metrics)
    cv_end = time.perf_counter()
    cv_time = _format_elapsed(cv_end - cv_start)
    print('End of evaluation. Total time:', cv_time)

    save_report(
        metrics, raw_metrics, labels,
        confusion_matrices, f'test set ({len(seeds)} random seeds)',
        train_params, cv_time, device, time_tag
    )


def _average_metrics(raw_metrics):
    """Builds the DataFrame of per-epoch mean and standard deviation over iterations."""
    columns = [
        'Epoch', 'Train loss', 'std', 'Test loss', 'std',
        'P (macro)', 'P std', 'R (macro)', 'R std', 'F1 (macro)', 'F1 std'
    ]
    rows = []
    for epoch, scores in raw_metrics.items():
        mean = np.mean(scores, axis=0)
        std = np.std(scores, axis=0)
        rows.append([
            f'{epoch}',
            f'{mean[0]:.6f}', f'{std[0]:.6f}',  # train loss
            f'{mean[1]:.6f}', f'{std[1]:.6f}',  # test loss
            f'{mean[2]:.4f}', f'{std[2]:.4f}',  # precision (macro)
            f'{mean[3]:.4f}', f'{std[3]:.4f}',  # recall (macro)
            f'{mean[4]:.4f}', f'{std[4]:.4f}',  # f1 (macro)
        ])
    # built in one shot: avoids growing the frame row by row with .loc
    return pd.DataFrame(rows, columns=columns)


def _format_elapsed(seconds):
    """Formats a duration in seconds as 'HHhMMmSSs' without wrapping at 24 hours."""
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{hours:02d}h{minutes:02d}m{secs:02d}s'
def save_report(
        avg_metrics, complete_metrics, labels,
        confusion_matrices, evaluation, train_params, train_time, device, time_tag):
    """
    Writes a text report of the experiment to
    ./reports/<dataset>/rep-<time_tag>.txt, creating the directory if needed.

    Arguments:
        avg_metrics : A pandas Dataframe with the averaged metrics.
        complete_metrics : A dictionary with the metrics by epoch. The key indicates the epoch.
                           Each value must be a numpy tensor of shape (n_iterations, 5).
        labels : list of all labels.
        confusion_matrices : A dictionary => key: iteration_id, value: dictionary (key: epoch, value: confusion matrix)
        evaluation : the kind of evaluation (string). Cross-validation or Holdout.
        train_params : A dictionary. The optional key 'use_mock' defaults to False;
                       'window_len' is read only for the 'VanBerg' chunk layout.
        train_time : total time spent on training/evaluation (string).
        device : ID of GPU device
        time_tag : time tag to be appended to report file name.
    """
    import os  # local import: the module header only imports os.listdir

    model_reference = train_params['model_reference']
    dataset_name = train_params["dataset"]
    # 'use_mock' may be absent from train_params: treat it as a real encoder
    if train_params.get("use_mock", False):
        encoder = "MOCK MODEL"
    else:
        encoder = train_params["encoder_id"]

    # the report is collected as a list of pieces and joined once at the end
    parts = [
        'RESULTS REPORT - Cohan\n',
        f'Model: {model_reference}\n',
        f'Encoder: {encoder}\n',
        f'Dataset: {dataset_name}\n',
        f'Chunk layout: {train_params["chunk_layout"]}\n',
        f'Evaluation: {evaluation}\n',
        f'Max sequence length: {train_params["max_seq_len"]}\n',
        f'Max sentence length: {train_params["max_sent_len"]}\n',
        f'Max sentences per chunk: {train_params["max_sent_per_block"]}\n',
    ]
    if train_params["chunk_layout"] == "VanBerg":
        parts.append(f'Window length: {train_params["window_len"]}\n')
    parts += [
        f'Batch size: {train_params["batch_size"]}\n',
        f'Dropout rate: {train_params["dropout_rate"]}\n',
        f'Learning rate: {train_params["learning_rate"]}\n',
        f'Adam Epsilon: {train_params["eps"]}\n',
        f'Weight decay: {train_params["weight_decay"]}\n',
        f'Train time: {train_time}\n',
    ]
    if torch.cuda.is_available():
        parts.append(f'GPU name: {torch.cuda.get_device_name(device)}\n')
        memory_in_bytes = torch.cuda.get_device_properties(device).total_memory
        memory_in_gb = round((memory_in_bytes/1024)/1024/1024, 2)
        parts.append(f'GPU memory: {memory_in_gb}\n')

    # averaged metrics
    parts.append('\nAverages:\n')
    parts.append(avg_metrics.to_string(index=False, justify='center'))

    # confusion matrices, preceded by the legend label => index
    parts.append('\n\n*** Detailed report ***\n')
    parts.append(f'\nConfusion matrices\n{"-"*18}\n')
    parts.extend(f'{label}: {i} \n' for i, label in enumerate(labels))
    for iteration_id, cm_dic in confusion_matrices.items():
        parts.append(f'=> Iteration {iteration_id}:\n')
        parts.extend(f'Epoch {e}:\n{cm}\n' for e, cm in cm_dic.items())

    # scores of each iteration, one table per epoch
    parts.append(f'\nScores\n{"-"*6}\n')
    for epoch, scores in complete_metrics.items():
        df = pd.DataFrame(
            scores,
            columns=['Train loss', 'Test loss', 'P (macro)', 'R (macro)', 'F1 (macro)'],
            index=[f'Iteration {i}' for i in range(scores.shape[0])])
        parts.append(f'Epoch: {epoch}\n' + df.to_string() + '\n\n')

    report_path = f'./reports/{dataset_name}/rep-{time_tag}.txt'
    # a missing directory must not discard the results of a whole training run
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))