import numpy as np
'''
This script contains three sets of evaluation functions:
(a) post-level eval scores, calculated by merging the actual/predicted labels across all timelines:
    > precision = get_precision_score(actual, predicted, LABEL)
    > recall = get_recall_score(actual, predicted, LABEL)
    > f1_score = get_f1_score(actual, predicted, LABEL)
(b) timeline-level, window-based precision and recall scores (run on each timeline independently!):
    > precision_w, recall_w = get_timeline_level_precision_recall(actual, predicted, WINDOW, LABEL)
(c) timeline-level, coverage-based precision/recall scores (run on each timeline independently!):
    > cov_recall = get_coverage_recall(actual, predicted, LABEL)
    > cov_precision = get_coverage_precision(actual, predicted, LABEL)
'''
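# Illustrative sketch of the expected inputs (the label strings '0'/'IS'/'IE' are
# assumed from the docstrings; the values themselves are made up for demonstration):
#   actual    = ['0', '0', 'IS', 'IS', '0', 'IE']
#   predicted = ['0', 'IS', 'IS', '0', '0', 'IE']
# For (a) the lists may be all timelines concatenated; for (b) and (c) each call
# must receive the ordered posts of a single timeline.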
'''(a) The post-level evaluation scripts'''
def get_precision_score(actual, predicted, cls='IS'):
    """
    Returns the overall (across all timelines) precision score for a particular label.

    Parameters
    ----------
    actual: list
        list of actual labels in a dev/test set (one per post)
    predicted: list
        list of predicted labels in a dev/test set (one per post)
    cls:
        the label we are after (0/IS/IE)

    Returns
    -------
    prec: float
        the final precision score for the specified label
    """
    assert len(actual)==len(predicted)
    actual, predicted = list(actual), list(predicted)
    if predicted.count(cls)==0:
        print('Made no predictions for label', cls, '(precision undefined - returning nan).')
        return np.nan
    ac_idx = set(np.where(np.array(actual)==cls)[0])
    pr_idx = set(np.where(np.array(predicted)==cls)[0])
    prec = len(ac_idx.intersection(pr_idx))/len(pr_idx)
    return prec
def get_recall_score(actual, predicted, cls='IS'):
    """
    Returns the overall (across all timelines) recall score for a particular label.

    Parameters
    ----------
    actual: list
        list of actual labels in a dev/test set (one per post)
    predicted: list
        list of predicted labels in a dev/test set (one per post)
    cls:
        the label we are after (0/IS/IE)

    Returns
    -------
    rec: float
        the final recall score for the specified label
    """
    actual, predicted = list(actual), list(predicted)
    assert len(actual)==len(predicted)
    if actual.count(cls)==0:
        print('Have no examples for label', cls, '(recall undefined - returning nan).')
        return np.nan
    ac_idx = set(np.where(np.array(actual)==cls)[0])
    pr_idx = set(np.where(np.array(predicted)==cls)[0])
    rec = len(ac_idx.intersection(pr_idx))/len(ac_idx)
    return rec
def get_f1_score(actual, predicted, cls='IS'):
    """
    Returns the overall (across all timelines) F1 score for a particular label.

    Parameters
    ----------
    actual: list
        list of actual labels in a dev/test set (one per post)
    predicted: list
        list of predicted labels in a dev/test set (one per post)
    cls:
        the label we are after (0/IS/IE)

    Returns
    -------
    f1_sco: float
        the final F1 score for the specified label
    """
    recall = get_recall_score(actual, predicted, cls)
    precision = get_precision_score(actual, predicted, cls)
    if (recall + precision) == 0:  # avoid division by zero when both scores are 0
        return 0.0
    f1_sco = 2.0*(recall*precision)/(recall+precision)
    return f1_sco
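# Illustrative sketch of the post-level metrics on merged labels (the lists below
# are assumed for demonstration only):
#   actual    = ['0', 'IS', 'IS', '0', 'IE', '0']
#   predicted = ['0', 'IS', '0', '0', 'IE', 'IE']
#   get_precision_score(actual, predicted, 'IS')  # -> 1.0  (1 of 1 predicted 'IS' is correct)
#   get_recall_score(actual, predicted, 'IS')     # -> 0.5  (1 of 2 actual 'IS' recovered)
#   get_f1_score(actual, predicted, 'IS')         # -> ~0.67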
'''(b) The window-based, timeline-level Precision and Recall script'''
def get_timeline_level_precision_recall(actual, predicted, window=1, cls='IS'):
    """
    Given the lists of (ORDERED!) predicted and actual labels of a SINGLE timeline,
    the label to calculate the metrics for and the window to use (allowing
    +-window predictions to be considered as accurate), it returns:
    (a) the precision using that window for the specified label
    (b) the recall -//-

    Parameters
    ----------
    actual: list
        list of actual labels (ORDERED) in a single timeline (one per post)
    predicted: list
        list of predicted labels (ORDERED) in a single timeline (one per post)
    window: int
        the window size to consider (you can play around with increasing/decreasing)
    cls:
        the label we are after (0/IS/IE)

    Returns
    -------
    precision: float
        the final window-based precision score for the specified label in this timeline
    recall: float
        the final window-based recall score for the specified label in this timeline
    """
    assert len(actual)==len(predicted)
    if (len(actual)>125) or (len(actual)<10):
        print('This function should be run at the timeline level (i.e., not by merging all actual/predicted labels together)!')
    # Find the indices of the specified predicted and actual label
    pr_idx = np.where(np.array(predicted)==cls)[0]
    ac_idx = np.where(np.array(actual)==cls)[0]
    if len(ac_idx)==0:  # cannot divide by zero (recall is undefined)
        recall, precision = np.nan, np.nan
        if len(pr_idx)>0:  # every prediction is a false positive
            precision = 0.0
    elif len(pr_idx)==0:  # cannot divide by zero (precision is undefined, but recall is 0)
        precision = np.nan
        recall = 0.0
    else:
        # Greedily match each actual index to at most one unused prediction within +-window
        already_used = []
        for l in ac_idx:
            for p in pr_idx:
                if (np.abs(l-p)<=window) and (p not in already_used):
                    already_used.append(p)
                    break
        precision = len(set(already_used))/len(pr_idx)
        recall = len(set(already_used))/len(ac_idx)
    return precision, recall
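# Illustrative sketch for a single (short) timeline; the indices and window values
# below are assumed for demonstration only:
#   actual    = ['0']*3 + ['IS'] + ['0']*8   # actual 'IS' at index 3
#   predicted = ['0']*4 + ['IS'] + ['0']*7   # predicted 'IS' at index 4
#   get_timeline_level_precision_recall(actual, predicted, window=1, cls='IS')  # -> (1.0, 1.0)
#   get_timeline_level_precision_recall(actual, predicted, window=0, cls='IS')  # -> (0.0, 0.0)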
'''(c) The coverage scripts'''
def get_coverage_recall(actual, predicted, cls='IS'):
    """
    Given the lists of (ORDERED!) predicted and actual labels of a SINGLE timeline
    and the label to calculate the metric for, it returns the recall-oriented
    coverage for that particular timeline.

    Parameters
    ----------
    actual: list
        list of actual labels (ORDERED) in a single timeline (one per post)
    predicted: list
        list of predicted labels (ORDERED) in a single timeline (one per post)
    cls:
        the label we are after (0/IS/IE)

    Returns
    -------
    coverage_recall: float
        the final coverage-based recall score for the specified label in this timeline
    """
    assert len(actual)==len(predicted)
    if (len(actual)>125) or (len(actual)<10):
        print('This function should be run at the timeline level (i.e., not by merging all actual/predicted labels together)!')
    preds_regions, _ = extract_regions(predicted, cls)
    actual_regions, _ = extract_regions(actual, cls)
    total_sum, denom = 0.0, 0.0  # accumulated across the timeline
    coverage_recall = np.nan
    if len(actual_regions)>0:
        for region in actual_regions:  # for each actual region within the timeline
            ac = set(region)
            orrs = []  # overlap ratios, calculated per region
            max_cov_for_region = 0.0
            # Find the maximum overlap ratio (ORR) for this actual region:
            if len(preds_regions)>0:
                for predicted_region in preds_regions:
                    pr = set(predicted_region)
                    orrs.append(len(ac.intersection(pr))/len(ac.union(pr)))  # intersection over union
                max_cov_for_region = np.max(orrs)
            # Weight the best overlap by the length of the region
            total_sum = total_sum + (len(ac)*max_cov_for_region)
            denom += len(ac)
        coverage_recall = total_sum/denom
    return coverage_recall
def get_coverage_precision(actual, predicted, cls='IS'):
    """
    Given the lists of (ORDERED!) predicted and actual labels of a SINGLE timeline
    and the label to calculate the metric for, it returns the precision-oriented
    coverage for that particular timeline.

    Parameters
    ----------
    actual: list
        list of actual labels (ORDERED) in a single timeline (one per post)
    predicted: list
        list of predicted labels (ORDERED) in a single timeline (one per post)
    cls:
        the label we are after (0/IS/IE)

    Returns
    -------
    coverage_precision: float
        the final coverage-based precision score for the specified label in this timeline
    """
    assert len(actual)==len(predicted)
    if (len(actual)>125) or (len(actual)<10):
        print('This function should be run at the timeline level (i.e., not by merging all actual/predicted labels together)!')
    actual_regions, _ = extract_regions(actual, cls)
    preds_regions, _ = extract_regions(predicted, cls)
    total_sum, denom = 0.0, 0.0  # accumulated across the timeline
    coverage_precision = np.nan
    if len(preds_regions)>0:
        for region in preds_regions:  # for each predicted region within the timeline
            pr = set(region)
            orrs = []  # overlap ratios, calculated per region
            max_cov_for_region = 0.0
            # Find the maximum overlap ratio (ORR) for this predicted region:
            if len(actual_regions)>0:
                for actual_region in actual_regions:
                    ac = set(actual_region)
                    orrs.append(len(pr.intersection(ac))/len(pr.union(ac)))  # intersection over union
                max_cov_for_region = np.max(orrs)
            # Weight the best overlap by the length of the region
            total_sum = total_sum + (len(pr)*max_cov_for_region)
            denom += len(pr)
        coverage_precision = total_sum/denom
    return coverage_precision
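# Illustrative sketch of the coverage metrics on a single timeline (the lists below
# are assumed for demonstration only):
#   actual    = ['0']*4 + ['IS']*4 + ['0']*4   # one actual region: indices 4-7
#   predicted = ['0']*4 + ['IS']*2 + ['0']*6   # one predicted region: indices 4-5
#   get_coverage_recall(actual, predicted, 'IS')     # -> 0.5 (IoU of the two regions)
#   get_coverage_precision(actual, predicted, 'IS')  # -> 0.5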
'''Helper functions for the coverage-based metrics'''
def extract_regions(vals, cls):
    # Convert labels to boolean based on the class we are looking for:
    vals = np.array(vals)
    vals_boolean = np.zeros(len(vals))
    vals_boolean[vals==cls] = 1
    # Find the indices of the positive and negative cases:
    indices = np.where(vals_boolean==1)[0]
    neg_indices = np.where(vals_boolean==0)[0]
    regions = get_regions(indices)
    regions_neg = get_regions(neg_indices)
    return regions, regions_neg
def get_regions(indices):
    # Group consecutive indices into contiguous regions (lists of indices)
    regions = []
    if len(indices)>0:
        current_set = [indices[0]]
        for i in range(1, len(indices)):
            if indices[i]-indices[i-1]==1:  # if they are consecutive
                current_set.append(indices[i])
            else:
                regions.append(list(current_set))
                current_set = [indices[i]]
        regions.append(current_set)
    return regions
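if __name__ == '__main__':
    # Minimal sanity-check sketch (the timeline and label values below are assumed
    # for illustration, not taken from any dataset).
    actual    = ['0']*4 + ['IS']*3 + ['0']*3 + ['IE'] + ['0']
    predicted = ['0']*5 + ['IS']*2 + ['0']*3 + ['IE'] + ['0']
    print('Post-level P/R/F1 (IS):',
          get_precision_score(actual, predicted, 'IS'),
          get_recall_score(actual, predicted, 'IS'),
          get_f1_score(actual, predicted, 'IS'))
    print('Window-based P/R (IS):',
          get_timeline_level_precision_recall(actual, predicted, window=1, cls='IS'))
    print('Coverage R/P (IS):',
          get_coverage_recall(actual, predicted, 'IS'),
          get_coverage_precision(actual, predicted, 'IS'))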