forked from Noahs-ARK/soft_patterns
-
Notifications
You must be signed in to change notification settings - Fork 1
/
visualize_efficiently.py
executable file
·443 lines (374 loc) · 15.5 KB
/
visualize_efficiently.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
#!/usr/bin/env python3
"""
Script to visualize the patterns in a SoftPatterns model based on their
highest-scoring spans in the dev set.
"""
import argparse
from collections import OrderedDict
import sys
from functools import total_ordering
import numpy as np
from soft_patterns import MaxPlusSemiring, Batch, argmax, SoftPatternClassifier, ProbSemiring, \
LogSpaceMaxTimesSemiring, soft_pattern_arg_parser, general_arg_parser
import torch
from torch.autograd import Variable
from torch.nn import LSTM
import heapq
from data import vocab_from_text, read_embeddings, read_docs, read_labels
from rnn import Rnn
from util import decreasing_length, chunked
# Field indices for (score, start_token_idx, end_token_idx) triples.
# NOTE(review): these constants appear unused in this module — possibly kept
# for compatibility with sibling scripts; confirm before removing.
SCORE_IDX = 0
START_IDX_IDX = 1
END_IDX_IDX = 2
@total_ordering
class BackPointer:
    """
    One step in the traceback of a pattern match.

    Records the accumulated score, a link to the previous step, which kind of
    transition produced this step ("self-loop", "happy path", or an epsilon
    transition), and the token span covered so far.  Instances are ordered by
    score alone (``total_ordering`` derives the rest from ``__eq__``/``__lt__``),
    so they can be fed directly to ``max`` and ``sorted``.
    """

    def __init__(self,
                 score,
                 previous,
                 transition,
                 start_token_idx,
                 end_token_idx):
        self.score = score
        self.previous = previous
        self.transition = transition
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx

    # Comparisons consider only the score.
    def __eq__(self, other):
        return self.score == other.score

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        return self.score < other.score

    def __repr__(self):
        return ("BackPointer("
                "score={}, "
                "previous={}, "
                "transition={}, "
                "start_token_idx={}, "
                "end_token_idx={}"
                ")").format(self.score,
                            self.previous,
                            self.transition,
                            self.start_token_idx,
                            self.end_token_idx)

    def display(self, doc_text, extra=""):
        """
        Render the matched span by walking the back-pointer chain to its start.

        Each consuming transition contributes a left-padded token prefixed with
        "SL" (self-loop) or "HP" (happy path); epsilon transitions contribute a
        blank "ep" slot.  `extra` is appended after the rendered span.
        """
        node, rendered = self, extra
        while node.previous is not None:
            if node.transition == "self-loop":
                rendered = "SL {:<15}".format(doc_text[node.end_token_idx - 1]) + rendered
            elif node.transition == "happy path":
                rendered = "HP {:<15}".format(doc_text[node.end_token_idx - 1]) + rendered
            else:
                rendered = "ep {:<15}".format("") + rendered
            node = node.previous
        return rendered
def get_nearest_neighbors(w, embeddings, k=1000):
    """
    For every transition in every pattern, gets the word with the highest
    score for that transition.

    Only looks at the first `k` words in the vocab (makes sense, assuming
    they're sorted by descending frequency).

    Parameters:
        w: FloatTensor of shape (num_states, word_dim) -- flattened `model.diags`
        embeddings: FloatTensor of shape (word_dim, vocab_size) -- the
            embedding matrix, already transposed by the caller
        k: number of (most frequent) vocab words to consider

    Returns the per-state argmax word indices.
    """
    # Slice columns (words), not rows (embedding dimensions): `embeddings`
    # arrives here as (word_dim, vocab_size).  The previous `embeddings[:k, :]`
    # sliced embedding dimensions instead -- a silent no-op when word_dim <= k,
    # and a shape error in torch.mm otherwise.
    return argmax(torch.mm(w, embeddings[:, :k]))
def get_candidate_documents(model,
                            batch_size,
                            dev_set=None,
                            k_best=5,
                            max_doc_len=-1):
    """
    Get list of candidate documents for each pattern, from which best match will be selected.

    `dev_set` is an iterable of ((doc, label), text) items (see the caller,
    `visualize_patterns`).  Returns one min-heap per pattern, each holding at
    most `k_best` tuples of (pattern score, (label, (doc, label), text)).
    """
    num_patterns = model.total_num_patterns
    selected_documents = [[] for i in range(num_patterns)]
    for batch in chunked(dev_set, batch_size):
        # x[0][0] is the tokenized document itself.
        batch_obj = Batch([x[0][0] for x in batch], model.embeddings, model.to_cuda, 0, max_doc_len)
        _, scores = model.forward(batch_obj, 1, 0)
        scores = scores.data
        # Adding epsilon to scores in order to not have two documents with the same score
        epsilon = (torch.rand(scores.size())-0.5)/10000
        scores += model.to_cuda(epsilon)
        for i in range(num_patterns):
            for j in range(batch_obj.size()):
                tup = (scores[j,i], (batch[j][0][1], batch[j][0], batch[j][1]))
                heapq.heappush(selected_documents[i], tup)
                # Cap the heap at k_best entries by evicting the lowest score.
                if len(selected_documents[i]) == (k_best + 1):
                    heapq.heappop(selected_documents[i])
    return selected_documents
def visualize_patterns(model,
                       batch_size,
                       dev_set=None,
                       dev_text=None,
                       k_best=5,
                       max_doc_len=-1):
    """
    Print, for every pattern in the model: its `k_best` highest-scoring dev
    spans (rendered with per-token transition tags), then the norm, nearest
    vocab word and bias of each self-loop and forward-one transition, and the
    epsilon values when the model uses them.
    """
    # Sort docs by decreasing length (batching-friendly ordering).
    dev_sorted = decreasing_length(zip(dev_set, dev_text))
    selected_documents = get_candidate_documents(model, batch_size, dev_sorted, k_best, max_doc_len)
    # Each heap item is (score, (label, (doc, label), text)); unpack into
    # parallel per-pattern lists of labels, docs and raw text.
    dev_labels = [
        [
            item[1][0]
            for item in
            pattern_documents
        ]
        for pattern_documents in selected_documents
    ]
    dev_set = [
        [
            item[1][1]
            for item in
            pattern_documents
        ]
        for pattern_documents in selected_documents
    ]
    dev_text = [
        [
            item[1][2]
            for item in
            pattern_documents
        ]
        for pattern_documents in selected_documents
    ]
    num_patterns = model.total_num_patterns
    pattern_length = model.max_pattern_length
    # Nearest vocab word for every (pattern, diagonal, state) transition.
    nearest_neighbors = \
        get_nearest_neighbors(
            model.diags.data,
            model.to_cuda(torch.FloatTensor(model.embeddings).t())
        ).view(
            num_patterns,
            model.num_diags,
            pattern_length
        )
    diags = model.diags.view(num_patterns, model.num_diags, pattern_length, model.word_dim).data
    biases = model.bias.view(num_patterns, model.num_diags, pattern_length).data
    # Diagonal 0 holds self-loop transitions, diagonal 1 the forward-one
    # ("fwd 1") transitions.
    self_loop_norms = torch.norm(diags[:, 0, :, :], 2, 2)
    self_loop_neighbs = nearest_neighbors[:, 0, :]
    self_loop_biases = biases[:, 0, :]
    fwd_one_norms = torch.norm(diags[:, 1, :, :], 2, 2)
    fwd_one_biases = biases[:, 1, :]
    fwd_one_neighbs = nearest_neighbors[:, 1, :]
    epsilons = model.get_eps_value().data if not model.no_eps else None
    for p in range(num_patterns):
        # Traceback of the best span of each candidate doc for this pattern.
        back_pointers = list(get_top_scoring_sequences(model, dev_set[p], max_doc_len))
        p_len = model.end_states[p].data[0] + 1
        k_best_doc_idxs = \
            sorted(
                range(len(dev_set[p])),
                key=lambda doc_idx: back_pointers[doc_idx][p].score,
                reverse=True  # high-scores first
            )[:k_best]

        def span_text(doc_idx):
            # (score, rendered span + label suffix) for pattern p in this doc.
            back_pointer = back_pointers[doc_idx][p]
            return back_pointer.score, back_pointer.display(dev_text[p][doc_idx],
                                                           '#label={}'.format(dev_labels[p][doc_idx]))
        print("Pattern:", p, "of length", p_len)
        print("Highest scoring spans:")
        for k, d in enumerate(k_best_doc_idxs):
            score, text = span_text(d)
            print("{} {:2.3f} {}".format(k, score, text.encode('utf-8')))

        def transition_str(norm, neighb, bias):
            # Summarize a transition as: norm * nearest-word + bias.
            return "{:5.2f} * {:<15} + {:5.2f}".format(norm, model.vocab[neighb], bias)
        if not model.no_sl:
            print("self-loops: ",
                  ", ".join(
                      transition_str(norm, neighb, bias)
                      for norm, neighb, bias in zip(self_loop_norms[p, :p_len],
                                                    self_loop_neighbs[p, :p_len],
                                                    self_loop_biases[p, :p_len])))
        print("fwd 1s: ",
              ", ".join(
                  transition_str(norm, neighb, bias)
                  for norm, neighb, bias in zip(fwd_one_norms[p, :p_len - 1],
                                                fwd_one_neighbs[p, :p_len - 1],
                                                fwd_one_biases[p, :p_len - 1])))
        if not model.no_eps:
            print("epsilons: ",
                  ", ".join("{:31.2f}".format(x) for x in epsilons[p, :p_len - 1]))
        print()
def zip_ap_2d(f, a, b):
    """Apply the binary function `f` elementwise over two parallel 2-d lists."""
    out = []
    for row_a, row_b in zip(a, b):
        out.append([f(x, y) for x, y in zip(row_a, row_b)])
    return out
def cat_2d(padding, a):
    """Prepend `padding[i]` to the front of row `a[i]`."""
    return [[front, *rest] for front, rest in zip(padding, a)]
def transition_once_with_trace(model,
                               token_idx,
                               eps_value,
                               back_pointers,
                               transition_matrix_val,
                               restart_padding):
    """
    Advance the per-pattern hidden states by one token, tracking BackPointer
    traces instead of raw scores.

    `back_pointers` is a 2-d list (pattern x state) of BackPointers; returns
    an updated 2-d list of the same shape after applying epsilon, happy-path
    and self-loop transitions for the token at `token_idx`.
    """
    def times(a, b):
        # Semiring multiplication on scalar scores.
        # wildly inefficient, oh well
        return model.semiring.times(
            torch.FloatTensor([a]),
            torch.FloatTensor([b])
        )[0]
    # Epsilon transitions (don't consume a token, move forward one state)
    # We do this before self-loops and single-steps.
    # We only allow one epsilon transition in a row.
    if not model.no_eps:
        epsilons = cat_2d(
            restart_padding(token_idx),  # state 0 always restarts fresh at this token
            zip_ap_2d(
                lambda bp, e: BackPointer(score=times(bp.score, e),
                                          previous=bp,
                                          transition="epsilon-transition",
                                          start_token_idx=bp.start_token_idx,
                                          end_token_idx=token_idx),
                [xs[:-1] for xs in back_pointers],
                eps_value  # doesn't depend on token, just state
            )
        )
        # Elementwise max: take the better of "epsilon move" vs. "stay put".
        epsilons = zip_ap_2d(max, back_pointers, epsilons)
    else:
        epsilons = back_pointers
    # Happy-path transitions (consume a token, move forward one state).
    happy_paths = cat_2d(
        restart_padding(token_idx),
        zip_ap_2d(
            lambda bp, t: BackPointer(score=times(bp.score, t),
                                      previous=bp,
                                      transition="happy path",
                                      start_token_idx=bp.start_token_idx,
                                      end_token_idx=token_idx + 1),
            [xs[:-1] for xs in epsilons],
            transition_matrix_val[:, 1, :-1]  # diagonal 1 = forward-one scores
        )
    )
    # NOTE(review): this guard looks suspicious -- self-loops are elsewhere
    # controlled by `model.no_sl` (see visualize_patterns), not `model.no_eps`.
    # Confirm against soft_patterns.transition_once before changing.
    if not model.no_eps:
        # Adding self loops (consume a token, stay in same state)
        self_loops = zip_ap_2d(
            lambda bp, sl: BackPointer(score=times(bp.score, sl),
                                       previous=bp,
                                       transition="self-loop",
                                       start_token_idx=bp.start_token_idx,
                                       end_token_idx=token_idx + 1),
            epsilons,
            transition_matrix_val[:, 0, :]  # diagonal 0 = self-loop scores
        )
        return zip_ap_2d(max, happy_paths, self_loops)
    else:
        return happy_paths
def get_top_scoring_spans_for_doc(model, doc, max_doc_len):
    """
    Run the model over a single (doc, label) pair while tracking BackPointers,
    and return, for each pattern, the best-scoring end-state BackPointer.
    """
    batch = Batch([doc[0]], model.embeddings, model.to_cuda, 0, max_doc_len)  # single doc
    transition_matrices = model.get_transition_matrices(batch)
    num_patterns = model.total_num_patterns
    end_states = model.end_states.data.view(num_patterns)

    def restart_padding(t):
        # Fresh start-state BackPointers (score = semiring one), one per
        # pattern, beginning a new span at token t.
        return [
            BackPointer(
                score=x,
                previous=None,
                transition=None,
                start_token_idx=t,
                end_token_idx=t
            )
            for x in model.semiring.one(num_patterns)
        ]
    eps_value = model.get_eps_value().data if not model.no_eps else None
    hiddens = model.semiring.zero(num_patterns, model.max_pattern_length)
    # set start state activation to 1 for each pattern in each doc
    hiddens[:, 0] = model.semiring.one(num_patterns, 1)
    # convert to back-pointers
    hiddens = \
        [
            [
                BackPointer(
                    score=state_activation,
                    previous=None,
                    transition=None,
                    start_token_idx=0,
                    end_token_idx=0
                )
                for state_activation in pattern
            ]
            for pattern in hiddens
        ]
    # extract end-states
    end_state_back_pointers = [
        bp[end_state]
        for bp, end_state in zip(hiddens, end_states)
    ]
    for token_idx, transition_matrix in enumerate(transition_matrices):
        transition_matrix = transition_matrix[0, :, :, :].data  # drop the batch dim
        hiddens = transition_once_with_trace(model,
                                             token_idx,
                                             eps_value,
                                             hiddens,
                                             transition_matrix,
                                             restart_padding)
        # extract end-states and max with current bests
        end_state_back_pointers = [
            max(best_bp, hidden_bps[end_state])
            for best_bp, hidden_bps, end_state in zip(end_state_back_pointers, hiddens, end_states)
        ]
    return end_state_back_pointers
def get_top_scoring_sequences(model, dev_set, max_doc_len):
    """ Get top scoring sequences for every pattern and doc. """
    for doc_count, doc in enumerate(dev_set, start=1):
        # Progress indicator: one dot per 100 documents.
        if doc_count % 100 == 0:
            print(".", end="", flush=True)
        yield get_top_scoring_spans_for_doc(model, doc, max_doc_len)
# TODO: refactor duplicate code with soft_patterns.py
def main(args):
    """Load a trained SoftPatternClassifier and visualize its patterns on the dev set."""
    print(args)
    if args.seed != -1:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
    # args.patterns is "_"-separated "length-count" pairs; build an
    # OrderedDict {pattern_length: num_patterns} sorted by length.
    pattern_specs = OrderedDict(sorted(([int(y) for y in x.split("-")] for x in args.patterns.split("_")),
                                       key=lambda t: t[0]))
    n = args.num_train_instances
    mlp_hidden_dim = args.mlp_hidden_dim
    num_mlp_layers = args.num_mlp_layers
    # Restrict the embedding table to words actually present in the dev file.
    dev_vocab = vocab_from_text(args.vd)
    print("Dev vocab size:", len(dev_vocab))
    vocab, embeddings, word_dim = \
        read_embeddings(args.embedding_file, dev_vocab)
    # Pad so the longest pattern can match at the document edges.
    num_padding_tokens = max(list(pattern_specs.keys())) - 1
    dev_input, dev_text = read_docs(args.vd, vocab, num_padding_tokens=num_padding_tokens)
    dev_labels = read_labels(args.vl)
    dev_data = list(zip(dev_input, dev_labels))
    if n is not None:
        dev_data = dev_data[:n]
    num_classes = len(set(dev_labels))
    print("num_classes:", num_classes)
    if args.maxplus:
        semiring = MaxPlusSemiring
    elif args.maxtimes:
        semiring = LogSpaceMaxTimesSemiring
    else:
        # The span traceback in this script only works with max-based semirings.
        print("Efficient visualization only works with max based semirings")
        return -1
    if args.use_rnn:
        rnn = Rnn(word_dim,
                  args.hidden_dim,
                  cell_type=LSTM,
                  gpu=args.gpu)
    else:
        rnn = None
    model = SoftPatternClassifier(pattern_specs, mlp_hidden_dim, num_mlp_layers, num_classes, embeddings, vocab,
                                  semiring, args.bias_scale_param, args.gpu, rnn=rnn, pre_computed_patterns=None,
                                  no_sl=args.no_sl, shared_sl=args.shared_sl, no_eps=args.no_eps,
                                  eps_scale=args.eps_scale, self_loop_scale=args.self_loop_scale)
    if args.gpu:
        state_dict = torch.load(args.input_model)
    else:
        # Remap GPU-saved tensors onto the CPU.
        state_dict = torch.load(args.input_model, map_location=lambda storage, loc: storage)
    model.load_state_dict(state_dict)
    if args.gpu:
        model.to_cuda(model)
    visualize_patterns(model, args.batch_size, dev_data, dev_text, args.k_best, args.max_doc_len)
    return 0
if __name__ == '__main__':
    # Combine this script's own options with the shared soft-pattern and
    # general option groups from soft_patterns.
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     parents=[soft_pattern_arg_parser(), general_arg_parser()])
    parser.add_argument("-k", "--k_best", help="Number of nearest neighbor phrases", type=int, default=5)
    sys.exit(main(parser.parse_args()))