-
Notifications
You must be signed in to change notification settings - Fork 1
/
augmentation_retrieval.py
110 lines (99 loc) · 3.9 KB
/
augmentation_retrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import json_controller
import calculation
import numpy as np
import upload_controller
from data_controller import augmentations
from data_controller import fasttext_vocab as ft_vocab
from data_controller import fasttext_vectors as ft_vecs
from data_controller import glove_vocab as gv_vocab
from data_controller import glove_vectors as gv_vecs
from data_controller import cbow_vocab as cb_vocab
from data_controller import cbow_vectors as cb_vecs
def _select_embedding_space(space, uploaded):
    """Return the ``(vocab, vecs)`` pair to search for the given request options.

    An uploaded embedding space takes precedence when the request asks for it
    and ``upload_controller.uploaded_binary`` is ``'true'``; otherwise the
    space named by ``space`` is used, defaulting to fasttext.
    """
    if uploaded == 'true' and upload_controller.uploaded_binary == 'true':
        return upload_controller.uploaded_vocab, upload_controller.uploaded_vecs
    if space == 'glove':
        return gv_vocab, gv_vecs
    if space == 'cbow':
        return cb_vocab, cb_vecs
    return ft_vocab, ft_vecs


def retrieve_augmentations(number, content, bar):
    """Retrieve augmentations for one word or several and build the response.

    Args:
        number: ``'single'`` reads the word from ``bar['word']``;
            ``'multiple'`` reads a space-separated word list from
            ``content['Words']``. Any other value produces a response built
            from empty results.
        content: Object indexed with ``'Words'``; only used for ``'multiple'``.
        bar: Request options. Recognised keys are ``'word'``, ``'uploaded'``
            and ``'lower'`` (compared against the string ``'true'``) and
            ``'space'`` (``'glove'``, ``'cbow'``, anything else means
            fasttext). An ``'iterations'`` entry is not used, because
            ``retrieve_single_augmentations`` accepts no iteration count.
            NOTE(review): presumably the query arguments of the HTTP
            request - confirm against the caller.

    Returns:
        ``(response, 200)`` where ``response`` comes from
        ``json_controller.json_augmentation_retrieval``, or
        ``('BAD REQUEST - NO WORD INPUT', 400)`` when the required word input
        is missing.
    """
    uploaded = bar.get('uploaded', 'false')
    lowercase = bar.get('lower', 'false') == 'true'
    space = bar.get('space', 'fasttext')

    if number == 'single':
        if 'word' not in bar:
            return 'BAD REQUEST - NO WORD INPUT', 400
        words = [bar['word']]
    elif number == 'multiple':
        if content is None:
            return 'BAD REQUEST - NO WORD INPUT', 400
        words = content['Words'].split(' ')
    else:
        words = []

    if lowercase:
        # str.lower() returns a new string, so the result has to be kept.
        words = [word.lower() for word in words]

    # One selection shared by both modes, so a glove/cbow choice is no longer
    # overwritten by the fasttext default in the single-word case.
    vocab, vecs = _select_embedding_space(space, uploaded)

    augments = {}
    computed_augments = []
    for word in words:
        augments[word], computed = retrieve_single_augmentations(word, vocab, vecs)
        # Words whose lookup returned a true flag are reported separately.
        if computed:
            computed_augments.append(word)

    response = json_controller.json_augmentation_retrieval(augments, computed_augments)
    return response, 200
def retrieve_single_augmentations(target, vocab=ft_vocab, vecs=ft_vecs):
    """Return the augmentations for ``target`` together with a status flag.

    Stored entries in ``augmentations`` are preferred; when ``target`` has no
    stored entry the result of ``compute_augmentations`` is returned instead.

    Args:
        target: Word to look up.
        vocab: Vocabulary handed to ``compute_augmentations`` (fasttext by
            default).
        vecs: Vectors handed to ``compute_augmentations`` (fasttext by
            default).

    Returns:
        ``(augments, flag)``. ``flag`` is ``False`` for a stored entry;
        otherwise it is whatever ``compute_augmentations`` reports.
    """
    # TODO elif: search whether augmentation has already been computed once, if yes retrieve it
    # TODO inform user whether augments are postspecialized or not
    if target not in augmentations:
        fresh, flag = compute_augmentations(target, vocab, vecs)
        return fresh, flag
    return augmentations[target], False
def retrieve_multiple_augmentations(target):
    """Gather the augmentations of every word in ``target`` into one flat list.

    Args:
        target: Iterable of words; each is passed to
            ``retrieve_single_augmentations`` with its default embedding space.

    Returns:
        ``(augments, no_augments)``: all augmentations in word order, and the
        words for which ``retrieve_single_augmentations`` returned a true flag.
    """
    collected = []
    flagged_words = []
    for entry in target:
        entry_augments, flagged = retrieve_single_augmentations(entry)
        collected.extend(entry_augments)
        if flagged:
            flagged_words.append(entry)
    return collected, flagged_words
def compute_augmentations(target, vocab, vecs, iterations=4):
    """Return the words in ``vocab`` that are most cosine-similar to ``target``.

    Args:
        target: Word whose nearest neighbours are wanted.
        vocab: Mapping from word to the index of its vector in ``vecs``.
        vecs: Indexable collection of vectors addressed by those indices.
        iterations: Maximum number of neighbours to return.

    Returns:
        ``(augments, flag)``. ``augments`` holds at most ``iterations`` words,
        most similar first (ties keep vocabulary order). ``flag`` is ``True``
        only when ``target`` is not in ``vocab`` (``augments`` is then empty)
        and ``False`` otherwise.
    """
    import heapq  # function-scope import keeps the file's import block untouched

    if target not in vocab:
        return [], True

    target_vec = np.array(vecs[vocab[target]])
    cosinesim = {
        word: calculation.cosine_similarity(target_vec, np.array(vecs[vocab[word]]))
        for word in vocab
        if word != target
    }

    # nlargest returns fewer items when the vocabulary is small instead of
    # failing on an exhausted candidate set, and yields nothing for
    # iterations <= 0.
    augments = heapq.nlargest(iterations, cosinesim, key=cosinesim.get)
    return augments, False