-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_analogy.py
91 lines (66 loc) · 2.51 KB
/
word_analogy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import pickle
import numpy as np
from scipy import spatial
model_path = './models/'
loss_model = 'cross_entropy'
#loss_model = 'nce'
model_filepath = os.path.join(model_path, 'word2vec_%s.model'%(loss_model))
dictionary, steps, embeddings = pickle.load(open(model_filepath, 'r'))
"""
==========================================================================
Write code to evaluate a relation between pairs of words.
You can access your trained model via dictionary and embeddings.
dictionary[word] will give you word_id
and embeddings[word_id] will return the embedding for that word.
word_id = dictionary[word]
v1 = embeddings[word_id]
or simply
v1 = embeddings[dictionary[word_id]]
==========================================================================
"""
#file = open("word_analogy_test.txt", "r")
if loss_model == 'nce':
outputFile = open("word_analogy_test_predictions_nce.txt","wb")
else:
outputFile = open("word_analogy_test_predictions_cross_entropy.txt","wb")
file = open("word_analogy_dev.txt", "r")
for i in file:
words = i.strip().split("||")
samples = words[0].split(",")
options = words[1].split(",")
List3 = list()
List4 = list()
List6 = list()
for sample in samples:
List2 = list()
words = sample.replace("\"", "").split(":")
e1 = embeddings[dictionary[words[0]]]
e2 = embeddings[dictionary[words[1]]]
for j in (e2 - e1):
List2.append(j)
List3.append(List2)
for option in options:
List5 = list()
optionwords = option.replace("\"", "").split(":")
o1 = embeddings[dictionary[optionwords[0]]]
o2 = embeddings[dictionary[optionwords[1]]]
List4.append(optionwords)
for k in (o2 - o1):
List5.append(k)
List6.append(List5)
similarity = 1 - spatial.distance.cdist(np.asarray(List3), np.asarray(List6), 'cosine')
mean = np.mean(similarity, axis = 1)
max = np.argmax(mean, axis=0)
min = np.argmin(mean, axis=0)
answer = ""
ques = ""
if ( (len(List4) - 1) > min and (len(List4) - 1) > max):
for l in range(len(List4)):
answer += ques + "\"" + List4[l][0] + ":" + List4[l][1] + "\"" + ques + "\t"
answer += ques + "\"" + List4[min][0] + ":" + List4[min][1] + "\"" + ques + "\t"
answer += ques + "\"" + List4[max][0] + ":" + List4[max][1] + "\"" + ques + "\n"
outputFile.write(answer)
print("Prediction file is successfully generated.")
file.close()
outputFile.close()