#!/usr/bin/env python3
#-----------------------------------------------------------------------#
#
# This program takes n-gram files and a word list
# and creates a file with lists of most similar words.
# John Goldsmith and Wang Xiuli 2012.
# Jackson Lee and Simon Jacobs 2015
#
#-----------------------------------------------------------------------#

import argparse
import os
import pickle
import sys

from findManifold import *
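
# findManifold (a sibling module in this repo) is expected to provide
# GetMyWords, GetContextArray, Normalize, compute_incidence_graph,
# compute_laplacian, GetEigenvectors, compute_words_distance, and
# compute_closest_neighbors, all of which are used below.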

# argparse is a standard-library module that makes building
# command-line interfaces straightforward.
# Try: python main.py --help

def makeArgParser():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # nargs="?" makes these positional arguments optional, so the defaults
    # advertised by --help actually apply (argparse ignores default= on
    # required positionals).
    parser.add_argument("nWords", help="Number of words for analysis",
                        type=int, nargs="?", default=1000)
    parser.add_argument("nNeighbors", help="Number of neighbors",
                        type=int, nargs="?", default=9)
    parser.add_argument("--nEigenvectors", help="Number of eigenvectors",
                        type=int, default=11)
    parser.add_argument("--datapath", type=str, default="../../data/",
                        help="Data folder of input ngram files and output neighbor files")
    # parser.add_argument("--bigrams", help="Bigrams file to use", type=str,
    #                     default="../../data/english/ngrams/english-brown_bigrams.txt")
    # parser.add_argument("--trigrams", help="Trigrams file to use", type=str,
    #                     default="../../data/english/ngrams/english-brown_trigrams.txt")
    # parser.add_argument("--words", help="Words file to use", type=str,
    #                     default="../../data/english/ngrams/english-brown_words.txt")
    # parser.add_argument("--output", help="Output folder to use", type=str,
    #                     default="../../data/english/neighbors")
    parser.add_argument("--corpus", help="Corpus name (e.g. 'brown', 'google')",
                        type=str, default="brown")
    parser.add_argument("--lang", help="Language name", type=str, default="english")
    return parser
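
# A sample invocation (a sketch, assuming the default data layout under
# --datapath, i.e. ../../data/english/ngrams/english-brown_*.txt):
#
#   python main.py 1000 9 --nEigenvectors 11 --lang english --corpus brown
#
# This analyzes the 1000 most frequent words and writes 9 nearest
# neighbors per word.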

def DEBUG(s):
    print(s, flush=True)

def main(argv):
    args = makeArgParser().parse_args()

    NumberOfWordsForAnalysis = args.nWords
    NumberOfNeighbors = args.nNeighbors
    NumberOfEigenvectors = args.nEigenvectors

    language = args.lang
    corpus = args.corpus

    datafolder = args.datapath
    inpath = datafolder + language + '/ngrams/'
    outfolder = datafolder + language + '/neighbors/'
    outcontextsfolder = datafolder + language + '/word_contexts/'

    if not os.path.exists(outfolder):
        os.makedirs(outfolder)
    if not os.path.exists(outcontextsfolder):
        os.makedirs(outcontextsfolder)

    infileBigramsname = inpath + language + '-' + corpus + '_bigrams.txt'
    infileTrigramsname = inpath + language + '-' + corpus + '_trigrams.txt'
    infileWordsname = inpath + language + '-' + corpus + '_words.txt'

    print('Reading word list...', flush=True)
    mywords = GetMyWords(infileWordsname, corpus)
    print('Word file is', infileWordsname, flush=True)
    print('Corpus has', len(mywords), 'words', flush=True)

    if NumberOfWordsForAnalysis > len(mywords):
        NumberOfWordsForAnalysis = len(mywords)
        print('Number of words for analysis reduced to', NumberOfWordsForAnalysis)

    analyzedwordlist = list(mywords.keys())[:NumberOfWordsForAnalysis]
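    # Note: taking the first N keys assumes GetMyWords returns words in
    # descending frequency order, so the analysis covers the N most
    # frequent words (an assumption about findManifold's behavior).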

    outfilenameNeighbors = outfolder + language + '-' + corpus + '_' + \
                           str(NumberOfWordsForAnalysis) + '_' + \
                           str(NumberOfNeighbors) + '_nearest_neighbors.txt'

    outWordToContexts_pkl_fname = outcontextsfolder + language + '-' + corpus + \
                                  '_' + str(NumberOfWordsForAnalysis) + '_' + \
                                  str(NumberOfNeighbors) + '_WordToContexts.pkl'

    outContextToWords_pkl_fname = outcontextsfolder + language + '-' + corpus + \
                                  '_' + str(NumberOfWordsForAnalysis) + '_' + \
                                  str(NumberOfNeighbors) + '_ContextToWords.pkl'

    outfileNeighbors = open(outfilenameNeighbors, 'w')

    for outfile in [outfileNeighbors]:
        print('# language:', language,
              '\n# corpus:', corpus,
              '\n# number of words analyzed:', NumberOfWordsForAnalysis,
              '\n# number of neighbors identified:', NumberOfNeighbors, '\n',
              file=outfile)

    print('\nI am looking for:', infileTrigramsname)
    print('Number of words that will be analyzed:', NumberOfWordsForAnalysis)
    print('Number of neighbors:', NumberOfNeighbors)

    DEBUG('Reading bigrams/trigrams')
    # TODO: set up two dicts -- (1) from word to context, (2) from context to word
    # context_array, wordContextDict, contextWordDict = GetContextArrayNew(analyzedwordlist, bigramfile, trigramfile)
    # context_array = GetContextArrayNew(wordContextDict, contextWordDict)

    DEBUG('Computing context array')
    context_array, WordToContexts, ContextToWords = GetContextArray(
        corpus, NumberOfWordsForAnalysis, analyzedwordlist,
        infileBigramsname, infileTrigramsname)
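    # GetContextArray (in findManifold) is understood to return a sparse
    # words-by-contexts count matrix plus the two index dicts anticipated
    # in the TODO above: WordToContexts maps each word to the contexts it
    # occurs in, and ContextToWords the reverse.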

    with open(outWordToContexts_pkl_fname, 'wb') as f:
        pickle.dump(WordToContexts, f)
    DEBUG('WordToContexts ready and pickled')

    with open(outContextToWords_pkl_fname, 'wb') as f:
        pickle.dump(ContextToWords, f)
    DEBUG('ContextToWords ready and pickled')
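
    # Multiplying the words-by-contexts matrix by its transpose gives, at
    # [i, j], a count of the contexts shared by words i and j: two words
    # that both occur in, say, "the __ was" and "a __ of" get credit for
    # both. (A sketch of the interpretation; the exact counting is done in
    # findManifold.)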
    DEBUG('Computing shared contexts')
    CountOfSharedContexts = context_array.dot(context_array.T).todense()
    del context_array

    DEBUG('Computing diameter')
    Diameter = Normalize(NumberOfWordsForAnalysis, CountOfSharedContexts)

    DEBUG('Computing incidence graph')
    incidencegraph = compute_incidence_graph(NumberOfWordsForAnalysis, Diameter,
                                             CountOfSharedContexts)

    DEBUG('Computing mylaplacian')
    mylaplacian = compute_laplacian(NumberOfWordsForAnalysis, Diameter, incidencegraph)
    del Diameter
    del incidencegraph
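
    # Standard spectral-embedding step: the eigenvectors of the graph
    # Laplacian with the smallest eigenvalues give each word low-dimensional
    # coordinates in which words sharing many contexts lie close together
    # (assuming GetEigenvectors returns eigenvectors in ascending-eigenvalue
    # order).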
    DEBUG('Computing eigenvectors...')
    myeigenvalues, myeigenvectors = GetEigenvectors(mylaplacian)
    del mylaplacian
    del myeigenvalues

    # Keep the first NumberOfEigenvectors columns of the eigenvector matrix
    # as word coordinates.
    coordinates = myeigenvectors[:, :NumberOfEigenvectors]

    DEBUG('Coordinates computed. Now computing distances between words...')
    wordsdistance = compute_words_distance(NumberOfWordsForAnalysis, coordinates)
    del coordinates

    DEBUG('Computing nearest neighbors now...')
    closestNeighbors = compute_closest_neighbors(analyzedwordlist, wordsdistance,
                                                 NumberOfNeighbors)

    DEBUG('Output to files')
    # Write one line per analyzed word: the words of its closest neighbors.
    for (wordno, word) in enumerate(analyzedwordlist):
        print(' '.join([analyzedwordlist[idx] for idx in closestNeighbors[wordno]]),
              file=outfileNeighbors)
    outfileNeighbors.close()

# Don't execute main() when this file is imported as a module.
if __name__ == '__main__':
    main(sys.argv)