-
Notifications
You must be signed in to change notification settings - Fork 1
/
Treecloud.py
345 lines (296 loc) · 14.4 KB
/
Treecloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
#!/usr/sfw/bin/python
#python C:\Sites\gambettelirmm\TreecloudDistribution\treecloud.py stoplist=C:\Sites\GambetteLirmm\TreecloudDistribution\StoplistEnglish.txt nbwords=30 distance=hyperlex unit=1 color=chronology C:\TreeCloud\BBC1.txt
#python C:\Sites\gambettelirmm\TreecloudDistribution\treecloud.py words=C:\Treecloud\Racine\words.csv nbwords=30 distance=hyperlex unit=1 color=chronology C:\TreeCloud\Racine\Phedre.txt
#####################################################
# Copyright 2008-2023 Philippe Gambette
#
# This file is part of TreeCloud v1.5beta (13/04/2023).
#
# TreeCloud is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# TreeCloud is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with TreeCloud. If not, see <http://www.gnu.org/licenses/>.
#
# For more information:
# http://www.treecloud.org
#####################################################
import sys, os, re, string, time
from TreecloudFunctions import *
from math import *
#------------------------------
# analyze and load parameters
#------------------------------
def parseCommandLine(argv):
    """Split a command line into ``key=value`` options and the text file path.

    Parameters:
        argv: full argument vector as found in ``sys.argv``; element 0 (the
            script name) is ignored.

    Returns:
        A ``(options, path)`` tuple. ``options`` maps each option name to its
        raw string value. ``path`` is the last argument containing no "="
        sign, or "" when there is none.
    """
    options = {}
    path = ""
    for argument in argv[1:]:
        # Split on the FIRST "=" only: option names never contain "=", but
        # values (typically file paths) may.
        key, separator, value = argument.partition("=")
        if separator:
            options[key] = value
        else:
            path = argument
    return options, path


#------------------------------
# analyze and load parameters
#------------------------------
args, thefile = parseCommandLine(sys.argv)

# Minimum number of occurrences; -1 marks "not set".
if "minnb" in args:
    args["minnb"] = int(args["minnb"])
else:
    args["minnb"] = -1
minnb = args["minnb"]

# Number of words; the default depends on whether minnb was set, so this must
# be decided before minnb is clamped below.
if "nbwords" in args:
    args["nbwords"] = int(args["nbwords"])
elif minnb == -1:
    args["nbwords"] = 30
else:
    args["nbwords"] = -1
nbwords = args["nbwords"]

# From here on minnb is a real threshold (at least one occurrence).
if minnb < 1:
    minnb = 1

# Remaining options: store the default in args when the option is absent,
# then expose each one as a module-level variable.
args.setdefault("window", "30")
winSize = int(args["window"])

args.setdefault("step", "1")
step = int(args["step"])

args.setdefault("distance", "jaccard")
theformula = args["distance"]

args.setdefault("unit", "1")
unit = int(args["unit"])

args.setdefault("normat", "auto")
normat = args["normat"]

args.setdefault("color", "chronology")
color = args["color"]

args.setdefault("sepchar", "")
sepchar = args["sepchar"]

# Raw string: the backslashes are literal path separators, not escapes.
args.setdefault("splitstreepath", r"C:\TreeCloud\SplitsTree.lnk")
splitstreepath = args["splitstreepath"]

args.setdefault("customcolor", " ")
customcolor = args["customcolor"]

args.setdefault("customsize", " ")
customsize = args["customsize"]

args.setdefault("dendropath", "")
dendropath = args["dendropath"]
# Build the tree cloud
# * thefile contains the address of the text file
# * winSize is the size of the sliding window
# * step is the sliding step of the sliding window
# * minnb is the minimum number of occurrences of the words in the treecloud.
# -1 if not set
# * nbwords is the maximum number of words in the treecloud.
# -1 if not set, 30 if not set and minnb not set
# * formula contains the name of the cooccurrence distance formula to apply
# * normat contains a string for the normalization method to transform
# the distance matrix into a [0,1] matrix (affine,linear,log,auto)
# * unit equals 1 if the edges of the treecloud have the same length, 0 otherwise
# * color is a string, name of the color set (yahoo,berry,chronology...)
# * splitstreepath contains the path of the program SplitsTree used to draw the tree cloud
# * sepchar contains a string with a special character used to separate sliding windows (alternative to winsize)
# * dendropath contains the path of Dendroscope if used instead of SplitsTree to draw the tree cloud
def buildTreeCloud(thefile,minnb,nbwords,winSize,step,formula,normat,unit,color,splitstreepath,dendropath,sepchar):
    """Build the tree cloud of a text file and open it in a tree viewer.

    Besides its parameters, this function reads the module-level names
    ``args`` (keys "stoplist" and "words"), ``customcolor`` and ``customsize``.

    Parameters:
        thefile: path of the text file; also the prefix of every output file.
        minnb: minimum number of occurrences of the words kept.
        nbwords: maximum number of words kept; a value below 1 is replaced by
            the number of words of the text.
        winSize: size of the sliding window (used when sepchar is "").
        step: sliding step of the sliding window.
        formula: name of the cooccurrence distance formula.
        normat: normalization method name; "log" gets an additional "linear"
            normalization pass.
        unit: passed to exportToNexus (1 for unit-length edges, 0 otherwise).
        color: name of the color set.
        splitstreepath: path of the SplitsTree program.
        dendropath: path of Dendroscope; "" to display with SplitsTree.
        sepchar: window separator string; "" to use sliding windows.

    Returns:
        None. The results are the files written under the ``thefile`` prefix
        and the external programs launched through ``os.system``.
    """
    # Common prefix of the distance, Nexus and colored output files.
    outputBase = thefile + "." + formula

    textPlusWordlist = openText(thefile, sepchar)
    print("Computing word frequencies...")
    text = textPlusWordlist[0]
    wordlist = textPlusWordlist[1]
    print(str(len(text)) + " words - " + str(len(wordlist)) + " different words.")

    #------------------------------
    # wordlist now contains all words of the text as keys
    # whose associated values are nb of occurrences
    #------------------------------
    freqs = sortByFrequency(wordlist)
    #------------------------------
    # freqs is a dict which associates for any n: a dict of all words
    # which appeared n times (in decreasing order)
    #------------------------------
    print("Word frequencies computed.")

    #------------------------------
    # load stoplist
    #------------------------------
    if "stoplist" in args:
        stoplist = loadStoplist(args["stoplist"])
        print("Stoplist loaded.")
    else:
        stoplist = {}

    #------------------------------
    # save words with associated frequency into a csv file
    #------------------------------
    saveFrequencies(freqs, stoplist, thefile + ".freqs.txt")
    print("Saved word frequencies (to use in TagCloud Builder for example).")

    #------------------------------
    # compute list of most frequent words
    #------------------------------
    if nbwords < 1:
        # No limit requested: allow as many words as the text contains.
        nbwords = len(text)
    if "words" in args:
        # if a word list is imposed, keep only words inside it
        theResult = imposedWordList(freqs, loadStoplist(args["words"]), sepchar)
        print("Word list " + args["words"] + " loaded.")
    else:
        theResult = wordList(freqs, stoplist, minnb, nbwords, sepchar)
    keptWordsId = theResult[0]
    keptWords = theResult[1]
    keptWordsFrequencies = theResult[2]
    #------------------------------
    # keptWords now associates all kept words to their id (integer)
    # keptWordsId is the reverse map (associates integer to word)
    #------------------------------

    #------------------------------
    # filter text: remove all words not in keptWords from table "text"
    #------------------------------
    text = filterText(text, keptWordsId, sepchar)

    #------------------------------
    # compute cooccurrence matrix from filtered text:
    #------------------------------
    print("Starting cooccurrence computation...")
    if sepchar == "":
        coocc = computeCooccurrence(text, keptWordsId, winSize, step)
    else:
        coocc = computeCooccurrenceDisjoint(text, keptWordsId, sepchar)
    print("Cooccurrences computed...")

    #------------------------------
    # compute distance matrix from cooccurrence matrices:
    #------------------------------
    distance = normalizeMatrix(distanceFromCooccurrence(coocc, formula), normat)
    if normat == "log":
        distance = normalizeMatrix(distance, "linear")
    print("Distance matrix computed...")

    #------------------------------
    # arboricity of the distance matrix (step currently disabled):
    #------------------------------
    #arboricity=computeArboricity(distance)

    #------------------------------
    # transform distance matrix into CSV File:
    #------------------------------
    exportToCsv(distance, keptWords, outputBase + ".csv")

    #------------------------------
    # transform distance matrix into Nexus File:
    #------------------------------
    exportToNexus(distance, keptWords, outputBase, unit)

    #------------------------------
    # transform Nexus File into Nexus Tree through SplitsTree:
    #------------------------------
    print("Launching SplitsTree from path " + splitstreepath + " to compute the tree.")
    # Developer toggle: 1 runs SplitsTree with "+g false +s false",
    # any other value runs it without those switches.
    invisible = 1
    nexusOrders(distance, keptWords, outputBase, invisible)
    # The doubled outer quotes are part of the command line passed to the
    # shell; they are kept exactly as the program has always emitted them.
    if invisible == 1:
        os.system('""' + splitstreepath + '" +g false +s false -i "' + outputBase + '.nexorders""')
    else:
        os.system('""' + splitstreepath + '" -i "' + outputBase + '.nexorders""')
    print("Tree computed by SplitsTree.")

    #------------------------------
    # read tree splits from Nexus file (step currently disabled):
    #------------------------------
    #splits=splitsFromNexus(keptWordsId,thefile+"."+formula)

    #------------------------------
    # color Nexus file
    #------------------------------
    print("Coloring the tree...")
    colorNexus(keptWordsId, keptWordsFrequencies, outputBase, text, color,
               customcolor, customsize, distance, dendropath)
    print("Tree colored.")

    if dendropath == "":
        print("Displaying the tree cloud in SplitsTree...")
        os.system('""' + splitstreepath + '" -i "' + outputBase + '.colored.nexus""')
    else:
        print("Displaying the tree cloud in Dendroscope...")
        os.system('""' + dendropath + '" -x "source file=\'' + outputBase + '.colored.dendrorders\';""')
def printHelp():
    """Print the command-line help of TreeCloud on standard output."""
    helpLines = (
        "",
        "==== HELP ON TREECLOUD ====",
        "treecloud [options] <filename>",
        "",
        "The list of words and frequencies is written in <filename>.freqs.txt",
        "This file can be used by TagCloudBuilder to build a tag cloud.",
        "",
        "The distance matrix is written in <filename>.csv, and in the Nexus",
        "format in <filename>.nexus.",
        "",
        "***OPTIONS:***",
        "",
        "stoplist=<filename>: <filename> is used as a stoplist, i.e. each line",
        " of the file stored in <filename> contains a word which will not be",
        " considered during the rest of the analysis.",
        "",
        "words=<filename>: only words present in <filename> (one word per",
        " line) will be kept for the analysis.",
        "",
        # Help entry for the "savewords" option, currently not displayed:
        # "savewords=<filename>: the list of words with associated number of",
        # " occurrences is stored in <filename>.",
        # "",
        "minnb=<n>: the treecloud contains words appearing at least n times.",
        "",
        "nbwords=<n>: the treecloud contains at most nbwords words.",
        " default: n=30",
        "",
        "window=<n>: width of the sliding window for cooccurrence distance.",
        " default: n=30",
        "",
        "step=<n>: sliding step of the sliding window for cooccurrence",
        " distance. If you use n=30 for parameter window=30 then you only",
        " consider disjoint windows for cooccurrence computation.",
        " default: n=1 (most accurate)",
        "",
        "sepchar=<string>: separation character to separate cooccurrence",
        " windows (instead of using sliding windows of constant width)",
        " default: not used, see parameter 'window'",
        " use sepchar=aaaaaaa (string in lower case only, do not",
        " use a punctuation mark).",
        "",
        "distance=<formula>: <formula> is chosen to compute the cooccurrence",
        "distance. Possible values for <formula> (see Evert's PhD thesis):",
        "chisquared mi liddell dice jaccard gmean hyperlex ms oddsratio",
        "zscore loglikelihood poissonstirling",
        "",
        "normat=<string>:normalization method to transform the distance matrix",
        " into a [0,1] matrix (affine,linear,log,auto)",
        " default: auto",
        "",
        "splitstreepath=<path>: path of the program SplitsTree (splitstree.org)",
        " used to compute the tree and draw the tree clouds. Please avoid spaces",
        " in the path.",
        # Backslashes are escaped so they are printed literally.
        " default: C:\\TreeCloud\\SplitsTree.lnk",
        "",
        "dendropath=<path>: path of the program Dendroscope (dendroscope.org)",
        " used to draw the tree clouds instead of SplitsTree.",
        " Please avoid spaces in the path.",
        "",
        "unit=<b>: tree edges with unit length.",
        " default: b=1, otherwise set b=0",
        "",
        "color=<string>: name of the color set (chronology,dispersion,chronodisp,berry,yahoo).",
        " default: chronology",
        "",
        "customcolor=<path>: path of a csv file containing words in the first column",
        " and 3 integers in the next 3 columns",
        "",
        "customsize=<path>: path of a csv file containing words in the first column and",
        " font references in the second one. Example: Arial-PLAIN-14",
        "",
    )
    for helpLine in helpLines:
        print(helpLine)


#------------------------------
# run: build the tree cloud, display the help, or report a missing file
#------------------------------
if os.path.isfile(thefile):
    print("Loading the text file...")
    buildTreeCloud(thefile,minnb,nbwords,winSize,step,theformula,normat,unit,color,splitstreepath,dendropath,sepchar)
elif re.search("help",thefile):
    # Any non-file argument containing "help" displays the usage text.
    printHelp()
else:
    # Report the path that was actually given on the command line.
    print(thefile + " - file does not exist!")
    print("Please use option help to display the possible parameters.")