-
Notifications
You must be signed in to change notification settings - Fork 1
/
yearcloud.py
68 lines (49 loc) · 1.64 KB
/
yearcloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from nyt import NYT
import tfidflib
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
def readf(fname):
with open(fname,'rb') as f:
r = f.read().decode('utf-8', errors='replace')
return r
datafiles = [
('e','headlinesFromExpansion1850to1900'),
('e','headlinesFromExpansion1900to1950'),
('e','headlinesFromExpansion1950to2000'),
('r','headlinesFromRecession1850to1900'),
('r','headlinesFromRecession1900to1950'),
('r','headlinesFromRecession1950to2000'),
]
if __name__ == '__main__':
mypath = 'data/'
nytapi = NYT()
wordcloud = WordCloud(collocations=False)
# use each month as a doc
#yrz = [(1853,12),(1853,11),(1853,10)]
#docs = [' '.join([doc['headline']['main'] for doc in nytapi.get_archive(*yrmo)['response']['docs']]) for yrmo in yrz]
#files = [str(f) for f in listdir(mypath) if isfile(join(mypath, str(f))) and f[0] != '.']
#files = [df[1] for df in datafiles if df[0] == 'e']
#files = [df[1] for df in datafiles if df[0] == 'r']
files = [mypath + df[1] + '.txt' for df in datafiles]
#print(listdir(mypath))
docs = [readf(fname) for fname in files]
# get tfidf scores from each doc
scores = tfidflib.getscores(docs)
# print number of tokens used in the tfidf of each doc
i = 1
import numpy as np
for doc in scores:
fstr = []
#scz = 1000*np.array(list(doc.values()))
#print(np.histogram(scz))
for w in doc.keys():
fstr += [w,]*int(1000*doc[w])
wc = wordcloud.generate(' '.join(fstr))
#plt.imshow(wc)
plt.imsave(files[i] + '.png',wc)
print(str(i), 'has', len(doc.keys()), 'different words.')
i += 1
#print(scores)
#print(len(scores))