forked from srvCodes/Gender-Classification-of-Blog-Author
-
Notifications
You must be signed in to change notification settings - Fork 0
/
probOfWords.py
79 lines (61 loc) · 1.99 KB
/
probOfWords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import nltk
import math
from nltk.corpus import stopwords
from nltk import ngrams
'''
def blogCorpusPOS(brownCorp):
outfile = open('realBlogCorpusWords.txt', 'w')
for sentence in brownCorp:
totalWords = ""
WordsTokenInOneSentence = nltk.word_tokenize(sentence)
WordsInOneSentence = nltk.Text(WordsTokenInOneSentence)
for word in WordsInOneSentence:
for i in range(3,6):
for j in range(len(word)-i+1):
totalWords = totalWords + word[j:j+i] + " "
totalWords = totalWords + "\n"
outfile.write(totalWords)
outfile.close()
def calc_count(brown):
count = 0
wordCount = {}
wordCount_p = {}
for sentence in brown:
words = sentence.split()
for word in words:
count += 1
if word in wordCount:
wordCount[word] += 1
else:
wordCount[word] = 1
# calculate probability
#for word in wordCount:
# temp = [word]
# wordCount_p[tuple(temp)] = (float(wordCount[word])/count)
return wordCount
'''
def q1_output(wordCount, filename, min_count=3000):
    """Write the frequent entries of ``wordCount`` to ``filename``.

    The file is opened in write mode, so any existing content is replaced.
    One line is written per retained entry, in the dict's iteration order,
    formatted as ``<key>: <count>``.

    Args:
        wordCount: Mapping from a string key to its count. Keys must be
            strings because each line is built by string concatenation.
        filename: Path of the output file.
        min_count: Smallest count that is still written. Defaults to 3000,
            the cut-off this function has always used.
    """
    # The context manager closes the file even if a write raises.
    with open(filename, 'w') as outfile:
        for key, val in wordCount.items():
            if val >= min_count:
                outfile.write(key + ': ' + str(val) + '\n')
def main():
    """Count character n-grams (n = 3, 4, 5) in ``realBlogCorpus.txt``.

    The whole corpus is read and lower-cased, then a window of each size is
    slid over the raw text one character at a time (whitespace and
    punctuation included). For every size the number of windows scanned is
    printed and the counts are handed to ``q1_output``, which writes the
    frequent ones to ``count_Words<n>.txt``.

    N-grams that are exactly an NLTK English stop word are not counted.
    """
    with open('realBlogCorpus.txt', 'r') as content_file:
        content = content_file.read()
    content = content.lower()
    stops = set(stopwords.words('english'))

    for size in range(3, 6):
        word_count = {}
        # NOTE(review): the original indentation was lost; the total is taken
        # to be the number of windows scanned (stop words included) - confirm.
        total_count = 0
        for start in range(len(content) - size + 1):
            total_count += 1
            gram = content[start:start + size]
            # Previously a stop-word n-gram fell into the "first occurrence"
            # branch every time and was reset to 1. A count of 1 never reaches
            # q1_output's 3000 cut-off, so skipping it outright leaves the
            # written files unchanged while making the intent explicit.
            if gram in stops:
                continue
            word_count[gram] = word_count.get(gram, 0) + 1

        filename = "count_Words" + str(size) + ".txt"
        print(total_count)
        q1_output(word_count, filename)
# Run the n-gram counting only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()