forked from srvCodes/Gender-Classification-of-Blog-Author
-
Notifications
You must be signed in to change notification settings - Fork 0
/
probOfPOS.py
152 lines (114 loc) · 4.43 KB
/
probOfPOS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import nltk
import math
from nltk import ngrams
from nltk.tag.stanford import StanfordPOSTagger
# Stanford POS tagger handle, constructed at import time from hard-coded
# absolute paths to the tagger model and jar.
# NOTE(review): no live code in the visible part of this file uses `st`
# (tagging goes through nltk.pos_tag) -- confirm before removing it.
st = StanfordPOSTagger('/home/saurav/Documents/postagger/models/english-bidirectional-distsim.tagger',
'/home/saurav/Documents/postagger/stanford-postagger.jar')
# POS tags that blogCorpusPOS keeps when it writes tag sequences; a tag that
# is not listed here is dropped from the output.
# NOTE(review): these look like Penn Treebank tag names -- confirm the tag set.
posTagList = ['NN', 'CC', 'LS', 'PDT', 'POS', 'SYM', 'NNS', 'NNP', 'NNPS', 'FW', 'CD', 'JJ', 'JJR', 'JJS', 'IN', 'TO', 'DT',
'EX', 'PRP', 'PRP$', 'WDT', 'WP', 'WP$', 'MD', 'VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG', 'RB', 'RBR', 'RBS', 'RP', 'WRB', 'UH', '.']
def blogCorpusPOS(brownCorp):
    """Write the POS-tag sequence of every sentence to ``realBlogCorpusPOS.txt``.

    Each item of *brownCorp* is tokenized with ``nltk.word_tokenize`` and
    tagged with ``nltk.pos_tag``.  Only tags present in the module-level
    ``posTagList`` are kept.  The output file receives one line per input
    sentence, with every kept tag followed by a single space.

    Args:
        brownCorp: Iterable of sentences, either text or UTF-8 encoded bytes.
    """
    # The context manager closes the file even if tokenizing/tagging raises.
    with open('realBlogCorpusPOS.txt', 'w') as outfile:
        for sentence in brownCorp:
            # Only byte strings need decoding: on Python 3, lines read in
            # text mode are already ``str`` and have no ``decode`` method.
            if isinstance(sentence, bytes):
                sentence = sentence.decode('utf-8')
            tokensWord = nltk.word_tokenize(sentence)
            tags = nltk.pos_tag(tokensWord)
            kept = [tag for _, tag in tags if tag in posTagList]
            # Trailing space after each tag is part of the existing format.
            outfile.write("".join(tag + " " for tag in kept) + "\n")
def _ngrams(tokens, n):
    """Return the n-token sliding windows of the list *tokens* as tuples."""
    # zip stops at the shortest slice, so lists shorter than n yield nothing.
    return zip(*(tokens[i:] for i in range(n)))


def _relative_frequencies(counter):
    """Map every key of *counter* to its count divided by the total count."""
    total = sum(counter.values())
    # An empty counter produces an empty dict, so total == 0 never divides.
    return {gram: float(count) / total for gram, count in counter.items()}


def calc_probabilities(brown):
    """Compute relative frequencies of 1- to 5-grams over tokenized sentences.

    Each sentence is split on whitespace; n-grams never span two sentences.
    The probability of an n-gram is its count divided by the total number
    of n-grams of the same order seen in the whole input.

    Args:
        brown: Iterable of strings, one sentence of tokens per item.

    Returns:
        A 5-tuple ``(unigram_p, bigram_p, trigram_p, fourgram_p, fivegram_p)``
        of dicts mapping token tuples (unigram keys are 1-tuples) to floats.
        Orders that never occur yield an empty dict.
    """
    from collections import Counter

    # counts[k] holds the (k + 1)-gram frequencies.
    counts = tuple(Counter() for _ in range(5))
    for sentence in brown:
        tokens = sentence.split()
        for order, counter in enumerate(counts, start=1):
            counter.update(_ngrams(tokens, order))
    return tuple(_relative_frequencies(counter) for counter in counts)
def q1_output(unigrams, bigrams, trigrams, fourgrams, fivegrams):
    """Write every n-gram probability to ``probabilities.txt``.

    The tables are written in the order unigrams, bigrams, trigrams,
    fourgrams, fivegrams.  Each entry becomes one line of the form
    ``tok1 tok2 ... tokN:probability``.

    Args:
        unigrams: Dict mapping 1-tuples of tokens to a probability.
        bigrams: Dict mapping 2-tuples of tokens to a probability.
        trigrams: Dict mapping 3-tuples of tokens to a probability.
        fourgrams: Dict mapping 4-tuples of tokens to a probability.
        fivegrams: Dict mapping 5-tuples of tokens to a probability.
    """
    # The context manager closes the file even if a write raises.
    with open('probabilities.txt', 'w') as outfile:
        for table in (unigrams, bigrams, trigrams, fourgrams, fivegrams):
            for gram, probability in table.items():
                outfile.write(' '.join(gram) + ':' + str(probability) + '\n')
def main():
    """Run the pipeline: POS-tag the blog corpus, then write n-gram probabilities.

    Reads ``realBlogCorpus.txt``, lets ``blogCorpusPOS`` write the tag
    sequences to ``realBlogCorpusPOS.txt``, reads that file back, and hands
    the n-gram probabilities from ``calc_probabilities`` to ``q1_output``.
    """
    # Context managers close each input file even if reading raises.
    with open('realBlogCorpus.txt', 'r') as infile:
        brown = infile.readlines()
    blogCorpusPOS(brown)

    with open('realBlogCorpusPOS.txt', 'r') as infile:
        brownPOS = infile.readlines()
    q1_output(*calc_probabilities(brownPOS))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()