forked from srvCodes/Gender-Classification-of-Blog-Author
-
Notifications
You must be signed in to change notification settings - Fork 0
/
baseFeatures.py
72 lines (53 loc) · 2.54 KB
/
baseFeatures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import nltk
from collections import Counter
import string
import re
import yuleK
import sentiWordNet
from nltk.tag.stanford import StanfordPOSTagger
# Module-level Stanford POS tagger, constructed once at import time.
# The first argument is the path to the tagger model file and the second is the
# path to the Stanford POS tagger jar; both are absolute paths on the original
# author's machine and must be adjusted for any other environment.
# NOTE(review): `st` is not referenced in the part of the file visible here --
# confirm it is used elsewhere before relying on or removing it.
st = StanfordPOSTagger('/home/saurav/Documents/postagger/models/english-bidirectional-distsim.tagger',
'/home/saurav/Documents/postagger/stanford-postagger.jar')
def baseFeatures(text):
    """Compute 16 surface-level stylometric features for ``text``.

    Args:
        text: The document to analyse, as a string.

    Returns:
        A 16-tuple, in this order:
        (countSentences, countWords, countWordPerSentence, countCharacters,
        countCharactersPerSentence, normalizedAlphabets, normalizedDigits,
        normalizedSpaces, normalizedSpecialChars, normalizedShortWords,
        normalizedPunctuations, averageWordLength,
        normalizedQuestionPerPunctuations, lexicalRichness,
        sentimentPosScore, sentimentNegScore).

        If ``text`` contains no whitespace-separated words (empty or
        whitespace-only), all 16 entries are ``0.0``.

        ``lexicalRichness`` is whatever ``yuleK.yule(text)`` returns, or ``0``
        if that call raises. The two sentiment scores are the pair returned by
        ``sentiWordNet.sentimentFeature(text)``.
    """
    words = text.split()
    countWords = len(words)
    if countWords == 0:
        # Nothing to measure: return an all-zero feature vector.
        return (0.0,) * 16

    # One "sentence" per '.', '?' or '!' occurrence; at least one sentence so
    # the per-sentence ratios below never divide by zero.
    countSentences = text.count('.') + text.count('?') + text.count('!')
    if countSentences <= 0:
        countSentences = 1
    countWordPerSentence = countWords * 1.0 / countSentences

    # Character count excludes one character per counted sentence.
    countCharacters = len(text) - countSentences
    if countCharacters <= 0:
        # Happens for e.g. "a" or "?!"; clamp like the other denominators so
        # the normalisations below cannot raise ZeroDivisionError.
        countCharacters = 1
    countCharactersPerSentence = countCharacters * 1.0 / countSentences

    countAlphabets = sum(c.isalpha() for c in text)
    normalizedAlphabets = countAlphabets * 1.0 / countCharacters
    countDigits = sum(c.isdigit() for c in text)
    normalizedDigits = countDigits * 1.0 / countCharacters
    countSpaces = sum(c.isspace() for c in text)
    normalizedSpaces = countSpaces * 1.0 / countCharacters
    # NOTE: when the text has no sentence terminator, countSentences was
    # clamped to 1 and is still subtracted above, so this can come out as -1.
    countSpecialChars = countCharacters - countAlphabets - countDigits - countSpaces
    normalizedSpecialChars = countSpecialChars * 1.0 / countCharacters

    # Short words are words of at most 4 characters.
    countShortWords = sum(1 for word in words if len(word) <= 4)
    normalizedShortWords = countShortWords * 1.0 / countWords

    # Punctuation: the six marks below, plus one per quoted span
    # ('...' or "...") found by the regexes.
    countPunctuations = sum(text.count(mark) for mark in '.,!?:;')
    doubleQuotes = re.findall(r'\"(.+?)\"', text)
    singleQuotes = re.findall(r'\'(.+?)\'', text)
    countPunctuations += len(singleQuotes) + len(doubleQuotes)
    normalizedPunctuations = countPunctuations * 1.0 / countCharacters

    averageWordLength = sum(len(word) for word in words) / countWords

    countQuestionMark = text.count('?')
    # Clamp only after normalizedPunctuations has been computed, so that ratio
    # still uses the real count; the clamp just protects the next division.
    if countPunctuations <= 0:
        countPunctuations = 1
    normalizedQuestionPerPunctuations = countQuestionMark * 1.0 / countPunctuations

    try:
        lexicalRichness = yuleK.yule(text)
    except Exception:
        # Best effort: fall back to 0 if the richness measure cannot be
        # computed, without swallowing KeyboardInterrupt/SystemExit.
        lexicalRichness = 0

    sentimentPosScore, sentimentNegScore = sentiWordNet.sentimentFeature(text)

    return (countSentences, countWords, countWordPerSentence, countCharacters,
            countCharactersPerSentence, normalizedAlphabets, normalizedDigits,
            normalizedSpaces, normalizedSpecialChars, normalizedShortWords,
            normalizedPunctuations, averageWordLength,
            normalizedQuestionPerPunctuations, lexicalRichness,
            sentimentPosScore, sentimentNegScore)
#print(baseFeatures(""""""))