Preprocessing.py
# Required libraries
import PyPDF2 #pdf library
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import string
from nltk.stem import WordNetLemmatizer
from matplotlib import pyplot
import regex as re
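# NOTE: the NLTK data packages used below may need to be downloaded once
# (uncomment if they are not already available on this machine):
# nltk.download('punkt')                        # word_tokenize
# nltk.download('stopwords')                    # stopwords corpus
# nltk.download('averaged_perceptron_tagger')   # nltk.pos_tag
# nltk.download('wordnet')                      # WordNetLemmatizer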
#--------------------------------------------------------------------#
# creating a pdf file object
pdfFileObj = open('combined_example.pdf', 'rb')
# creating a pdf reader object
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
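# NOTE: PdfFileReader / numPages / getPage / extractText are the legacy
# PyPDF2 (pre-3.0) API that this script assumes. On PyPDF2 3.x (or pypdf)
# the equivalents are PdfReader, len(reader.pages), reader.pages[i] and
# page.extract_text().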
# ----------------------------------------------------------------------
# printing number of pages in pdf file
# print(pdfReader.numPages)
# creating a page object
# pageObj = pdfReader.getPage(0)
# extracting text from page
# print(pageObj.extractText())
print("---------------------------------------------------------------------")
print("Page contents")
for i in range(pdfReader.numPages):
    page_to_print = pdfReader.getPage(i)
    print(page_to_print.extractText().encode('utf8'))
num_pages = pdfReader.numPages
count = 0
text = ""
while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count += 1
    text += pageObj.extractText()
print("---------------------------------------------------------------------")
# -------------------------------------------------------------------------------------
# Tokenizations
print("Tokenizations")
tokenization_words = word_tokenize(text)
print(tokenization_words)
# -------------------------------------------------------------------------------------
# Normalization
# keep only purely alphabetic tokens (drops punctuation and number tokens)
punctuation_words = [word for word in tokenization_words if word.isalpha()]
# convert to lower case
lower_case = [w.lower() for w in punctuation_words]
# strip surrounding whitespace
remove_spaces = [w.strip() for w in lower_case]
# drop any remaining non-alphabetic characters inside each token
remove_num = [''.join(x for x in w if x.isalpha()) for w in remove_spaces]
# -------------------------------------------------------------------------------------
print("---------------------------------------------------------------------")
print("Stopwords")
stop_words = set(stopwords.words('english'))
result = [word for word in remove_num if word not in stop_words]
print(result)
print("---------------------------------------------------------------------")
# -------------------------------------------------------------------------------------
print("POS Tag")
tag = nltk.pos_tag(result)
print(tag)
print("---------------------------------------------------------------------")
# -------------------------------------------------------------------------------------
print("Stemming")
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in result]
print(stemmed)
print("---------------------------------------------------------------------")
# -------------------------------------------------------------------------------------
# Lemmatization
print("Lemmatization")
wordnet_lemmatizer = WordNetLemmatizer()
lemmatize = [wordnet_lemmatizer.lemmatize(word) for word in result]
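# NOTE: lemmatize() is called here without a POS argument, so WordNet treats
# every token as a noun by default; passing the POS tags computed above would
# give more accurate lemmas, but that refinement is not part of this script.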
print(lemmatize)
print("---------------------------------------------------------------------")
# -------------------------------------------------------------------------------------
# write the results to text files
with open("Tokenization.txt", "w", encoding="utf8") as text_file:
    text_file.write("{0}".format(tokenization_words))
with open("Stopwords.txt", "w", encoding="utf8") as text_file:
    text_file.write("{0}".format(result))
with open("Tag.txt", "w", encoding="utf8") as text_file:
    text_file.write("{0}".format(tag))
with open("Lemmatization.txt", "w", encoding="utf8") as text_file:
    text_file.write("{0}".format(lemmatize))
with open("Stemmed.txt", "w", encoding="utf8") as text_file:
    text_file.write("{0}".format(stemmed))
print("successfully saved!")
print("---------------------------------------------------------------------")
# -------------------------------------------------------------------------------------