tokenizer.py
import nltk
import mysql.connector
import json
from collections import Counter
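# Penn Treebank POS tags treated as closed-class (function words and punctuation);
# tokens carrying these tags are filtered out as insignificant below.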
closedClassTags = ['CD', 'CC', 'DT', 'EX', 'IN', 'LS', "''", ',', '.', '``', 'MD', 'PDT', 'POS', 'PRP', 'PRP$', 'RP', 'TO', 'UH', 'WDT', 'WP', 'WP$', 'WRB']
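# Helper that renders a list as a comma-separated string (not called in the main flow below).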
def listToString(test_list):
    return str(test_list).strip('[]')
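# Drop closed-class tokens from a POS-tagged token list and count how often
# each remaining (open-class) word appears.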
def tokenizedPreprocessor(taggedTokens):
    # Keep only open-class (content) tokens.
    openClassTokens = [token for token in taggedTokens if token[1] not in closedClassTags]
    # Tally occurrences of each remaining word.
    counts = Counter(token[0] for token in openClassTokens)
    countPairs = [list(item) for item in counts.items()]
    return [openClassTokens, countPairs]
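# Tokenize an article, POS-tag it, filter out closed-class words, and report
# the total token count alongside the filtered tokens and word tallies.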
def tokenization(tempArticle):
    tokens = nltk.word_tokenize(tempArticle)
    taggedTokens = nltk.pos_tag(tokens)
    openClassTokens, lemmaCounts = tokenizedPreprocessor(taggedTokens)
    # Word count covers every token, closed-class words included.
    significantWordCount = len(tokens)
    return openClassTokens, lemmaCounts, significantWordCount
# Required NLTK data packages; download once before the first run:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
mydb = mysql.connector.connect(
host="localhost",
user="root",
password="password",
database="newsCrawler"
)
cursor = mydb.cursor()
cursor.execute("SELECT * FROM news")
myresult = cursor.fetchall()
newArticlesTokenized = 0
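# Tokenize every crawled article that is not already in tokenizedNews.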
for row in myresult:
    # Column layout in news: (id, url, title, article)
    tempurl = row[1]
    tempTitle = row[2]
    tempArticle = row[3]
    # Skip articles that are already tokenized; a parameterized query replaces
    # the original string formatting, avoiding SQL injection and quoting bugs.
    cursor.execute("SELECT url FROM tokenizedNews WHERE url = %s", (tempurl,))
    if cursor.fetchone() is None:
        openClassTokens, lemmaCounts, significantWordCount = tokenization(tempArticle)
        sql = ("INSERT INTO tokenizedNews (url, title, tokenizedArticle, lemmaCount, wordCount) "
               "VALUES (%s, %s, %s, %s, %s)")
        val = (tempurl, tempTitle, json.dumps(openClassTokens), json.dumps(lemmaCounts), significantWordCount)
        cursor.execute(sql, val)
        mydb.commit()
        newArticlesTokenized += 1
if newArticlesTokenized > 1:
    print(newArticlesTokenized, 'articles tokenized')
elif newArticlesTokenized == 1:
    print(newArticlesTokenized, 'article tokenized')
else:
    print("No new tokenized articles")