# utils.py
import pandas as pd
import re
import gensim
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import nltk
import spacy
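
# One-time setup (sketch): the NLTK and spaCy resources used below are not bundled
# with the packages themselves. They can be fetched once from the command line,
# for example:
#   python -m nltk.downloader stopwords
#   python -m spacy download en_core_web_sm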

# The MyDataset class ingests a dataset and manages preprocessing for the models.
class MyDataset:
    # Overview: Constructor for MyDataset; modify it to ingest datasets that are organized differently.
    # Parameters: fn: name of the dataset file
    # Output: n/a
    def __init__(self, fn):
        # NOTE: self.dataset must contain the columns 'Article', 'Heading', and 'Date'.
        self.filename = fn
        # ISO-8859-1 matches the example data; adjust or drop the encoding for other datasets.
        self.dataset = pd.read_csv(fn, encoding="ISO-8859-1")

    # Overview: Prepares the article text for LDA topic modeling
    # Parameters: n/a
    # Output: list of tokenized, cleaned documents (see the example_lda_usage sketch below)
    def preprocess_for_LDA(self):
        data = Preprocess.text_to_list(self.dataset.Article)
        data_words = list(Preprocess.remove_special_chars(data))
        clean_data = Preprocess.remove_stopwords(data_words)
        return clean_data

    # Overview: Prepares data for BERT-based models
    # Parameters: n/a
    # Output: list of cleaned article strings and the corresponding list of dates
    #         (see the example_bert_usage sketch below)
    def preprocess_for_bert(self):
        # Strip URLs and lowercase the text.
        self.dataset.Article = self.dataset.apply(lambda row: re.sub(r"http\S+", "", row.Article).lower(), axis=1)
        # Drop @-mentions.
        self.dataset.Article = self.dataset.apply(lambda row: " ".join(filter(lambda x: x[0] != "@", row.Article.split())), axis=1)
        # Keep alphabetic characters only and collapse whitespace.
        self.dataset.Article = self.dataset.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.Article).split()), axis=1)
        clean_data = self.dataset.Article.to_list()
        dates = list(self.dataset.Date)
        return clean_data, dates

    # Overview: Prepares the headlines for sentiment analysis
    # Parameters: n/a
    # Output: generator of cleaned, tokenized headlines (see the example_vader_usage sketch below)
    def preprocess_for_VADER(self):
        return Preprocess.remove_special_chars(self.dataset.Heading)

    # Overview: Prints dataset head
    # Parameters: n/a
    # Output: n/a
    def show(self):
        print(self.dataset.head())

# A class containing static functions for preprocessing; uses re, gensim, and nltk.
class Preprocess:
    # Converts a pandas Series of texts into a plain Python list.
    @staticmethod
    def text_to_list(texts):
        return texts.values.tolist()

    # Generator that strips emails, newlines, and quotes, then tokenizes each article.
    @staticmethod
    def remove_special_chars(Article):
        for s in Article:
            s = re.sub(r'\S*@\S*\s?', '', s)  # remove emails
            s = re.sub(r'\s+', ' ', s)  # remove newline chars
            s = re.sub("'", "", s)  # remove single quotes
            s = gensim.utils.simple_preprocess(str(s), deacc=True)  # tokenize and deaccent
            yield s

    # Removes stopwords, builds bigram/trigram phrases, and lemmatizes the documents.
    @staticmethod
    def remove_stopwords(Articles):
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        bigram = gensim.models.Phrases(Articles, min_count=5, threshold=100)  # higher threshold -> fewer phrases
        trigram = gensim.models.Phrases(bigram[Articles], threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)
        nltk.download('stopwords')
        stop_words = stopwords.words('english')
        stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'also', 'may', 'take', 'come'])
        texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in Articles]
        texts = [bigram_mod[doc] for doc in texts]
        texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
        texts_out = []
        # If loading the model fails, run: python -m spacy download en_core_web_sm
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        # Remove stopwords once more after lemmatization.
        texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]
        return texts_out

    # Lemmatizes tokenized documents, keeping only the allowed parts of speech.
    @staticmethod
    def lemmatize(Articles):
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        texts_out = []
        for s in Articles:
            doc = nlp(" ".join(s))
            texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
        return texts_out
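
# The example_* functions below are illustrative sketches of how the preprocessing
# output might feed downstream models. The file name, model names, and
# hyperparameters (e.g. 'Articles.csv', num_topics=10) are assumptions for
# demonstration, not part of the original pipeline.
def example_lda_usage():
    # Build a gensim LDA model from the cleaned, tokenized documents.
    from gensim import corpora
    dataset = MyDataset('Articles.csv')
    docs = dataset.preprocess_for_LDA()
    id2word = corpora.Dictionary(docs)
    corpus = [id2word.doc2bow(doc) for doc in docs]
    lda = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=10, passes=5)
    print(lda.print_topics())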
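
def example_bert_usage():
    # Sketch: embed the cleaned articles with a BERT-style sentence encoder.
    # Assumes the sentence-transformers package and the 'all-MiniLM-L6-v2' model,
    # neither of which is a dependency of this file.
    from sentence_transformers import SentenceTransformer
    dataset = MyDataset('Articles.csv')
    docs, dates = dataset.preprocess_for_bert()
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(docs, show_progress_bar=True)
    print(embeddings.shape, len(dates))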
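
def example_vader_usage():
    # Sketch: score the cleaned headlines with NLTK's VADER sentiment analyzer.
    # preprocess_for_VADER yields token lists, so the tokens are re-joined into
    # strings before scoring; note that VADER normally works best on raw text,
    # this simply mirrors the pipeline's cleaned output.
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    nltk.download('vader_lexicon')
    dataset = MyDataset('Articles.csv')
    sia = SentimentIntensityAnalyzer()
    for tokens in dataset.preprocess_for_VADER():
        headline = " ".join(tokens)
        print(headline, sia.polarity_scores(headline))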

# Runs the original smoke tests for dataset loading and the preprocessing steps.
def testing_script():
    # Testing loading data
    x = MyDataset('Articles.csv')
    x.show()
    # Testing text_to_list
    data = Preprocess.text_to_list(x.dataset.Article)
    print(data[:1])
    # Testing remove_special_chars
    data_words = list(Preprocess.remove_special_chars(data))
    print(data_words[:1])
    # Testing remove_stopwords
    clean_data = Preprocess.remove_stopwords(data_words)
    print(clean_data[:1])
    # Testing lemmatize
    data_lemmatized = Preprocess.lemmatize(data_words)
    print(data_lemmatized[:1])

def main():
    testing_script()

if __name__ == "__main__":
    main()