# utils.py
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from keras.preprocessing import sequence
# Clean a tokenized comment: lowercase, strip punctuation, drop stopwords.
def clean_comment(comment):
    # Punctuation characters to strip (raw string so the backslash stays literal).
    p = r"""'!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'"""
    stop_words = stopwords.words('english')
    # Keep "not" so negation survives for sentiment-style classification.
    stop_words.remove('not')
    comment = [word.lower() for word in comment]
    comment = [''.join(w for w in word if w not in p) for word in comment]
    comment = [word for word in comment if word not in stop_words]
    return comment
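
# Example (hypothetical input, assuming the NLTK 'stopwords' corpus is present):
#   clean_comment(["This", "movie", "is", "NOT", "good!"])
#   -> ['movie', 'not', 'good']   ("this"/"is" are stopwords; "not" is kept)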
# Stemming and lemmatization
def stemming(comment):
    return [SnowballStemmer(language="english").stem(word) for word in comment]

def lemmatization(comment):
    # WordNetLemmatizer must be instantiated; requires the 'wordnet' corpus
    # (nltk.download('wordnet')).
    return [WordNetLemmatizer().lemmatize(word) for word in comment]
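
# Rough illustration of the difference (outputs are indicative and may vary
# slightly by NLTK version): stemming clips suffixes, lemmatization maps words
# to dictionary lemmas.
#   stemming(['caring', 'flies'])       -> ['care', 'fli']
#   lemmatization(['caring', 'flies'])  -> ['caring', 'fly']  (default noun POS)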
def prep_comment(comment):
    # Match alphabetic tokens, optionally with an apostrophe suffix; both the
    # straight and curly apostrophe are accepted so "don't" stays one token.
    regexp = r"([a-zA-Z]+(?:['’][a-z]+)?)"
    regex_tokenizer = RegexpTokenizer(regexp)
    comment = regex_tokenizer.tokenize(comment)
    comment = clean_comment(comment)
    # comment = stemming(comment)
    return ' '.join(comment)
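
# Example: prep_comment("This movie is NOT good!!!") -> "movie not good"
# (tokenize, then clean; the stemming step is left disabled above)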
def load_tokenizer():
    # Load the Keras tokenizer that was fitted on the training corpus.
    with open('tokenizer.pickle', 'rb') as handle:
        pkl = pickle.load(handle)
    return pkl
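
# A minimal sketch of how tokenizer.pickle could have been produced; the
# vocabulary size and the training corpus below are assumptions, not part of
# this file:
#
#   from keras.preprocessing.text import Tokenizer
#   tokenizer = Tokenizer(num_words=10000)        # hypothetical vocab size
#   tokenizer.fit_on_texts(train_comments)        # train_comments: list of str
#   with open('tokenizer.pickle', 'wb') as handle:
#       pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)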
def process(text):
    maxlen = 35
    tokenizer = load_tokenizer()
    nltk.download('stopwords', quiet=True)
    cleaned_comment = prep_comment(text)
    # texts_to_sequences expects a list of texts; passing the bare string
    # would make Keras iterate it character by character.
    word_seq_train = tokenizer.texts_to_sequences([cleaned_comment])[0]
    # Truncate to maxlen, then zero-pad on the right to a fixed length.
    word_seq_train = sequence.pad_sequences([word_seq_train[:maxlen]],
                                            maxlen=maxlen, padding="post")
    return word_seq_train
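
# Quick smoke test; assumes tokenizer.pickle sits in the working directory
# (the expected (1, 35) shape follows from maxlen=35 above).
if __name__ == "__main__":
    seq = process("This movie is NOT good!!!")
    print(seq.shape)  # (1, 35)
    print(seq)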