# prepare.py
import pandas as pd
import numpy as np
import os
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
# functions
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    # Normalize unicode characters and drop anything that won't map to ascii.
    string = unicodedata.normalize('NFKD', string)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')
    # Remove anything that is not a letter, digit, or whitespace, then lowercase.
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string
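# Example: basic_clean("Café au lait!") returns "cafe au lait"
# (accents stripped, punctuation removed, lowercased).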
##############################
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = ToktokTokenizer()
    # Use tokenizer and return the tokens joined back into a single string.
    string = tokenizer.tokenize(string, return_str=True)
    return string
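# Example: with return_str=True the tokens come back as one space-separated
# string, so tokenize("hello,world") should yield "hello , world".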
#############################
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.stem.PorterStemmer()
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(stems)
    return string
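# Example: stem("running calls") should return "run call"
# (Porter stemming chops suffixes; the results are not always dictionary words).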
#############################
def lemmatize(string):
    '''
    This function takes in a string and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    return string
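# Example: lemmatize("geese cars") should return "goose car"; without
# part-of-speech tags WordNetLemmatizer treats every word as a noun, so a
# verb form like "running" would be left unchanged.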
#############################
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string, plus optional extra_words and exclude_words
    parameters with default empty lists, and returns the string with stopwords removed.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)
    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    # Split words in string.
    words = string.split()
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    # Join words in the list back into a string and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    return string_without_stopwords
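# Example: remove_stopwords("this is a very important message") returns
# "important message"; passing exclude_words=['very'] would keep "very",
# and extra_words=['message'] would drop "message" as well.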
###############################
def prep_repo_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function takes in a df and the string name of a text column, with
    the option to pass lists for extra_words and exclude_words, and
    returns a df with the original text plus stemmed, lemmatized, and
    clean (normalized, tokenized, stopwords removed, lemmatized) versions of that column.
    '''
    # Drop duplicate rows, keeping the first instance of each.
    df = df.drop_duplicates(subset=None, keep='first')
    # 'clean': normalize, tokenize, remove stopwords, then lemmatize.
    df['clean'] = df[column].apply(basic_clean)\
        .apply(tokenize)\
        .apply(remove_stopwords,
               extra_words=extra_words,
               exclude_words=exclude_words)\
        .apply(lemmatize)
    # 'stemmed' and 'lemmatized': normalized text with stemming / lemmatization only.
    df['stemmed'] = df[column].apply(basic_clean).apply(stem)
    df['lemmatized'] = df[column].apply(basic_clean).apply(lemmatize)
    return df[['language', 'repo', column, 'stemmed', 'lemmatized', 'clean']]
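# Hypothetical call (the column name and word lists below are placeholders):
#   prepped = prep_repo_data(df, 'readme_contents',
#                            extra_words=['github'], exclude_words=['not'])
# The input df is expected to already have 'language' and 'repo' columns
# alongside the text column.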
def add_columns(df):
    '''
    This function takes in the prepped df and adds a 'words' column (list of
    the words in each document) and a 'doc_length' column (word count).
    '''
    # Reset the index so the concat below lines up row-for-row
    # (drop_duplicates in prep_repo_data can leave gaps in the index).
    df = df.reset_index(drop=True)
    # Add a column that is a list of each word for each repo.
    words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df.clean]
    # Column name will be 'words', and the column will contain lists of the words in each doc.
    df = pd.concat([df, pd.DataFrame({'words': words})], axis=1)
    # Add a column that shows the length of each document in words.
    df['doc_length'] = [len(wordlist) for wordlist in df.words]
    return df
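# Example: a row whose clean text is "web scraping tool" ends up with
# words == ['web', 'scraping', 'tool'] and doc_length == 3.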
def split_repo_data(df):
    '''
    This function splits the prepped df into train, validate, and test sets
    (roughly 60/20/20), stratified on the language column.
    '''
    from sklearn.model_selection import train_test_split
    # First split off 20% of the data as the test set.
    train_validate, test = train_test_split(df[['language',
                                                'clean', 'words', 'doc_length']],
                                            stratify=df.language,
                                            test_size=.2,
                                            random_state=123)
    # Then split the remaining 80% into train (75% of it) and validate (25% of it).
    train, validate = train_test_split(train_validate,
                                       stratify=train_validate.language,
                                       test_size=.25,
                                       random_state=123)
    return train, validate, test
print("Prepare module loaded.")