-
Notifications
You must be signed in to change notification settings - Fork 4
/
preprocess.py
73 lines (61 loc) · 2.66 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# SOURCE = "https://raw.githubusercontent.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection"
import pandas as pd
import re
# KAMUSALAY = "https://raw.githubusercontent.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection/master/new_kamusalay.csv"
# STOPWORDS = "https://github.com/rifkisrg/text-mining-project-akhir/raw/70f8fb1c4a9a29cd86a32e15567777506e84ea23/stopwordbahasa.csv"
alay_dict = pd.read_csv('new_kamusalay.csv', names = ['original', 'replacement'], encoding='latin-1')
stopword_dict = pd.read_csv('stopwordbahasa.csv', names = ['stopword'], encoding='latin-1')
def lowercase(text):
return text.lower()
def remove_unnecessary_char(text):
text = re.sub('\\+n', ' ', text)
text = re.sub('\n'," ",text) # Remove every '\n'
text = re.sub('rt',' ',text) # Remove every retweet symbol
text = re.sub('RT',' ',text) # Remove every retweet symbol
text = re.sub('user',' ',text) # Remove every username
text = re.sub('USER', ' ', text)
text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text) # Remove every URL
text = re.sub(':', ' ', text)
text = re.sub(';', ' ', text)
text = re.sub('\\+n', ' ', text)
text = re.sub('\n'," ",text) # Remove every '\n'
text = re.sub('\\+', ' ', text)
text = re.sub(' +', ' ', text) # Remove extra spaces
return text
def remove_nonaplhanumeric(text):
text = re.sub('[^0-9a-zA-Z]+', ' ', text)
return text
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))
def normalize_alay(text):
return ' '.join([alay_dict_map[word] if word in alay_dict_map else word for word in text.split(' ')])
def remove_stopword(text):
text = ' '.join(['' if word in stopword_dict.stopword.values else word for word in text.split(' ')])
text = re.sub(' +', ' ', text) # Remove extra spaces
text = text.strip()
return text
def remove_emoticon_byte(text):
text = text.replace("\\", " ")
text = re.sub('x..', ' ', text)
text = re.sub(' n ', ' ', text)
text = re.sub('\\+', ' ', text)
text = re.sub(' +', ' ', text)
return text
def remove_early_space(text):
if text[0] == ' ':
return text[1:]
else:
return text
def preprocess(text):
if not isinstance(text, str):
return ''
text = remove_unnecessary_char(text) # 2
text = normalize_alay(text) # 3
text = remove_unnecessary_char(text)
text = remove_emoticon_byte(text)
text = remove_early_space(text)
text = remove_nonaplhanumeric(text)
text = lowercase(text)
text = remove_stopword(text)
text = text.replace('url', '')
text = text.replace('amp', '')
return text