# preprocessor.py
# Forked from paradous/chatbot.
from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions
from config import Config
# Shared configuration instance; Preprocessor reads MODEL_PREPROCESS from it
# to decide which spaCy model to load.
config = Config()
class Preprocessor:
    """Clean raw text and reduce it to a space-separated string of lemmas.

    The static helpers each perform one normalization step and can be used
    on their own; ``preprocess`` chains them and then runs the spaCy model
    loaded in ``__init__`` to lemmatize the result.
    """

    def __init__(self):
        # Load SpaCy model for preprocess. Default: en_core_web_sm
        self.nlp = spacy.load(config.MODEL_PREPROCESS)

    @staticmethod
    def strip_html_tags(text: str) -> str:
        """Remove html tags from the document.

        The text content of adjacent elements is joined with a single space.
        """
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text(separator=" ")

    @staticmethod
    def expand_contractions(text: str) -> str:
        """Expand shortened words, e.g. 'don't' to 'do not'."""
        return contractions.fix(text)

    @staticmethod
    def remove_accented_chars(text: str) -> str:
        """Transliterate accented characters to ASCII, e.g. 'café' to 'cafe'."""
        return unidecode.unidecode(text)

    @staticmethod
    def remove_whitespace(text: str) -> str:
        """Collapse every run of whitespace to one space and trim both ends."""
        # str.split() with no separator already discards leading/trailing
        # whitespace, so no separate strip() is required.
        return " ".join(text.split())

    @staticmethod
    def limit_n_words(text: str, limit: int = 256) -> str:
        """Limit a text to n-words. Default: 256.

        Words are whitespace-delimited; the kept words are re-joined with
        single spaces, so whitespace runs are collapsed as a side effect.
        """
        words = text.split()[:limit]
        return " ".join(words)

    @staticmethod
    def _normalize_token(token) -> str:
        """Return the normalized string for one token of a spaCy document.

        Tokens tagged ``NUM`` are converted to digits when word2number can
        parse them; every other token is replaced by its lemma.
        """
        if token.pos_ == "NUM":
            try:
                return str(w2n.word_to_num(token.text))
            except ValueError:
                # word2number raises ValueError when it cannot find a number
                # word it understands in the token; keep the original text
                # instead of aborting the whole pipeline.
                return token.text
        # "-PRON-" is the placeholder lemma older spaCy releases assign to
        # pronouns; fall back to the surface text in that case.
        if token.lemma_ != "-PRON-":
            return token.lemma_
        return token.text

    def preprocess(self, text: str) -> str:
        """Apply a preprocess pipeline to a given text.

        The text is stripped of HTML, has its contractions expanded, is
        transliterated to ASCII, truncated to the default word limit and
        lowercased. It is then tokenized with the loaded spaCy model and
        each token is normalized (number words to digits, other tokens to
        their lemma).

        Returns:
            The normalized tokens joined by single spaces.
        """
        # Apply all preformatting
        text = self.strip_html_tags(text)
        text = self.expand_contractions(text)
        text = self.remove_accented_chars(text)
        # NOTE(review): contractions are expanded a second time here while
        # remove_whitespace is never called. Presumably the second pass is
        # meant to catch contractions whose apostrophes were only normalized
        # by the transliteration step -- confirm this is intentional.
        # (limit_n_words below already collapses whitespace.)
        text = self.expand_contractions(text)
        text = self.limit_n_words(text)
        text = text.lower()

        # Tokenize the text
        document = self.nlp(text)
        return " ".join(self._normalize_token(token) for token in document)