-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
35 lines (27 loc) · 921 Bytes
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# stopwords = pkgutil.get_data(__package__, 'smart_common_words.txt')
# stopwords = stopwords.decode('ascii').split('\n')
# stopwords = {key.strip(): 1 for key in stopwords}
def _get_ngrams(n, text):
"""Calcualtes n-grams.
Args:
n: which n-grams to calculate
text: An array of tokens
Returns:
A set of n-grams
"""
# set of tuple : {('a', 'Doctor'), (), ..., }
ngram_set = set()
text_length = len(text)
max_index_ngram_start = text_length - n
for i in range(max_index_ngram_start + 1):
ngram_set.add(tuple(text[i:i + n]))
return ngram_set
def _get_word_ngrams(n, sentences):
"""Calculates word n-grams for multiple sentences.
"""
assert len(sentences) > 0
assert n > 0
# words = _split_into_words(sentences)
words = sum(sentences, [])
# words = [w for w in words if w not in stopwords]
return _get_ngrams(n, words)