-
Notifications
You must be signed in to change notification settings - Fork 10.2k
/
rules.py
40 lines (33 loc) · 1.44 KB
/
rules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
# Placeholder written in place of the whitespace that follows an abbreviation,
# so that the abbreviation's period is not followed by whitespace while
# _RE_SENTENCE runs; the _UNDO_* patterns below locate it again afterwards.
# NOTE(review): "@" may also occur in genuine input right after an
# abbreviation-shaped token (e.g. "e.g.@user"); the _UNDO_* patterns would
# match that too -- confirm this is acceptable.
_SEPARATOR = r'@'

# One sentence per match:
#   * first alternative: a non-space character, at least one further
#     character, then '.', '!' or '?' that is followed by whitespace or the
#     end of the string;
#   * second alternative (fallback): everything from a non-space character up
#     to a newline or the end of the string.
# The fallback uses `.*?` rather than `.+?` so that a remainder consisting of
# a single character (e.g. the "x" in "Hi. x") is still matched; with `.+?`
# it needed two characters and the lone trailing character was skipped by
# finditer() and therefore lost.
_RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.*?)(?=[\n]|$)', re.UNICODE)

# A capital letter, one or two lowercase letters and a period (the shape of
# "Mr.", "Dr.", "Mrs."), followed by one whitespace character and a word
# character.
_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)

# A period, a single ASCII letter and a period (the tail of a dotted acronym
# such as "U.S."), followed by one whitespace character and a word character.
_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)

# Counterparts of the two patterns above with _SEPARATOR in place of the
# whitespace character.
_UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + _SEPARATOR + r'(\w)', re.UNICODE)
_UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + _SEPARATOR + r'(\w)', re.UNICODE)
def _replace_with_separator(text, separator, regexs):
    """Join the two captured groups of every match with ``separator``.

    Each pattern in ``regexs`` is applied in order, each one working on the
    output of the previous one.  Every pattern must expose two groups; a
    match is rewritten as ``<group 1><separator><group 2>``.

    Args:
        text: The string to rewrite.
        separator: Text inserted between group 1 and group 2.  It becomes
            part of an ``re`` replacement template.
        regexs: Iterable of compiled patterns.

    Returns:
        The rewritten string (``text`` unchanged when nothing matches).
    """
    template = "".join((r"\1", separator, r"\2"))
    for pattern in regexs:
        text = pattern.sub(template, text)
    return text
def split_sentence(text, best=True):
    """Split ``text`` into sentences and yield them one by one.

    The text is first cut into chunks after Chinese/English sentence
    terminators, six-dot and double-ellipsis sequences, keeping a closing
    curly quote attached to the sentence it ends.  With ``best`` enabled each
    chunk is then split further on '.', '!' and '?' while keeping
    abbreviation-shaped tokens (see ``_AB_SENIOR`` / ``_AB_ACRONYM``)
    together with the word that follows them.

    Args:
        text: The string to split.
        best: When true, run the finer abbreviation-aware pass on every
            chunk; when false, yield the coarse chunks as they are.

    Yields:
        Non-empty sentence strings; chunks are stripped of surrounding
        whitespace.
    """
    # The fullwidth forms "！" (U+FF01), "？" (U+FF1F) and "，" (U+FF0C) are
    # written as escapes so they cannot be confused with, or silently folded
    # into, their ASCII look-alikes.  The ASCII forms are kept as well.

    # Break after a terminator unless a closing quote follows it.
    text = re.sub(r'([。!?\uff01\uff1f])([^”’])', r"\1\n\2", text)
    # Break after a six-dot ellipsis ("......").
    text = re.sub(r'(\.{6})([^”’])', r"\1\n\2", text)
    # Break after a double ellipsis character ("……").
    text = re.sub(r'(…{2})([^”’])', r"\1\n\2", text)
    # Break after terminator + closing quote, unless the sentence visibly
    # continues with a comma or another terminator.
    text = re.sub(r'([。!?\uff01\uff1f][”’])([^,。!?\uff0c\uff01\uff1f])', r'\1\n\2', text)

    for chunk in text.split("\n"):
        chunk = chunk.strip()
        if not chunk:
            continue
        if not best:
            yield chunk
            continue
        # Hide the whitespace after abbreviations so their period is not
        # taken for a sentence end by _RE_SENTENCE.
        processed = _replace_with_separator(chunk, _SEPARATOR, [_AB_SENIOR, _AB_ACRONYM])
        sents = list(_RE_SENTENCE.finditer(processed))
        if not sents:
            # Nothing matched: hand the chunk back untouched.
            yield chunk
            continue
        for sentence in sents:
            # Put a space back where the separator was inserted.
            yield _replace_with_separator(sentence.group(), r" ", [_UNDO_AB_SENIOR, _UNDO_AB_ACRONYM])