# separate_reference.py
import logging
import re

from .Grammars import reference_patterns
from .dictionary import wordlist_english

REFERENCE_PREFIX = "REF_"

# Translation table used to strip bracket and punctuation characters out of
# reference tokens.
strip_table = str.maketrans("", "", "(){}<>[].,!;?")

class separate_reference:
    """
    Detects if a reference number has been mistakenly concatenated to words
    in a document. The module removes reference numbers, with the option to
    keep each one as a token representing the reference. If a reference
    number wraps around a period at the end of a sentence, the module
    identifies this and splits the sentences properly.

    Example:
        input:  'How is the treatment going.4-5 Pretty well'
        output: 'How is the treatment going . Pretty well'
    """

    def __init__(self, reference_token=False, f_wordlist=None):
        """
        Initialize the parser.

        Args:
            reference_token: boolean, flag deciding whether removed
                reference content is kept as a token
            f_wordlist: optional path to a wordlist file; defaults to the
                bundled English wordlist
        """
        self.logger = logging.getLogger(__name__)

        if f_wordlist is None:
            f_wordlist = wordlist_english

        # Load the dictionary used to decide whether a trailing number is a
        # reference (known word) or part of the token itself.
        self.english_words = set()
        with open(f_wordlist) as FIN:
            for line in FIN:
                self.english_words.add(line.strip())

        self.reference_token = reference_token
        self.reference_pattern = reference_patterns()

    def __call__(self, text):
        """
        Call the parser.

        Args:
            text: a document string
        Returns:
            return_doc: a document string
        """
        new_doc = []
        tokens = text.strip().split()
        new_sentence = []

        for token in tokens:
            # Check if the word is of the form word4.
            new_tokens = self.single_number_pattern(token)
            if new_tokens:
                new_sentence.extend(new_tokens)
                continue

            # Check if the word is of the form word(4)
            new_tokens = self.identify_reference_punctuation_pattern(
                token, self.reference_pattern.single_number_parens,
                parens=True
            )
            if new_tokens:
                new_sentence.extend(new_tokens)
                continue

            # Check if the word is of the form word,2,3,4
            new_tokens = self.identify_reference_punctuation_pattern(
                token, self.reference_pattern.punctuation_then_number,
                forward=3
            )
            if new_tokens:
                new_sentence.extend(new_tokens)
                continue

            # Check if the word is of the form word2,3,4
            new_tokens = self.identify_reference_punctuation_pattern(
                token, self.reference_pattern.number_then_punctuation
            )
            if new_tokens:
                new_sentence.extend(new_tokens)
                continue

            # If no reference is detected, append the word as-is
            new_sentence.append(token)

        join_sentence = " ".join(new_sentence)
        new_doc.append(join_sentence)

        return_doc = " ".join(new_doc)
        return return_doc
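
    # Note on the reference_token option: when it is set, each removed
    # reference is kept in the sentence as a token prefixed with REF_ (see
    # REFERENCE_PREFIX above), so the docstring input would presumably also
    # yield a token like 'REF_4-5' alongside the split sentence.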

    # This is an ambiguous case, because there could be words that end in a
    # number yet don't represent a footnote, as is the case for chemicals.
    # The code looks up the word portion of the token, and if it is not
    # found in the dictionary, the token is assumed to be a chemical name
    # and the number is not pruned.
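    # For instance (assuming a typical English wordlist): 'table4' would be
    # split into 'table' plus the reference '4', because 'table' is a
    # dictionary word, while a token such as 'CCl4' would be left intact
    # because 'CCl' is not.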
    def single_number_pattern(self, token):
        """
        Detect the most basic case, where a single number is concatenated
        to the word token.

        Args:
            token: a string token
        Returns:
            output: a list of string tokens, or False if the pattern does
                not match
        """
        output = []
        try:
            parse_return = self.reference_pattern.single_number.parseString(
                token
            )
        except BaseException:
            return False

        if parse_return[0] not in self.english_words:
            # Unknown word (e.g. a chemical name): keep the token intact
            output.append(token)
        else:
            word = parse_return[0]
            reference = parse_return[1]
            output.append(word)
            if self.reference_token:
                output.append(REFERENCE_PREFIX + reference)
            if self.end_parens_match(parse_return):
                output[-1] = output[-1] + parse_return[-1]
        return output

    def identify_reference_punctuation_pattern(
        self, token, pattern, parens=False, forward=2
    ):
        """
        Identify whether the pyparsing pattern passed to the function is
        found in the token.

        Args:
            token: a string token
            pattern: a pyparsing grammar pattern
            parens: a boolean flagging whether the function should expect
                to recognize a reference in parentheses
            forward: number of parsed subtokens to skip before the
                reference content
        Returns:
            output: a list of string tokens, or False if the pattern does
                not match
        """
        output = []
        parse_return = pattern.searchString(token)

        if parse_return:
            substring = "".join(parse_return[0][forward:])
            index = token.find(substring)
            word = token[:index]

            if self.end_parens_match(parse_return[0], parens=True):
                end_offset = len(parse_return[0][-1]) * -1
                reference = token[len(word):end_offset]
            else:
                reference = token[len(word):]

            output.append(word)

            if self.reference_token:
                ref_token = (REFERENCE_PREFIX + reference).translate(
                    strip_table
                )
                output.append(ref_token)

                # Handle nested parens
                if (
                    len(substring) > 2
                    and substring[-2] in "])}"
                    and substring[-3] in "])}"
                ):
                    output[-1] += substring[-2]

            # Reference tokens have stripped too much
            if self.reference_token:
                if substring[-1] in ".,?!;:":
                    output[-1] += substring[-1]

            # Replace any stripped punctuation
            if substring[0] in ".,?!;:":
                output[-1] += substring[0]

            if self.end_parens_match(parse_return[0], parens=parens):
                output[-1] = output[-1] + parse_return[0][-1]
        else:
            output = False

        if output and reference[-1] in ".,?!;:" and not self.reference_token:
            output[-1] += reference[-1]

        return output

    def end_parens_match(
        self, strlist, search=re.compile(r"[^)}\]]").search, parens=False
    ):
        """
        Check if the token ends in a parenthesis that should be kept, and
        thus must not be removed as part of the reference.

        Args:
            strlist: a list of string subtokens, parsed from the given
                grammar
            search: compiled regex; defaults to matching any character that
                is not a closing bracket, so the final check passes only
                when the last subtoken consists entirely of closing
                brackets
            parens: boolean flagging whether the passed grammar is for
                references wrapped in parentheses
        Returns:
            a boolean
        """
        # Special case when parsing with the parenthetical content grammar.
        # We check whether a token ends with a parenthesis to see if we
        # need to append one to the cleaned token. However, we do not want
        # to do this if the token ends with a parenthesis that holds a
        # nested reference.
        if parens:
            LP_Paran = sum(1 for a in strlist if a == "(")
            RP_Paran = sum(1 for a in strlist if a == ")")
            LP_Bracket = sum(1 for a in strlist if a == "[")
            RP_Bracket = sum(1 for a in strlist if a == "]")
            LP_Curl = sum(1 for a in strlist if a == "{")
            RP_Curl = sum(1 for a in strlist if a == "}")

            # If every bracket type is balanced, the token's parentheses
            # are self-contained and none needs to be restored.
            FLAG_valid = (
                (LP_Paran == RP_Paran)
                and (LP_Bracket == RP_Bracket)
                and (LP_Curl == RP_Curl)
            )
            if FLAG_valid:
                return False

        return isinstance(strlist[-1], str) and not bool(search(strlist[-1]))
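
# A minimal usage sketch; the `nlpre` package name is an assumption based
# on the relative imports above, and the example text mirrors the class
# docstring:
#
#     from nlpre import separate_reference
#
#     parser = separate_reference()
#     print(parser("How is the treatment going.4-5 Pretty well"))
#     # -> 'How is the treatment going . Pretty well'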