#### MEMORY-BASED SHALLOW PARSER ######################################################################
# Copyright (c) 2003-2010 University of Antwerp, Belgium and Tilburg University, The Netherlands
# Vincent Van Asch <vincent.vanasch@ua.ac.be>, Tom De Smedt <tom@organisms.be>
# License: GNU General Public License, see LICENSE.txt
### RELATION FINDER ##################################################################################
# MBSP's first step is to find the role of each word in the sentence,
# then to find groups ("chunks") of words that belong together,
# then to find relations between the different groups.
# The relation finder retrieves relations in the verb-argument structure (verb, object, subject, ...)
#
# This is a substitute for the Perl relation finder (relfinder.pl)
# The updated substitute makes it possible:
# - to tag ADJP-PRD's,
# - to use sentence splitting.
#
# Takes a PNP sentence:
# Make/VB/I-VP/O an/DT/I-NP/O oval/NN/I-NP/O ././O/O
# and returns a REL sentence:
# Make/VB/I-VP/O/VP-1 an/DT/I-NP/O/NP-OBJ-1 oval/NN/I-NP/O/NP-OBJ-1 ././O/O
#
# RELFINDER.PL changes:
# The known differences with relfinder.pl when composing feature vectors (instances) are:
# - commas stay commas in the instances (like in the instancebase) and are not substituted with COMMA,
# - commas are counted when computing the distance of a focus word to a verb,
# - there is no maximum distance. In relfinder.pl, instances with a distance above a certain threshold
# are not retained.
import re
import config
import client
try:
    HOST = config.hosts[config.servers.index('relation')] # Relation server host (e.g. localhost).
    PORT = config.ports[config.servers.index('relation')] # Relation server port.
except (ValueError, IndexError):
    # No 'relation' server defined in config; leave host and port unset.
    HOST = ''
    PORT = 0
#--- SPLIT TOKENS ------------------------------------------------------------------------------------
def _split(tagged_string):
""" Takes a PNP-tagged sentence and splits tokens into a list.
Tokens in the input string must contain word, part-of-speech, chunk and preposition tags.
For example: "Make/VB/I-VP/O an/DT/I-NP/O oval/NN/I-NP/O ././O/O"
yields [[Make, VB, I-VP, O] , [an, DT, I-NP, O] , [oval, NN, I-NP, O] , [., ., O, O]]
- token[0] = the word,
- token[1] = the part-of-speech tag,
- token[2] = the chunk tag,
- token[3] = the PNP tag,
- token[4] = original index
"""
# Make all spaces singular.
# Remove leading and trailing whitespace.
s = re.sub(' +', ' ', tagged_string)
s = s.strip()
# Split every token into tags and append its index in the original sentence:
T = []
for i, token in enumerate(s.split(' ')):
tags = token.split('/') + [i]
if len(tags) != 5:
# We end up here if a token is missing pos/chunk/pnp tags.
s = 'Input of _split() is not a well-formed PNP sentence (expected WORD/POS/CHUNK/PNP tags).'
raise Exception, s
T.append(tags)
return T
def _join(s):
    """ Converts a list of tokens from _split() back into a sentence string.
    """
    return " ".join(['/'.join(token) for token in s])
#--- STEP 1 ------------------------------------------------------------------------------------------
# Step 1 collects indices of useful words to create TiMBL lookup instances with.
def _collapse(token, map={'O': '-'}):
    """ Returns the token without I/B chunk tag prefixes, e.g. B-NP => NP.
        Chunk tags are then replaced using the given map (by default, O becomes '-').
    """
    ch = token[2].split('-')[-1]
    ch = map.get(ch, ch)
    return [token[0], token[1], ch, token[3], token[4]]
def _np_pnp(token):
    """ Returns the token without I/B chunk tag prefixes,
        with the chunk tag replaced by PNP for tokens inside a prepositional noun phrase.
    """
    if token[3] != 'O':
        return [token[0], token[1], 'PNP', token[3], token[4]]
    else:
        return _collapse(token)
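# For example (illustrative tokens in the format produced by _split()):
#   _collapse(['oval', 'NN', 'I-NP', 'O', 2])  => ['oval', 'NN', 'NP', 'O', 2]
#   _np_pnp(['in', 'IN', 'I-PP', 'B-PNP', 5])  => ['in', 'IN', 'PNP', 'B-PNP', 5]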
def _step1(s):
    """ Iterates the list of tokens from _split() and returns a reduced representation of the sentence.
        Returns a tuple with:
        1) A list of indices of verbs in the sentence. Only verb phrase heads are included.
        2) A list of indices of commas in the sentence.
        3) A list of indices. For each word, the chunk head index of the chunk this word belongs to.
        4) A list of tokens that must be taken into account when constructing the feature vectors.
           This roughly corresponds to the chunk heads.
        5) A list of indices. For each word, either -1 if the word is not represented
           in list 4) or otherwise the index of the token in 4).
        6) A list of indices of the tokens for which an instance must be made.
        7) A list with, for each word, either '' (the word is not in any verb chunk)
           or an integer n indicating that the word is in the nth verb phrase.
    """ # Example: "Hello , my name is Inigo Montoya ."
    verbs     = [] # 1) A list of indices of verbs in the sentence. [4]
    commas    = [] # 2) A list of indices of commas in the sentence. [1]
    chunks    = [] # 3) A list of indices of the chunk heads. [0, 1, 3, 3, 4, 6, 6, 7]
    heads     = [] # 4) A list of split tokens that are chunk heads. [[',', ',', '-', 'O', 1], ['name', ...]]
    indices   = [] # 5) The index of the token in heads. [-1, 0, -1, 1, 2, -1, 3, -1]
    instances = [] # 6) Words for which an instance must be made. [3, 6]
    VP_chunks = [] # 7) Ordinal of the verb phrase a word belongs to. ['', '', '', '', 1, '', '', '']
    VP_start = True
    VP_count = 0
    VB = ('VB', 'VBZ', 'VBD', 'VBN', 'VBG', 'VBP')
    ch, prev = [], ' '
    for i, token in enumerate(s):
        # 1) Collect verb indices.
        #    Only take the verb phrase heads (i.e. the last verb in the verb phrase).
        if token[2] == 'I-VP' and token[1] in VB:
            b, j = True, i+1
            while j < len(s) and s[j][2] == 'I-VP':
                if s[j][1] in VB: b = False; break
                j += 1
            if b:
                verbs.append(i)
        # 2) Collect comma indices.
        if token[0] == ',':
            commas.append(i)
        # 3) Collect chunk head indices.
        #    We are inside a chunk or PNP when: I-NP + I-NP, B-NP + I-NP or I-PP + I-NP.
        if token[2] == prev and token[2] != 'O' \
        or token[2][0] == 'I' and prev[0] == 'B' and token[2][2:] == prev[2:] \
        or token[2] == 'I-NP' and prev == 'I-PP':
            ch.append(i)
        else:
            if ch: chunks.extend([ch[-1]] * len(ch))
            ch = [i]
        prev = token[2]
        # 4) Collect the tokens that must be taken into account when constructing the feature vectors.
        #    This roughly corresponds to the chunk heads:
        #    - nouns, but only the last one in a NP,
        #    - adjectives (cmp, sup), but only the last one of the ADJP chunk, and not inside a PNP,
        #    - verbs, but only the heads and inside a VP,
        #    - coordinators, personal pronouns, what, punctuation,
        #    - adverbs as long as they are not in a NP,
        #    - that (WDT) + numbers, but only if the last one in a NP.
        #    Collapse I/B and replace the "NP" chunk tag with "PNP" inside prepositional noun phrases.
        head_index = -1
        is_head = (i+1 == len(s) or s[i+1][2] != 'I-NP') # head = the last token in a NP chunk
        if token[1].startswith('N') and is_head \
        or token[1].startswith('J') and is_head and token[2] in ('I-ADJP',) and token[3] == 'O' \
        or token[1].startswith('V') and token[4] in verbs \
        or token[1] in ('CC', 'PRP', 'WP', ',', '"') \
        or token[1] in ('RB', 'RBR') and token[2] not in ('B-NP', 'I-NP') \
        or token[1] in ('CD', 'WDT') and is_head:
            heads.append(_np_pnp(token)) # Rewrite NP tag to PNP tag.
            head_index = len(heads)-1
        # 5) Collect references to the heads list in 4).
        #    Contains -1 if the word is not represented in heads,
        #    the index of the token in heads otherwise.
        indices.append(head_index)
        # 6) Collect indices of words for which to create instances.
        if token[1] in ('NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'WP', 'WDT', 'CD') and is_head \
        or token[1] in ('JJ', 'JJR', 'JJS') and token[2] in ('I-ADJP',):
            instances.append(i)
        # 7) Collect VP ordinals.
        #    For each word, the list contains either '' (not in any verb chunk) or an integer
        #    indicating that the word is in the nth verb phrase.
        if token[2] == 'I-VP':
            if VP_start:
                VP_chunks.append(VP_count+1); VP_count += 1; VP_start = False
            else:
                VP_chunks.append(VP_count)
        else:
            VP_chunks.append(''); VP_start = True
    # Flush the last pending chunk for step 3).
    if ch:
        chunks.extend([ch[-1]] * len(ch))
    return (verbs, commas, chunks, heads, indices, instances, VP_chunks)
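# A worked example of _step1() on the docstring sentence (a sketch; the POS/chunk
# tags below are assumed chunker output, not taken from the original source):
#   s = _split('Hello/UH/I-INTJ/O ,/,/O/O my/PRP$/I-NP/O name/NN/I-NP/O '
#              'is/VBZ/I-VP/O Inigo/NNP/I-NP/O Montoya/NNP/I-NP/O ././O/O')
#   verbs, commas, chunks, heads, indices, instances, VP_chunks = _step1(s)
#   verbs     => [4]                       # "is" heads the only verb phrase.
#   chunks    => [0, 1, 3, 3, 4, 6, 6, 7]  # Each word mapped to its chunk head.
#   instances => [3, 6]                    # Lookup instances for "name" and "Montoya".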
#--- STEP 2 ------------------------------------------------------------------------------------------
# Step 2 collects distances between verbs and other words (specifically commas and other verbs).
def _step2(verbs, commas, chunks, sentence_length):
    """ Returns a (distance, comma, verb)-tuple of dictionaries.
        The keys of each dictionary are the verb indices.
        For each sentence word you can then look up the chunk distance to a verb,
        and the number of commas or verbs between the word and that verb.
        For example: "Hello , my name is Inigo Montoya ."
        { 4: [-3, -2, -1, -1, 0, 1, 1, 2] } => The distance between "hello" and "is" is -3 (3 chunks before).
        { 4: [1, '-', 0, 0, '-', 0, 0, 0] } => There is one comma between "hello" and "is".
        { 4: [0, 0, 0, 0, '-', 0, 0, 0] }  => There are no verbs between "hello" and "is".
        The given parameters verbs, commas, chunks can be acquired from _step1().
    """
    distance = {} # 1) Distance from verb to word.
    comma    = {} # 2) Number of commas between verb and word.
    verb     = {} # 3) Number of verbs between verb and word.
    for i in verbs:
        # 1) Distance from verb to each other word.
        D = [0]
        if i != len(chunks)-1:
            n, prev = 0, ''
            for j in range(i+1, len(chunks)): # Traverse words after the verb.
                if chunks[j] != prev:
                    D.append(n+1); n += 1; prev = chunks[j]
                else:
                    D.append(n)
        if i != 0:
            n, prev = 0, chunks[i]
            for j in reversed(range(0, i)): # Traverse words in front of the verb.
                if chunks[j] != prev:
                    D.insert(0, -1*(n+1)); n += 1; prev = chunks[j]
                else:
                    D.insert(0, -1*n)
        distance[i] = D
        # 2) Number of commas between verb and word.
        # 3) Number of verbs between verb and word.
        C = ['-']
        V = ['-']
        nc, nv = 0, 0 # Comma / verb count.
        for j in range(i+1, sentence_length): # Traverse words after the verb.
            if j in commas:
                C.append('-'); nc += 1
            elif j in verbs:
                C.append('-')
            else:
                C.append(nc)
            if j in verbs:
                V.append('-'); nv += 1
            else:
                V.append(nv)
        nc, nv = 0, 0
        for j in reversed(range(0, i)): # Traverse words in front of the verb.
            if j in commas:
                C.insert(0, '-'); nc += 1
            elif j in verbs:
                C.insert(0, '-')
            else:
                C.insert(0, nc)
            if j in verbs:
                V.insert(0, '-'); nv += 1
            else:
                V.insert(0, nv)
        comma[i] = C
        verb[i] = V
    return (distance, comma, verb)
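# For example, with the values from the _step1() sketch above:
#   distance, comma, verb = _step2([4], [1], [0, 1, 3, 3, 4, 6, 6, 7], 8)
#   distance[4] => [-3, -2, -1, -1, 0, 1, 1, 2]  # "Hello" lies three chunks before "is".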
#--- REL INSTANCES -----------------------------------------------------------------------------------
def _instances(index, distance, verbs, verb_map, comma_map, heads, head_index, sentence):
    """ Makes lookup instances of the token at index for the relation server.
        Returns a tuple of two lists (instances, indices of verbs).
        An instance is a space-separated string with 18 fields:
         0) distance   1) verb count   2) comma count   3) verb    4) verb pos
         5) -2 word    6) -2 pos       7) -2 chunk
         8) -1 word    9) -1 pos      10) -1 chunk
        11) f prep    12) f word      13) f pos       14) f chunk
        15) +1 pos    16) +1 chunk    17) ? (the class to predict)
    """
    def _tag(position, field, verb_index):
        # Returns token[field] of the head at the given position relative to the focus word.
        # When the focus word follows the verb, context heads before the verb are masked with '-'.
        i = head_index + position
        if 0 <= i < len(heads):
            if heads[head_index][4] < verb_index:
                return heads[i][field]
            elif heads[i][4] < verb_index:
                return '-'
            else:
                return heads[i][field]
        return '-'
    # Make an instance for every verb.
    instances = []
    indices = []
    for i in verbs:
        instance = ['-']*17 + ['?']
        instance[0]  = str(distance[i][index])  # Distance.
        instance[1]  = str(verb_map[i][index])  # Verb count.
        instance[2]  = str(comma_map[i][index]) # Comma count.
        instance[3]  = sentence[i][0].lower()   # Verb.
        instance[4]  = sentence[i][1]           # Verb part-of-speech.
        instance[5]  = _tag(-2, 0, i) # The word before the previous.
        instance[6]  = _tag(-2, 1, i) # The word before the previous, part-of-speech.
        instance[7]  = _tag(-2, 2, i) # The word before the previous, chunk tag.
        instance[8]  = _tag(-1, 0, i) # Previous word.
        instance[9]  = _tag(-1, 1, i) # Previous word's part-of-speech.
        instance[10] = _tag(-1, 2, i) # Previous word's chunk tag.
        instance[11] = '-'            # Focus preposition - XXX This should be implemented.
        instance[12] = _tag(0, 0, i)  # Focus word.
        instance[13] = _tag(0, 1, i)  # Focus word's part-of-speech.
        instance[14] = _tag(0, 2, i)  # Focus word's chunk tag.
        instance[15] = _tag(1, 1, i)  # Next word's part-of-speech.
        instance[16] = _tag(1, 2, i)  # Next word's chunk tag.
        instances.append(' '.join(instance))
        indices.append(i)
    return (instances, indices)
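# Continuing the _step1() sketch above: the focus word "name" (index 3, head index 1)
# paired with the verb "is" would yield an instance like the following
# (values traced by hand from the code, shown for illustration only):
#   '-1 0 0 is VBZ - - - , , - - name NN NP VBZ VP ?'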
#--- TAG ---------------------------------------------------------------------------------------------
def tag(tagged_string):
    """ Takes a PNP-tagged sentence and returns a tagged sentence with the added relation tags.
        Tokens in the input string must contain WORD, POS, CHUNK and PNP tags.
        Example relation tags: NP-SBJ-1, VP-1, NP-OBJ-1, NP-SBJ-2, ADJP-CLR, ... (see tags.py)
        Note 1: on rare occasions words can be tagged with multiple relations (e.g. NP-OBJ-1*NP-OBJ-3).
        Note 2: the separator for multiple relations can be "*" OR ";".
    """
    s = _split(tagged_string)
    verbs, commas, chunks, heads, indices, instance_candidates, VP_chunks = _step1(s)
    distance, comma_map, verb_map = _step2(verbs, commas, chunks, len(s))
    tags = []
    V = [] # Verb indices collected from _instances().
    I = [] # Instance index per tag.
    for i in instance_candidates:
        instances, verb_indices = _instances(i, distance, verbs, verb_map, comma_map, heads, indices[i], s)
        instances = [x.replace('/', '*') for x in instances]
        tags.extend(instances)
        V.extend(verb_indices)
        I.extend([i] * len(instances))
    # The client.batch() function in the client module takes care of managing server clients,
    # we simply pass it all the tagging jobs and a definition of the client we need.
    tags = client.batch(tags, client=(client.Timbl, HOST, PORT, config.RELATION, config.log), retries=1)
    # Get tags for complete chunks:
    chunk_dict = {}
    for i, t in enumerate(tags):
        ch = chunks[I[i]]
        vp = str(VP_chunks[V[i]])
        if t != '-':
            if ch in chunk_dict:
                new_tag = '-'.join([t, vp])
                # Do not append a tag that references the same verb twice.
                # Keep the first occurrence:
                new_tag_vp = new_tag.split('-')[-1]
                linked_vps = [x.split('-')[-1] for x in chunk_dict[ch].split('*')]
                if new_tag_vp not in linked_vps:
                    chunk_dict[ch] += '*' + new_tag
            else:
                chunk_dict[ch] = '-'.join([t, vp])
    # Collect NP-SBJ, NP-OBJ etc. relations and add the VP relations.
    # Place the relation tags in the sentence.
    relations = []
    for i, chunk in enumerate(chunks):
        if chunk in chunk_dict:
            relations.append(chunk_dict[chunk])
        elif VP_chunks[i]:
            relations.append('VP-%d' % (VP_chunks[i]))
        else:
            relations.append('O')
    for i in range(len(s)):
        s[i][4] = relations[i]
    return _join(s)
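#--- EXAMPLE -----------------------------------------------------------------------------------------
# A minimal usage sketch (an addition for illustration, assuming a TiMBL relation
# server is running on the HOST/PORT configured above, e.g. as started by the
# regular MBSP server scripts):
if __name__ == '__main__':
    print tag('Make/VB/I-VP/O an/DT/I-NP/O oval/NN/I-NP/O ././O/O')
    # Expected output (see the module header):
    # Make/VB/I-VP/O/VP-1 an/DT/I-NP/O/NP-OBJ-1 oval/NN/I-NP/O/NP-OBJ-1 ././O/O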