-
Notifications
You must be signed in to change notification settings - Fork 2
/
convert.lisp
312 lines (256 loc) · 11 KB
/
convert.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
(ql:quickload :plump)
(ql:quickload :plump-sexp)
(ql:quickload :serapeum)
(ql:quickload :alexandria)
(ql:quickload :ironclad)
(ql:quickload :cxml)
(defparameter *sense-map-ht* nil)
(defmacro with-open-files (args &body body)
(case (length args)
((0)
`(progn ,@body))
((1)
`(with-open-file ,(first args) ,@body))
(t `(with-open-file ,(first args)
(with-open-files
,(rest args) ,@body)))))
(defun mapcat (f vec)
(loop for x across vec
append (funcall f x)))
(defun node-get-id (node)
(second (serapeum:split-sequence #\_
(plump:attribute node "id")
:remove-empty-subseqs t)))
(defun node-gloss-wsd? (node)
(and (equal (plump:tag-name node) "gloss")
(equal (plump:attribute node "desc")
"wsd")))
(defun node-form (node)
(let ((form (plump:render-text (plump:strip node))))
(if (equal form "")
nil
form)))
(defun meta->tk (node)
(plump:children
(plump-sexp:parse
`((meta :tag "ignore" :id ,(concatenate 'string "id_" (plump:tag-name node))) ""))))
(defun qf->wf (node)
(plump:children
(plump-sexp:parse
`((meta :tag "ignore")
,(serapeum:string-case (plump:attribute node "rend")
("dq" "\"")
("sq" "'")
(t (error "attribute rend of ~S must be \"dq\" or \"sq\"" node)))))))
(defun node-lemma (node)
(plump:attribute node "lemma"))
(defun node-annotation-tag (node)
(plump:attribute node "tag"))
(defun node-coll (node)
(let ((coll-keys (plump:attribute node "coll")))
(when coll-keys
(serapeum:split-sequence #\,
coll-keys
:remove-empty-subseqs t))))
(defun node-pos (node)
(plump:attribute node "pos"))
(defun posn->pos (posn)
(gethash posn
(alexandria:alist-hash-table
'((#\1 . "n") (#\2 . "v") (#\3 . "a") (#\4 . "r") (#\5 . "a"))
:test #'eql)
"#|#"))
(defun sense-map->ht (in)
"IN is the stream for a txt file mapping sense keys to synset-ids."
(let ((map (make-hash-table :test #'equal :size 210000)))
(loop for line = (read-line in nil 'eof)
until (eq line 'eof)
do (destructuring-bind (sk syid * **)
(serapeum:split-sequence #\space line)
(let* ((p (position #\% sk :test #'eql))
(posn (char sk (1+ p)))
(pos (posn->pos posn)))
(when (eql posn #\5)
;; change posn of adjective satellites to
;; adjectives because annotations are this way
(setf (char sk (1+ p)) #\3))
(setf (gethash sk map) (concatenate 'string pos syid)))))
map))
(defun sense-key->synset-id (sk)
(or
(gethash sk *sense-map-ht*)
(error "sense key ~S does not exist" sk)))
(defun filter-child-elements (node p)
(loop for c across (plump:child-elements node)
when (funcall p c)
collect c))
(defun gloss-text (node)
(let* ((text-node (first
(filter-child-elements node
(lambda (n)
(and (equal (plump:tag-name n) "gloss")
(equal (plump:attribute n "desc")
"orig")))))))
(plump:render-text text-node)))
(defun gloss-terms (node)
(let* ((terms-node (first
(filter-child-elements node
(lambda (n) (equal (plump:tag-name n) "terms")))))
(terms (filter-child-elements terms-node
(lambda (n) (equal (plump:tag-name n) "term")))))
(mapcar #'plump:render-text terms)))
(defun gloss-tokens (node)
(labels
((run (nodes)
(mapcan #'make-token
(mapcat #'expand-tokens
;; wf, qf, mwf..
nodes)))
(tag-boundary (nodes bound)
(let* ((first (first nodes))
(last (first (last nodes)))
(firstb (plump:attribute first "bound"))
(lastb (plump:attribute last "bound")))
(plump:set-attribute first "bound" (if firstb (concatenate 'string firstb "|" bound) bound))
(plump:set-attribute last "bound" (if lastb (concatenate 'string lastb "|" bound) bound))
nodes))
(expand-tokens (node)
(let ((tag (plump:tag-name node)))
(serapeum:string-case tag
(("cf" "wf")
(list node))
("qf"
(let ((qf (qf->wf node)))
(mapcat #'expand-tokens
(concatenate 'vector
qf
(plump:child-elements node)
qf))))
(("mwf" "aux" "classif" "def" "ex")
(let ((children (mapcat #'expand-tokens
(plump:child-elements node))))
(when children (tag-boundary children tag)))))))
(make-token (node)
(serapeum:string-case
(plump:tag-name node)
("wf"
(list (make-token-plist node :wf)))
("cf"
(cf-token node))))
(make-token-plist (node kind)
;; senses are always direct children of either wf or glob,
;; which call this function
(let ((senses (node-get-senses node)))
(list :form (node-form node)
:lemma (node-lemma node)
:pos (node-pos node)
:status (alexandria:if-let ((st (node-annotation-tag node)))
(or (serapeum:string-case st
(("auto" "man")
(assert senses)
(case senses
(nosense (concatenate 'string st "-nosense"))
(reset "un")))
(("ignore" "un")
(assert (null senses))))
st))
:kind (let ((coll-keys (node-coll node)))
(case kind
(:coll
(assert (null senses)
(senses)
"when token ~S is part of a collocation, it must have no annotation (only its glob may be annotated"
node)
(cons kind coll-keys))
(:glob
(assert (null (cdr coll-keys))
(coll-keys)
"when token ~S is a glob it must have be part of only one colloc"
node)
(cons kind (first coll-keys)))
(otherwise kind)))
:anno senses
:meta (let ((id (node-get-id node))
(bound (plump:attribute node "bound")))
(append
(and id (list (list :id id)))
(and bound (mapcar (lambda (x) (list :bound x)) (serapeum:split-sequence #\| bound)))))
:conf (if (eq senses 'nosense) 0 1))))
(node-get-senses (node)
(let* ((ids (filter-child-elements
node
(lambda (n) (equal (plump:tag-name n) "id"))))
(ignored? (some (lambda (n) (equal
(plump:attribute n "sk")
"purposefully_ignored%0:00:00::"))
ids)))
(if ignored?
(progn
(assert (equal (node-annotation-tag node) "man")
(node)
"~S is purposefully ignored, so should have tag \"man\" or \"auto\".")
(if (null (cdr ids))
'nosense
(or (warn "node with id ~S must be a singleton if there's a purposefully ignored token."
(plump:attribute node "id"))
'reset)))
(mapcar (lambda (s)
(let ((sk (plump:attribute s "sk")))
(sense-key->synset-id sk)))
ids))))
(cf-token (node)
(let ((globs (filter-child-elements node
(lambda (n)
(equal (plump:tag-name n) "glob")))))
(append
(loop for g in globs
collect (glob-token g))
(list (make-token-plist node :coll)))))
(glob-token (node)
(make-token-plist node :glob)))
;;
(run ;; def, aux, ex, classif
(plump:child-elements node))))
(defun gloss-sentence (node)
(labels ((get-wsd-gloss (node)
(filter-child-elements node #'node-gloss-wsd?)))
;;
(assert (equal (plump:tag-name node) "synset"))
(list :id (or (plump:attribute node "id") (error "."))
:terms (gloss-terms node)
:text (gloss-text node)
:tokens (gloss-tokens (first (get-wsd-gloss node))))))
;; (defun wordnet-sentences (node)
;; (assert (equal (plump:tag-name node) "wordnet"))
;; (map 'list #'gloss-sentence (plump:child-elements node)))
(defun checksum (str)
(format nil "~a"
(mod
(parse-integer
(ironclad:byte-array-to-hex-string
(ironclad:digest-sequence 'ironclad:sha224 (string-to-octets str))) :start 50 :radix 16)
1000)))
(defun save-sent (s out-fp)
(with-open-file (out (make-pathname :name (checksum (getf s :id))
:type "plist" :defaults out-fp)
:direction :output :if-exists :append
:if-does-not-exist :create)
(write s :pretty nil :case :downcase :stream out)
(terpri out)))
(defun main (glosstag-fp out-fp sensemap-fp &key (*sense-map-ht* *sense-map-ht*))
(let ((out-fp (ensure-directories-exist out-fp)))
(with-open-file (in-map sensemap-fp)
(let ((*sense-map-ht* (sense-map->ht in-map))
(in-files (directory (make-pathname :defaults glosstag-fp :name :wild :type "xml"))))
(format t "Input Files: ~a~%Output Directory: ~a~%Sense Index: ~a~%"
in-files
out-fp
sensemap-fp)
(loop for fp in in-files
do (klacks:with-open-source (in (cxml:make-source fp))
(labels ((f (s-xml)
(save-sent (gloss-sentence
(aref (plump:child-elements (plump:parse s-xml)) 0))
out-fp)))
(loop while (klacks:find-element in "synset")
do (f (klacks:serialize-element in (cxml:make-string-sink :omit-xml-declaration-p t)))))))))))