-
Notifications
You must be signed in to change notification settings - Fork 2
/
deduplication.lisp
156 lines (135 loc) · 4.95 KB
/
deduplication.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
;; (C) 2013 IBM Corporation
;; Author: Alexandre Rademaker
;; The code in this file is meant to be run in AG version 4.10 (at the time it was written, the AG 4.10
;; instance used was the one running on an Amazon AWS server).
;; Deduplication of entities in wn-en:
;; - SenseIndex
;; - SenseIndex and WordSense
;; - Word
;; - cleanup
;;
;; Checking the merging of SenseIndex entities.
;; cut -d " " -f 1 sentidx.vrb > lixo.1
;; cut -d " " -f 1 index.sense > lixo.2
;; cat lixo.1 lixo.2 | sort | uniq | wc -l
;;
;; Deduplications of entities in wn-br:
;; - Word
(in-package :db.agraph.user)
(defun merge-nodes (old new)
  "Transfer all in and out edges from OLD to NEW, except owl:sameAs edges.

Every triple that has OLD as subject or as object is re-created with NEW
in OLD's place, unless an equal triple is already in the store or its
predicate is owl:sameAs.  Afterwards every triple that has OLD as
subject or as object is deleted.

When OLD and NEW are the same node there is nothing to transfer.  In
that case only the reflexive owl:sameAs triple on the node is removed;
running the normal path would delete every triple of the node.

Returns the value of the last DELETE-TRIPLES call."
  (if (part= old new)
      ;; Self-merge guard: the transfer below would add nothing and the
      ;; final deletes would wipe the node, so drop only the reflexive
      ;; sameAs edge (which also lets DEDUPLICATE-SAMEAS move on).
      (delete-triples :s old :p !owl:sameAs :o old)
      (let ((new-triples nil))
        ;; Outgoing edges: (OLD p o) becomes (NEW p o).
        (dolist (tr (get-triples-list :s old :limit nil))
          (unless (get-triple :s new :p (predicate tr) :o (object tr))
            (push (list new (predicate tr) (object tr)) new-triples)))
        ;; Incoming edges: (s p OLD) becomes (s p NEW).
        (dolist (tr (get-triples-list :o old :limit nil))
          (unless (get-triple :s (subject tr) :p (predicate tr) :o new)
            (push (list (subject tr) (predicate tr) new) new-triples)))
        ;; The triples are collected first and added afterwards, so the
        ;; existence checks above only see the store as it was on entry.
        (dolist (a (remove-if (lambda (a) (part= (nth 1 a) !owl:sameAs))
                              new-triples))
          (add-triple (nth 0 a) (nth 1 a) (nth 2 a)))
        (delete-triples :s old)
        (delete-triples :o old))))
(defun group-nodes (key value &optional (counter 0))
  "Merge every entry of VALUE after the first one into the first one.

VALUE is a list of entries whose first element is a node.  KEY is
ignored and COUNTER only appears in the progress message.  A VALUE
with fewer than two entries is left untouched."
  (declare (ignore key))
  (when (rest value)
    (format *debug-io* "Merging group ~a ~a~%" counter value)
    (destructuring-bind (master . others) value
      (dolist (other others)
        (merge-nodes (car other) (car master))))))
(defun deduplicate-words ()
  "Merge nodes that share the same wn30:lexicalForm value.

Each distinct (node, lexical form) result is bucketed under the value
of its lexical form; every bucket is then passed to GROUP-NODES along
with a running group number."
  (let ((buckets (make-hash-table :test #'equal))
        (pairs (select0-distinct (?w ?l)
                 (q- ?w !wn30:lexicalForm ?l)))
        (group-number 0))
    (dolist (pair pairs)
      ;; PUSH onto a missing key starts from NIL, so no lookup is needed
      ;; to tell a new bucket from an existing one.
      (push pair (gethash (upi->value (second pair)) buckets)))
    (format *debug-io* "Finished hashtable~%")
    (maphash (lambda (form bucket)
               (group-nodes form bucket (incf group-number)))
             buckets)))
(defun deduplicate-senseindex ()
  "Merge nodes that share the same wn30:senseKey value.

Each distinct (node, sense key) result is bucketed under the value of
its sense key; every bucket is then passed to GROUP-NODES along with a
running group number."
  (let ((buckets (make-hash-table :test #'equal))
        (pairs (select0-distinct (?w ?l)
                 (q- ?w !wn30:senseKey ?l)))
        (group-number 0))
    (dolist (pair pairs)
      ;; PUSH onto a missing key starts from NIL, so no lookup is needed
      ;; to tell a new bucket from an existing one.
      (push pair (gethash (upi->value (second pair)) buckets)))
    (format *debug-io* "Finished hash-table ~a ~%" buckets)
    (maphash (lambda (sense-key bucket)
               (group-nodes sense-key bucket (incf group-number)))
             buckets)))
; WordNet-3.0 has 29 SenseIndex nodes with the same lemma+lexid as
; 2 other WordSenses. For 28 cases, it doesn't matter because both
; WordSenses are indistinguishable. In one case (Utopia0) we have to
; manually remove one sameAs triple before deduplicating the nodes. I
; chose the one related to wn30i:wordsense-03020193-a-2, which is a
; sense related to the synset 07283198. Command:
;;
;; (delete-triples :p !owl:sameAs :o !wn30i:wordsense-03020193-a-2)
(defun identify-senseindex/wordsense ()
"Add an owl:sameAs triple from each ?si node to each ?ws node it matches.

A pair matches when both nodes are contained in the same ?ss node and
the lemma concatenated with the lexical id gives the same string on
both sides."
(select0/callback (?si ?ws)
;; Called once per query result; presumably P lists the bindings in the
;; order of the variable list above, i.e. (?si ?ws) -- confirm against
;; the select0/callback documentation.
(lambda (p)
(add-triple (nth 0 p) !owl:sameAs (nth 1 p)))
;; ?si and ?ws must hang off the same ?ss node.
(q- ?ss !wn30:containsSenseIndex ?si)
(q- ?ss !wn30:containsWordSense ?ws)
;; WordSense side: the lemma comes from the linked word, the id from the sense.
(q- ?ws !wn30:word ?w)
(q- ?ws !wn30:lexicalId ?i1)
(q- ?w !wn30:lemma ?l1)
;; SenseIndex side: lemma and id are both read from the node itself.
(q- ?si !wn30:lexId ?i2)
(q- ?si !wn30:lemma ?l2)
;; Keep only the pairs whose lemma+id strings are equal.
(lispp (equal (concatenate 'string (part->value ?l1) (part->value ?i1))
(concatenate 'string (part->value ?l2) (part->value ?i2))))))
(defun deduplicate-sameAs (c)
  "Merge the subject of owl:sameAs triples into their object, one triple at a time.

A fresh owl:sameAs triple is fetched before every round.  The loop ends
when no such triple is found or when the round counter, which starts at
0, becomes greater than C; at most C+1 merges are therefore performed.
Each merged pair is printed to *DEBUG-IO*."
  (loop for rounds-done from 0
        for same-as-triple = (get-triple :p !owl:sameAs)
        until (or (null same-as-triple)
                  (> rounds-done c))
        do (merge-nodes (subject same-as-triple) (object same-as-triple))
           (format *debug-io* "~a ~a ~%"
                   (subject same-as-triple)
                   (object same-as-triple))))
(defun clean-senseindex ()
  "Delete every triple whose object is wn30:SenseIndex and every triple
whose predicate is wn30:lexId or wn30:containsSenseIndex."
  (flet ((drop-predicate (predicate-part)
           (delete-triples :p predicate-part)))
    (delete-triples :o !wn30:SenseIndex)
    (drop-predicate !wn30:lexId)
    (drop-predicate !wn30:containsSenseIndex)))
;; After the previous function, the following query was executed in
;; the web interface:
;;
;; delete {
;; ?ws wn30:lemma ?val .
;; } where {
;; ?ws wn30:lemma ?val .
;; ?ws a wn30:WordSense .
;; }
;; Finally, we must fix Wordnet-BR by adding a mapping for the
;; AdjectiveSynset instances that are AdjectiveSatelliteSynset in the
;; original WordNet. I did it using the web interface of AG and the
;; following query:
;;
;; construct {
;; ?new1 owl:sameAs ?new2 .
;; }
;; where {
;; ?a a wn30:AdjectiveSatelliteSynset .
;; BIND (iri(replace(str(?a),"/wn30/","/wn30-br/")) AS ?new1)
;; BIND (iri(replace(replace(str(?a),"/wn30/","/wn30-br/"),"-s","-a")) AS ?new2)
;; }
;;
;; Old code:
;; (defun correct-synsets-br ()
;; (select0/callback (?ss1 ?id)
;; (lambda (p)
;; (let ((addr (format nil "synset-~a-s" (part->value (second p)))))
;; (add-triple (first p) !owl:sameAs (resource addr "wn30br"))))
;; (q- ?ss1 !wn30:synsetId ?id)
;; (q- ?ss1 !rdf:type !wn30:AdjectiveSynset)
;; (q- ?ss2 !wn30:synsetId ?id)
;; (q- ?ss2 !rdf:type !wn30:AdjectiveSatelliteSynset)))