-
Notifications
You must be signed in to change notification settings - Fork 7
/
lexer.lisp
333 lines (314 loc) · 13.4 KB
/
lexer.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
;;; -*- show-trailing-whitespace: t; indent-tabs-mode: nil -*-
;;; Copyright (c) 2007 Ivan Shvedunov. All rights reserved.
;;; Copyright (c) 2007 David Lichteblau. All rights reserved.
;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:
;;;
;;; * Redistributions of source code must retain the above copyright
;;; notice, this list of conditions and the following disclaimer.
;;;
;;; * Redistributions in binary form must reproduce the above
;;; copyright notice, this list of conditions and the following
;;; disclaimer in the documentation and/or other materials
;;; provided with the distribution.
;;;
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(in-package :xpath)
;;; Define a function called NAME with one argument (the XPath expression as a
;;; string), which returns a lexer closure suitable for use with cl-yacc.
;;;
;;; - The lexer closure takes no arguments and returns the next token name
;;; and the token value as multiple values, or NIL at the end of the
;;; string.
;;;
;;; - Each rule is of the form (regex (&rest arguments) &rest body)
;;; BODY must return token name and value as multiple values, unless
;;; body contains only a single keyword, in which case the keyword
;;; is used as both token name and value.
;;;
;;; - As a special exception, a rule may return as multiple values
;;; (name1 value1 name2 value2)
;;; In that case, name1 and value1 are returned as normal for this
;;; invocation of the lexer closure, but the next invocation will
;;; return name2 and value2 instead of parsing the next token.
;;;
;;; - The number of arguments must match the number of registers used by
;;; REGEX.
;;;
;;; - Magic variables PREV-TOKEN-NAME and PREV-TOKEN-VALUE are bound
;;; in BODY so that the previous token can be inspected.
;;;
;;; - There is no implicit register group for the entire rule,
;;; so make sure to wrap parens around the regex to get such a group
;;; as an argument.
;;;
;;; Implementation notes:
;;; - We implement lexing using cl-ppcre by collecting all rules into one
;;; big regex of the form:
;;; ^rule1|rule2|rule3|...rulenN
;;; ruleIII is converted using a named register group as:
;;; (?<gIII>...)
;;; We use named register groups because the second return value of
;;; cl-ppcre:create-scanner then tells us which register groups belongs
;;; to which rule.
;;;
;;; - In the lexer closure, we do a linear seach over the groups to find
;;; the one named register group that actually matched.
;;;
;;; - FIXME: cl-ppcre doesn't look for the longest match, so the rules
;;; need to be written so that they check for ambiguous matches manually.
;;;
;;; - FIXME: Whitespace is currently allowed before and after -every- rule.
;;; Is this correct?
;;;
(defmacro deflexer (name-and-options &body rules)
(let ((regex-stream (make-string-output-stream))
(name (if (listp name-and-options)
(car name-and-options)
name-and-options))
(ignore-whitespace
(if (listp name-and-options)
(destructuring-bind (&key (ignore-whitespace t))
(cdr name-and-options)
ignore-whitespace)
t)))
(format regex-stream "^(?:")
(loop
for counter from 0
for (subregex args . body) in rules do
(unless (zerop counter)
(write-char #\| regex-stream))
(format regex-stream
(if ignore-whitespace
"(?<g~D>\\s*~A\\s*)"
"(?<g~D>~A)")
counter
subregex))
(format regex-stream ")")
`(let ((actions
(vector
,@(loop
for counter from 0
for (subregex args . body) in rules
collect
`(cons ,(length args)
(lambda (prev-token-name prev-token-value ,@args)
(declare (ignorable prev-token-name
prev-token-value))
,@(if (and (keywordp (car body))
(null (cdr body)))
`((values ,(car body) ,(car body)))
body)))))))
(multiple-value-bind (scanner group-numbers)
(let ((cl-ppcre:*allow-named-registers* t)
(group-table (make-array ,(length rules))))
(multiple-value-bind (scanner groups)
(cl-ppcre:create-scanner
,(get-output-stream-string regex-stream))
(loop
for i from 0
for group in groups
when group do
(let ((group-number
(parse-integer group
:start 1
:junk-allowed t)))
(setf (elt group-table group-number) i)))
(values scanner group-table)))
(defun ,name (str)
(let ((pos 0)
(length (length str))
(prev-token-name nil)
(prev-token-value nil)
(next-token-name nil)
(next-token-value nil))
(lambda ()
(cond
(next-token-name
(multiple-value-prog1
(values next-token-name next-token-value)
(setf next-token-name nil)))
((< pos length)
(multiple-value-bind
(total-start total-end group-start group-end)
(cl-ppcre:scan scanner str :start pos)
(unless total-start
(xpath-error "not a well-formed XPath expression: ~
lexer failed at position ~D" pos))
(setf pos total-end)
(loop
for group-number across group-numbers
for (nargs . action) across actions
for start = (elt group-start group-number)
when start do
(multiple-value-bind
(token-name token-value extra-name extra-value)
(apply action
prev-token-name
prev-token-value
(loop
for i from (1+ group-number)
repeat nargs
collect
(let ((astart (elt group-start i)))
(and astart
(subseq str
astart
(elt group-end i))))))
(setf prev-token-name token-name
prev-token-value token-value)
(setf next-token-name extra-name
next-token-value extra-value)
(return (values token-name token-value)))
finally
(xpath-error "not a well-formed XPath expression: ~
no token rule matched at ~D" pos))))
(t nil)))))))))
(defun namep (str)
(and (not (zerop (length str)))
(cxml::name-start-rune-p (elt str 0))
(every #'cxml::name-rune-p str)))
(defun nc-name-p (str)
(and (namep str) (cxml::nc-name-p str)))
(deflexer xpath-lexer
("\\(" () :lparen)
("\\)" () :rparen)
("\\[" () :lbracket)
("\\]" () :rbracket)
("\\.\\." () :two-dots)
("\\@" () :at)
("\\," () :comma)
(":(:)?" (2p) (if 2p :colons :colon))
("\\$((?:(?:[\\w-.]+):)?(?:[\\w-.]+))"
(qname)
(values :variable qname))
(#.(apply #'format nil "([^~C-~C~C-~C~C-~C][\\w-.]*)"
;; some checking to make sure the first character looks at least
;; a bit like a NCNameStartChar, so that numbers and operators
;; won't get mistaken for an NCName.
(mapcar #'code-char '(#x000 #x40 #x5B #x60 #x7B #xbf)))
(name)
(unless (nc-name-p name)
(xpath-error "not an NCName: ~A" name))
(values :ncname name))
("\"([^\"]*)\"" (value) (values :literal value))
("'([^']*)'" (value) (values :literal value))
("(\\d+(?:\\.\\d*)?|(\\.(\\d*)))"
(value dot digits)
(if (and dot (zerop (length digits)))
:dot
(values :number (handler-case
(parse-number:parse-number (preprocess-number-str value))
(org.mapcar.parse-number::invalid-number ()
;; re-signal this condition, because it's not
;; a subclass of error
(xpath-error "not a well-formed XPath number"))))))
("/(/)?" (2p) (if 2p :// :/))
("\\|" () :pipe)
("\\+" () :+)
("-" () :-)
("=" () :=)
("!=" () :!=)
("<(=)?" (=p) (if =p :<= :<))
(">(=)?" (=p) (if =p :>= :>))
("\\*" () (case prev-token-name
((:at :two-colons :lparen :lbracket :comma
:// :/ :pipe :+ :- := :!= :< :<= :> :>=)
(values :star :star))
(t (values :star-or-multiply :star-or-multiply)))))
;;; After the first lexing step, we need to correct various tokens, because
;;; - XPath explicitly modifies the lexing result using its disambiguation
;;; rules.
;;; - Our DEFLEXER matches the shortest rules, not the longest rule,
;;; making it inconvenient to distinguish various similar rules
;;; directly in the lexer.
;;;
;;; To implement all of this, we use a special lexer closure that wraps around
;;; the actual DEFLEXER-generated closure. The wrapper does a two-token
;;; lookahead and matches the three current tokens against hand-written
;;; rules, which then get a change to replace the current tokens with
;;; disambiguated versions. If that happens, matching is repeated using
;;; the replaced token until no match can be found. Finally the first of the
;;; three tokens is returned to the caller.
;;;
(defmacro define-fixup-lexer (name &rest rules)
`(defun ,name (next-lexer)
(let (a b c aname bname cname)
(setf (values bname b) (funcall next-lexer))
(setf (values cname c) (funcall next-lexer))
(lambda ()
(flet ((shift ()
(multiple-value-bind (n v)
(funcall next-lexer)
(shiftf aname bname cname n)
(shiftf a b c v)))
(ersetze! (name &optional (value name))
(setf aname name)
(setf a value)
t)
(ersetze!b (name &optional (value name))
(setf bname name)
(setf b value)
t)
(match (xa &optional (xb nil xbp) (xc nil xcp))
(and (or (eq xa t) (eq xa aname))
(or (not xbp)
(and (or (eq xb t) (eq xb bname))
(or (not xcp)
(or (eq xc t) (eq xc cname))))))))
(shift)
(cond
((null aname)
nil)
(t
(loop while (cond ,@rules))
(values aname a))))))))
(define-fixup-lexer make-fixup-lexer
((or (match :ncname :colon :star-or-multiply)
(match :ncname :colon :star))
(let ((value a))
(shift)
(shift)
(ersetze! :ns-name value)))
((match :ncname :colon :ncname)
(let ((value (cons a c)))
(shift)
(shift)
(ersetze! :qname value)))
((match :ncname :colons)
(ersetze! :axis-name a))
((and (match :ncname)
(cl-ppcre:all-matches "^(and|or|mod|div)$" a))
(let ((sym (intern (string-upcase a) :keyword)))
(ersetze! sym sym)))
((and (match :ncname :lparen)
(cl-ppcre:all-matches "^comment|text|processing-instruction|node$"
a))
(let ((sym (intern (string-upcase a) :keyword)))
(if (eq sym :processing-instruction)
(ersetze! :processing-instruction sym)
(ersetze! :node-type-or-function-name a))))
((match :ncname :lparen)
(ersetze! :function-name a))
((match :qname :lparen)
(ersetze! :function-name a))
((and (match t :star-or-multiply)
(find aname '(:at :colons :lparen :lbracket
:div :mod :and :or :multiply
:/ :// :pipe :+ :- := :!= :< :<= :> :>=)))
(ersetze!b :star))
((match t :star-or-multiply)
(ersetze!b :multiply))
((match :star-or-multiply)
(ersetze! :star)))