-
Notifications
You must be signed in to change notification settings - Fork 4
/
pure_lexer_implScript.sml
463 lines (421 loc) · 15.6 KB
/
pure_lexer_implScript.sml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
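(*
Implementation of the lexer: code for reading one symbol at a time from
the input string (next_sym_alt), converting symbols into tokens
(token_of_sym, next_token), and lexing a whole input string into a list of
tokens paired with their source locations (lexer_fun). Block comments
delimited by {- and -} (which may nest) and end-of-line comments starting
with -- are skipped.
*)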
open preamble tokensTheory locationTheory
(* Drop the NORMEQ_CONV simpset fragment from the stateful simplifier while
   this script is being processed. *)
val _ = temp_delsimps ["NORMEQ_CONV"]
(* Begin the theory that the definitions and theorems below are stored in. *)
val _ = new_theory "pure_lexer_impl";
(* Restrict the parsing/printing grammar to these ancestor theories. *)
val _ = set_grammar_ancestry ["misc", "tokens", "ASCIInumbers", "location"]
(* val tac =
full_simp_tac (srw_ss()) [char_le_def, char_lt_def] >>
Cases_on `t` >>
rw [get_token_def, processIdent_def, isAlphaNum_def, isAlpha_def, isDigit_def,
isLower_def, isUpper_def];*)
(* Intermediate lexical symbols. next_sym_alt produces values of this type
   and token_of_sym turns them into tokens. ErrorS marks input that could
   not be lexed. *)
Datatype:
symbol = StringS string
| CharS char
| NumberS int
| WordS num
| LongS string (* identifiers with a . in them *)
| FFIS string
| OtherS string
| ErrorS
End
(* helper functions *)
(* mkCharS sym: turn a string symbol holding exactly one character into the
   corresponding character symbol. A StringS of any other length, and every
   other kind of symbol, becomes ErrorS. *)
Definition mkCharS_def:
(mkCharS (StringS s) = if LENGTH s = 1 then CharS (HD s)
else ErrorS) /\
(mkCharS _ = ErrorS)
End
(* read_while P input acc: consume the longest prefix of input whose
   characters all satisfy P. Consumed characters are pushed onto acc, so acc
   holds them in reverse order. Returns the pair of IMPLODE (REVERSE acc) and
   the unconsumed remainder of the input. *)
Definition read_while_def:
(read_while P "" s = (IMPLODE (REVERSE s),"")) /\
(read_while P (STRING c cs) s =
if P c then read_while P cs (c :: s)
else (IMPLODE (REVERSE s),STRING c cs))
End
(* The remainder returned by read_while is never longer than the input it
   was given. Used below to show that next_sym_alt makes progress. *)
Theorem read_while_thm:
∀cs s cs' s'.
(read_while P cs s = (s',cs')) ⇒ STRLEN cs' <= STRLEN cs
Proof
Induct THEN SRW_TAC [][read_while_def] THEN SRW_TAC [][] THEN
RES_TAC THEN FULL_SIMP_TAC std_ss [LENGTH,LENGTH_APPEND] THEN DECIDE_TAC
QED
(* Delimiter characters: parentheses, square brackets, braces, comma and
   semicolon. next_sym_alt emits each of them as a one-character symbol. *)
Definition is_single_char_symbol_def:
is_single_char_symbol c = MEM c "()[]{},;"
End
(* Characters that may appear in a symbolic identifier: the backquote
   (written as CHR 96) together with the characters of the string literal
   below. *)
Definition isSymbol_def:
isSymbol c = MEM c (CHR 96 (* backquote *) :: "!%&$#+-/:<=>?@\\~^|*")
End
(* Characters allowed after the first character of an alphanumeric
   identifier: letters, digits, the prime character and underscore. *)
Definition isAlphaNumPrime_def:
isAlphaNumPrime c <=> isAlphaNum c \/ (c = #"'") \/ (c = #"_")
End
(* next_loc n loc: advance the column of a POSN location by n, keeping the
   row. Any other kind of location is returned unchanged. *)
Definition next_loc_def:
next_loc n (POSN r c) = POSN r (c+n) ∧
next_loc n x = x
End
(* next_line loc: move a POSN location to column 1 of the following row.
   Any other kind of location is returned unchanged. *)
Definition next_line_def:
next_line (POSN r c) = POSN (r+1) 1 ∧
next_line x = x
End
(* read_string str s loc: read the body of a string literal. The caller has
   already consumed the opening double quote.
     str - remaining input
     s   - contents of the literal read so far
     loc - location of the first character of str
   Returns a triple of symbol, location and unconsumed input:
     - a closing double quote yields StringS s, the location of that quote,
       and the input after it;
     - end of input, a newline inside the literal, or a backslash that is
       not followed by backslash, double quote, n or t yields ErrorS;
     - the four recognised escapes append the character they denote and
       advance the location by two columns.
   Termination: every recursive call is on a strictly shorter str. *)
Definition read_string_def:
read_string str s (loc:locn) =
if str = "" then (ErrorS, loc, "") else
if HD str = #"\"" then (StringS s, loc, TL str) else
if HD str = #"\n" then (ErrorS, next_line loc, TL str) else
if HD str <> #"\\" then
read_string (TL str) (s ++ [HD str]) (next_loc 1 loc)
else
case TL str of
| #"\\"::cs => read_string cs (s ++ "\\") (next_loc 2 loc)
| #"\""::cs => read_string cs (s ++ "\"") (next_loc 2 loc)
| #"n"::cs => read_string cs (s ++ "\n") (next_loc 2 loc)
| #"t"::cs => read_string cs (s ++ "\t") (next_loc 2 loc)
| _ => (ErrorS, loc, TL str)
Termination
WF_REL_TAC `measure (LENGTH o FST)` THEN REPEAT STRIP_TAC
THEN Cases_on `str` THEN FULL_SIMP_TAC (srw_ss()) [] THEN DECIDE_TAC
End
(* Length bound for read_string: the unconsumed input it returns is no
   longer than the input string plus the accumulator it was called with.
   Proved by the induction principle generated for read_string. *)
Theorem read_string_thm:
∀s t l l' x1 x2. (read_string s t l = (x1, l', x2)) ⇒
(LENGTH x2 <= LENGTH s + LENGTH t)
Proof
ONCE_REWRITE_TAC [EQ_SYM_EQ]
\\ HO_MATCH_MP_TAC (fetch "-" "read_string_ind")
\\ REPEAT STRIP_TAC \\ POP_ASSUM MP_TAC
\\ ONCE_REWRITE_TAC [read_string_def]
\\ Cases_on `s` \\ SIMP_TAC (srw_ss()) []
\\ SRW_TAC [] [LENGTH] \\ RES_TAC \\ TRY DECIDE_TAC
\\ SRW_TAC [] [LENGTH] \\ Cases_on `t'`
\\ FULL_SIMP_TAC (srw_ss()) [] \\ CCONTR_TAC
\\ Q.PAT_X_ASSUM `(x1, l', x2) = xxx` MP_TAC
\\ SIMP_TAC std_ss [] \\ SRW_TAC [] []
\\ REPEAT STRIP_TAC \\ FULL_SIMP_TAC std_ss []
\\ RES_TAC \\ TRY DECIDE_TAC \\ CCONTR_TAC
\\ FULL_SIMP_TAC std_ss [LENGTH] \\ DECIDE_TAC
QED
(* skip_nested_comment input d loc: skip the rest of a block comment whose
   opening {- has already been consumed. d counts the additional comment
   openers currently pending, so d = 0 means the next -} ends the comment.
   Returns SOME of the input after the final -} and the location just past
   it, or NONE if fewer than two characters remain before the comment is
   closed. A character with code 10 (newline) moves the location to the
   next line; any other skipped character advances it by one column. *)
Definition skip_nested_comment_def:
(skip_nested_comment "" d _ = NONE) /\
(skip_nested_comment [x] d _ = NONE) /\
(skip_nested_comment (x::y::xs) d loc =
if [x;y] = "{-" then
skip_nested_comment xs (d+1:num) (next_loc 2 loc)
else if [x;y] = "-}" then
(if d = 0 then SOME (xs, next_loc 2 loc)
else skip_nested_comment xs (d-1) (next_loc 2 loc))
else if ORD x = 10 then
skip_nested_comment (y::xs) d (next_line loc)
else skip_nested_comment (y::xs) d (next_loc 1 loc))
End
(* When skip_nested_comment succeeds, the input it returns is no longer
   than the input it was given. Used in the termination proof of
   next_sym_alt. *)
Theorem skip_nested_comment_thm:
∀xs d l l' str. skip_nested_comment xs d l = SOME (str, l') ⇒
LENGTH str ≤ LENGTH xs
Proof
ho_match_mp_tac skip_nested_comment_ind >> simp[skip_nested_comment_def] >>
rw[] >> gvs[]
QED
(* skip_eol_comment input loc: drop characters up to and including the next
   newline. Returns SOME of the remaining input and the location at the
   start of the following line; if the input ends first, returns SOME of
   the empty string and the location reached. No clause returns NONE. *)
Definition skip_eol_comment_def:
skip_eol_comment "" l = SOME ("", l) ∧
skip_eol_comment (c::cs) l =
if c = #"\n" then SOME (cs, next_line l)
else skip_eol_comment cs (next_loc 1 l)
End
(* The input returned by skip_eol_comment is no longer than the input it
   was given. Used in the termination proof of next_sym_alt. *)
Theorem skip_eol_comment_thm:
∀cs l cs' l'. skip_eol_comment cs l = SOME (cs', l') ⇒ LENGTH cs' ≤ LENGTH cs
Proof
Induct >>
simp[skip_eol_comment_def, AllCaseEqs(), DISJ_IMP_THM, FORALL_AND_THM] >>
rw[] >> first_x_assum drule >> simp[]
QED
(* unhex_alt c: the numeric value of a hexadecimal digit character, as given
   by UNHEX; 0 for any character that is not a hexadecimal digit. *)
Definition unhex_alt_def:
unhex_alt x = (if isHexDigit x then UNHEX x else 0n)
End
(* Base-10 string-to-number conversion built from s2n and unhex_alt.
   toNum_rw below shows it agrees with toNum on strings of decimal
   digits. *)
Definition num_from_dec_string_alt_def:
num_from_dec_string_alt = s2n 10 unhex_alt
End
(* Base-16 string-to-number conversion built from s2n and unhex_alt.
   num_from_hex_string_rw below shows it agrees with num_from_hex_string on
   strings of hexadecimal digits. *)
Definition num_from_hex_string_alt_def:
num_from_hex_string_alt = s2n 16 unhex_alt
End
(* read_FFIcall input acc loc: read the name inside an FFI form up to the
   closing parenthesis. next_sym_alt calls it after consuming the two-
   character opener.
     acc - characters of the name read so far, in reverse order
   Returns a triple of symbol, location and unconsumed input:
     - a closing parenthesis yields FFIS (REVERSE acc), the current location
       advanced by two columns, and the input after the parenthesis;
     - end of input or a newline yields ErrorS;
     - other whitespace is skipped without being added to acc;
     - any other character is added to acc.
   NOTE(review): the end location is next_loc 2 of the position of the
   closing parenthesis rather than that position itself - confirm this is
   intended. *)
Definition read_FFIcall_def:
(read_FFIcall "" acc loc = (ErrorS, loc, "")) ∧
(read_FFIcall (c::s0) acc loc =
if c = #")" then
(FFIS (REVERSE acc), next_loc 2 loc, s0)
else if c = #"\n" then (ErrorS, loc, s0)
else if isSpace c then
read_FFIcall s0 acc (next_loc 1 loc)
else
read_FFIcall s0 (c::acc) (next_loc 1 loc))
End
(* The input returned by read_FFIcall is no longer than the input it was
   given (stated as a strict bound against LENGTH s0 + 1). *)
Theorem read_FFIcall_reduces_input:
∀s0 a l0 t l s.
read_FFIcall s0 a l0 = (t, l, s) ⇒ LENGTH s < LENGTH s0 + 1
Proof
Induct >> dsimp[read_FFIcall_def, bool_case_eq] >> rw[] >>
qpat_x_assum `_ = _` (assume_tac o SYM) >> res_tac >> simp[]
QED
(*
  next_sym_alt input loc: read the next symbol from input, where loc is the
  location of the first character of input.

  Returns NONE when the input is exhausted (possibly after skipping
  whitespace and comments). Otherwise returns SOME (sym, Locs start end,
  rest), where start and end are the locations of the first and last
  character of the symbol and rest is the unconsumed input.

  Cases, tried in this order on the first character c:
    - newline and other whitespace are skipped;
    - a digit starts a number: the 0w prefix gives a decimal word literal,
      0wx a hexadecimal word literal, 0x a hexadecimal integer, anything
      else a decimal integer;
    - a tilde directly followed by a digit starts a negative integer;
    - a prime starts a type variable;
    - a double quote starts a string literal (see read_string);
    - a backquote is a symbol on its own;
    - hash followed by a double quote starts a character literal;
    - hash followed by an open parenthesis starts an FFI name
      (see read_FFIcall);
    - {- starts a block comment, which may nest; an unterminated block
      comment yields ErrorS;
    - -- starts an end-of-line comment, unless at least two characters
      follow the first dash and the character right after the second dash
      satisfies isPunct;
    - delimiters are single-character symbols;
    - symbol characters form a symbolic identifier;
    - a letter starts an alphanumeric identifier, which becomes a long
      identifier if it is followed by a dot and a further alphanumeric or
      symbolic identifier; a dot followed by anything else yields ErrorS;
    - a lone underscore is a symbol on its own;
    - anything else yields ErrorS.

  Termination: the comment-skipping helpers never return a longer input
  (skip_nested_comment_thm, skip_eol_comment_thm), and every recursive call
  has consumed at least the character c.
*)
Definition next_sym_alt_def:
  (next_sym_alt "" _ = NONE) /\
  (next_sym_alt (c::str) loc =
     if c = #"\n" then (* skip new line *)
       next_sym_alt str (next_line loc)
     else if isSpace c then (* skip blank space *)
       next_sym_alt str (next_loc 1 loc)
     else if isDigit c then (* read number *)
       if str ≠ "" ∧ c = #"0" ∧ HD str = #"w" then
         if TL str = "" then SOME (ErrorS, Locs loc loc, "")
         else if isDigit (HD (TL str)) then
           (* word literal in decimal: two prefix characters, then n *)
           let (n,rest) = read_while isDigit (TL str) [] in
             SOME (WordS (num_from_dec_string_alt n),
                   Locs loc (next_loc (LENGTH n + 1) loc),
                   rest)
         else if HD(TL str) = #"x" then
           (* word literal in hexadecimal: three prefix characters, then n *)
           let (n,rest) = read_while isHexDigit (TL (TL str)) [] in
             SOME (WordS (num_from_hex_string_alt n),
                   Locs loc (next_loc (LENGTH n + 2) loc),
                   rest)
         else SOME (ErrorS, Locs loc loc, TL str)
       else
         if str ≠ "" ∧ c = #"0" ∧ HD str = #"x" then
           (* hexadecimal integer: the symbol is the two prefix characters
              followed by n, so its last character lies LENGTH n + 1 columns
              after loc, in line with the word-literal cases above *)
           let (n,rest) = read_while isHexDigit (TL str) [] in
             SOME (NumberS (& num_from_hex_string_alt n),
                   Locs loc (next_loc (LENGTH n + 1) loc),
                   rest)
         else
           (* decimal integer: n holds the digits after c *)
           let (n,rest) = read_while isDigit str [] in
             SOME (NumberS (&(num_from_dec_string_alt (c::n))),
                   Locs loc (next_loc (LENGTH n) loc),
                   rest)
     else if c = #"~" /\ str <> "" /\ isDigit (HD str) then
       (* read negative number *)
       let (n,rest) = read_while isDigit str [] in
         SOME (NumberS (0- &(num_from_dec_string_alt n)),
               Locs loc (next_loc (LENGTH n) loc),
               rest)
     else if c = #"'" then (* read type variable *)
       let (n,rest) = read_while isAlphaNumPrime str [c] in
         SOME (OtherS n,
               Locs loc (next_loc (LENGTH n - 1) loc),
               rest)
     else if c = #"\"" then (* read string *)
       let (t, loc', rest) = read_string str "" (next_loc 1 loc) in
         SOME (t, Locs loc loc', rest)
     else if c = #"`" then SOME (OtherS "`", Locs loc loc, str)
     else if isPREFIX "#\"" (c::str) then
       (* character literal: read as a string, then require length one *)
       let (t, loc', rest) = read_string (TL str) "" (next_loc 2 loc) in
         SOME (mkCharS t, Locs loc loc', rest)
     else if isPREFIX "#(" (c::str) then
       let (t, loc', rest) =
             read_FFIcall (TL str) "" (next_loc 2 loc)
       in
         SOME (t, Locs loc loc', rest)
     else if isPREFIX "{-" (c::str) then
       case skip_nested_comment (TL str) (0:num) (next_loc 2 loc) of
       | NONE => SOME (ErrorS, Locs loc (next_loc 2 loc), "")
       | SOME (rest, loc') => next_sym_alt rest loc'
     else if isPREFIX "--" (c::str) ∧ (2 ≤ LENGTH str ⇒ ¬isPunct (EL 1 str))
     then
       case skip_eol_comment (TL str) (next_loc 2 loc) of
         NONE => SOME (ErrorS, Locs loc (next_loc 2 loc), "")
       | SOME (rest, loc') => next_sym_alt rest loc'
     else if is_single_char_symbol c then (* single character tokens, i.e. delimiters *)
       SOME (OtherS [c], Locs loc loc, str)
     else if isSymbol c then
       let (n,rest) = read_while isSymbol str [c] in
         SOME (OtherS n,
               Locs loc (next_loc (LENGTH n - 1) loc),
               rest)
     else if isAlpha c then (* read identifier *)
       let (n,rest) = read_while isAlphaNumPrime str [c] in
         case rest of
           #"."::rest' =>
             (case rest' of
                c'::rest' =>
                  if isAlpha c' then
                    let (n', rest'') = read_while isAlphaNumPrime rest' [c'] in
                      SOME (LongS (n ++ "." ++ n'),
                            Locs loc
                                 (next_loc (LENGTH n + LENGTH n') loc),
                            rest'')
                  else if isSymbol c' then
                    let (n', rest'') = read_while isSymbol rest' [c'] in
                      SOME (LongS (n ++ "." ++ n'),
                            Locs loc
                                 (next_loc (LENGTH n + LENGTH n') loc),
                            rest'')
                  else
                    SOME (ErrorS,
                          Locs loc (next_loc (LENGTH n) loc),
                          rest')
              | "" => SOME (ErrorS,
                            Locs loc (next_loc (LENGTH n) loc),
                            []))
         | _ => SOME (OtherS n,
                      Locs loc (next_loc (LENGTH n - 1) loc),
                      rest)
     else if c = #"_" then SOME (OtherS "_", Locs loc loc, str)
     else (* input not recognised *)
       SOME (ErrorS, Locs loc loc, str))
Termination
  WF_REL_TAC ‘measure (LENGTH o FST)’ >> rpt strip_tac >> simp[] >~
  [‘skip_nested_comment’]
  >- (drule skip_nested_comment_thm >> rename [‘TL str’] >> Cases_on ‘str’ >>
      gs[]) >~
  [‘skip_eol_comment’]
  >- (drule skip_eol_comment_thm >> rename [‘TL str’] >> Cases_on ‘str’ >>
      gs[])
End
(* On a string of decimal digits, mapping UNHEX and mapping unhex_alt give
   the same list. *)
Triviality EVERY_isDigit_imp:
EVERY isDigit x ⇒ MAP UNHEX x = MAP unhex_alt x
Proof
rw[]>>match_mp_tac LIST_EQ>>
fs[EL_MAP,EVERY_EL,unhex_alt_def,isDigit_def,isHexDigit_def]
QED
(* On a string of decimal digits, toNum agrees with
   num_from_dec_string_alt. *)
Triviality toNum_rw:
∀x. EVERY isDigit x ⇒ toNum x = num_from_dec_string_alt x
Proof
rw[ASCIInumbersTheory.s2n_def,ASCIInumbersTheory.num_from_dec_string_def,
num_from_dec_string_alt_def]>>
AP_TERM_TAC>>
match_mp_tac EVERY_isDigit_imp>>
metis_tac[rich_listTheory.EVERY_REVERSE]
QED
(* On a string of hexadecimal digits, mapping UNHEX and mapping unhex_alt
   give the same list. *)
Triviality EVERY_isHexDigit_imp:
EVERY isHexDigit x ⇒ MAP UNHEX x = MAP unhex_alt x
Proof
rw[]>>match_mp_tac LIST_EQ>>fs[EL_MAP,EVERY_EL,unhex_alt_def]
QED
(* On a string of hexadecimal digits, num_from_hex_string agrees with
   num_from_hex_string_alt. *)
Triviality num_from_hex_string_rw:
∀x. EVERY isHexDigit x ⇒ num_from_hex_string x = num_from_hex_string_alt x
Proof
rw[ASCIInumbersTheory.s2n_def,ASCIInumbersTheory.num_from_hex_string_def,
num_from_hex_string_alt_def]>>
AP_TERM_TAC>>
match_mp_tac EVERY_isHexDigit_imp>>
metis_tac[rich_listTheory.EVERY_REVERSE]
QED
(* IMPLODE does not affect whether every character satisfies a predicate. *)
Triviality EVERY_IMPLODE:
∀ls P. EVERY P (IMPLODE ls) ⇔ EVERY P ls
Proof Induct>>fs[]
QED
(* If every character of the accumulator satisfies P, then so does every
   character of the string returned by read_while P. Generalised over the
   accumulator so that the induction goes through. *)
Triviality read_while_P_lem:
∀ ls rest P x y.
EVERY P rest ∧ read_while P ls rest = (x,y) ⇒ EVERY P x
Proof
Induct>>fs[read_while_def]>>rw[]>>
fs[EVERY_IMPLODE,rich_listTheory.EVERY_REVERSE]>>
first_assum match_mp_tac>>fs[]>>
qexists_tac`STRING h rest`>>fs[]
QED
(* Starting from an empty accumulator, every character of the string
   returned by read_while P satisfies P. Instance of read_while_P_lem. *)
Theorem read_while_P[local]:
∀ls P x y. read_while P ls "" = (x,y) ⇒ EVERY P x
Proof
rw[]>>ho_match_mp_tac read_while_P_lem>>
MAP_EVERY qexists_tac [`ls`,`""`,`y`]>>fs[]
QED
(* Progress: whenever next_sym_alt returns a symbol, the remaining input is
   strictly shorter than the input it was given. Proved by the induction
   principle of next_sym_alt; the cases are discharged with the length
   bounds for read_while, read_string, read_FFIcall and the two
   comment-skipping functions. *)
Theorem next_sym_alt_LESS:
∀input0 locn input.
next_sym_alt input0 locn = SOME (sym, locn', input) ⇒
LENGTH input < LENGTH input0
Proof
recInduct next_sym_alt_ind >> simp[next_sym_alt_def] >> rw[] >>
rpt (pairarg_tac >> gvs[AllCaseEqs()]) >> gvs[AllCaseEqs()] >>~-
([‘TL (TL str)’, ‘read_while’],
Cases_on ‘str’ >> gvs[] >> rename [‘TL str0’] >>
Cases_on ‘str0’ >> gvs[] >> drule read_while_thm >> simp[]) >>~-
([‘TL str’, ‘read_while’],
Cases_on ‘str’ >> gvs[] >> drule read_while_thm >> simp[]) >>~-
([‘read_while’], rpt $ dxrule read_while_thm >> simp[]) >~
[‘skip_nested_comment (TL str) _ _ = _’]
>- (drule skip_nested_comment_thm >> Cases_on ‘str’ >> gvs[]) >~
[‘read_string’, ‘TL str’]
>- (Cases_on ‘str’ >> gvs[] >> drule read_string_thm >> simp[]) >~
[‘read_string’]
>- (drule read_string_thm >> simp[]) >~
[‘read_FFIcall (TL str)’]
>- (Cases_on ‘str’ >> gs[] >> drule read_FFIcall_reduces_input >> simp[]) >~
[‘skip_eol_comment’]
>- (drule skip_eol_comment_thm >> Cases_on ‘str’ >> gvs[]) >>
rename [‘TL str’] >> Cases_on ‘str’ >> gs[]
QED
(* conversion of symbols to tokens *)
(* processIdent s: classify an identifier string by its first character.
   The empty string gives LexErrorT, a leading letter gives AlphaT, and any
   other leading character gives SymbolT. The token carries the whole
   string. *)
Definition processIdent_def:
processIdent s =
case s of
| "" => LexErrorT
| c::s =>
if isAlpha c then
AlphaT (c::s)
else
SymbolT (c::s)
End
(* get_token s: map the text of an OtherS symbol to a token. Punctuation
   and the reserved words listed below get their dedicated tokens; every
   other string is classified by processIdent. The nocompute attribute
   keeps this defining equation out of the global evaluation compset. *)
Definition get_token_def[nocompute]:
get_token s =
if s = "(" then LparT else
if s = ")" then RparT else
if s = "," then CommaT else
if s = ";" then SemicolonT else
if s = "=" then EqualsT else
if s = "[" then LbrackT else
if s = "]" then RbrackT else
if s = "{" then LbraceT else
if s = "}" then RbraceT else
if s = "|" then BarT else
if s = "_" then UnderbarT else
if s = "case" then CaseT else
if s = "else" then ElseT else
if s = "exception" then ExceptionT else
if s = "if" then IfT else
if s = "in" then InT else
if s = "include" then IncludeT else
if s = "let" then LetT else
if s = "of" then OfT else
if s = "then" then ThenT else
if s = "type" then TypeT else
if s = "where" then WhereT else
processIdent s
End
(* token_of_sym sym: convert a lexical symbol into a token. Literals map to
   the corresponding literal tokens and ErrorS to LexErrorT. A LongS string
   is split at its first dot into the two arguments of LongidT, with the
   dot itself dropped. OtherS strings are handled by get_token. *)
Definition token_of_sym_def:
token_of_sym s =
case s of
| ErrorS => LexErrorT
| StringS s => StringT s
| CharS c => CharT c
| NumberS i => IntT i
| WordS n => WordT n
| LongS s => let (s1,s2) = SPLITP (\x. x = #".") s in
LongidT s1 (case s2 of "" => "" | (c::cs) => cs)
| FFIS s => FFIT s
| OtherS s => get_token s
End
(* next_token input loc: read the next symbol with next_sym_alt and convert
   it to a token with token_of_sym. Returns NONE when next_sym_alt does;
   the locations and remaining input are passed through unchanged. *)
Definition next_token_def:
next_token input loc =
case next_sym_alt input loc of
| NONE => NONE
| SOME (sym, locs, rest_of_input) =>
SOME (token_of_sym sym, locs, rest_of_input)
End
(* Progress: whenever next_token returns a token, the remaining input is
   strictly shorter than the input it was given. Follows from
   next_sym_alt_LESS; used for the termination of lexer_fun_aux. *)
Theorem next_token_LESS:
∀s l l' rest input. (next_token input l = SOME (s, l', rest)) ⇒
LENGTH rest < LENGTH input
Proof
rpt gen_tac >> Cases_on ‘next_sym_alt input l’ >>
simp[next_token_def, AllCaseEqs(), PULL_EXISTS] >> rw[] >>
drule next_sym_alt_LESS >> simp[]
QED
(* top-level lexer specification *)
(* lexer_fun_aux input loc: lex the whole input into a list of tokens, each
   paired with its start and end locations. Lexing stops when next_token
   returns NONE. After each token, lexing resumes one column past the
   reported end location of that token.
   Termination: next_token_LESS shows the input shrinks at every step. *)
Definition lexer_fun_aux_def:
lexer_fun_aux input loc =
case next_token input loc of
| NONE => []
| SOME (token, Locs loc' loc'', rest_of_input) =>
(token, Locs loc' loc'') ::
lexer_fun_aux rest_of_input (next_loc 1 loc'')
Termination
WF_REL_TAC ‘measure (LENGTH o FST)’ >> rw[] >> imp_res_tac next_token_LESS
End
(* lexer_fun input: lex a complete input string, starting at row 1,
   column 1. *)
Definition lexer_fun_def:
lexer_fun input = lexer_fun_aux input (POSN 1 1)
End
val _ = export_theory();