[spec] Tweak tokenisation for text format (WebAssembly#1499)

dhil · Aug 2, 2022 · 11d1a53 · 11d1a53
1 parent c502b7a
commit 11d1a53
Show file tree

Hide file tree

Showing 3 changed files with 284 additions and 8 deletions.
diff --git a/document/core/text/lexical.rst b/document/core/text/lexical.rst
@@ -50,7 +50,7 @@ The character stream in the source text is divided, from left to right, into a s
      (\text{a} ~|~ \dots ~|~ \text{z})~\Tidchar^\ast
      \qquad (\mbox{if occurring as a literal terminal in the grammar}) \\
    \production{reserved} & \Treserved &::=&
-     \Tidchar^+ \\
+     (\Tidchar ~|~ \Tstring)^+ \\
    \end{array}
 
 Tokens are formed from the input character stream according to the *longest match* rule.
@@ -63,9 +63,9 @@ The set of *keyword* tokens is defined implicitly, by all occurrences of a :ref:
 Any token that does not fall into any of the other categories is considered *reserved*, and cannot occur in source text.
 
 .. note::
-   The effect of defining the set of reserved tokens is that all tokens must be separated by either parentheses or :ref:`white space <text-space>`.
-   For example, :math:`\text{0\$x}` is a single reserved token.
-   Consequently, it is not recognized as two separate tokens :math:`\text{0}` and :math:`\text{\$x}`, but instead disallowed.
+   The effect of defining the set of reserved tokens is that all tokens must be separated by either parentheses, :ref:`white space <text-space>`, or :ref:`comments <text-comment>`.
+   For example, :math:`\text{0\$x}` is a single reserved token, as is :math:`\text{"a""b"}`.
+   Consequently, they are not recognized as two separate tokens :math:`\text{0}` and :math:`\text{\$x}`, or :math:`\text{"a"}` and :math:`\text{"b"}`, respectively, but are instead disallowed.
    This property of tokenization is not affected by the fact that the definition of reserved tokens overlaps with other token classes.
 
 

diff --git a/interpreter/text/lexer.mll b/interpreter/text/lexer.mll
@@ -63,8 +63,9 @@ let symbol =
   ['+''-''*''/''\\''^''~''=''<''>''!''?''@''#''$''%''&''|'':''`''.''\'']
 
 let space = [' ''\t''\n''\r']
+let control = ['\x00'-'\x1f'] # space
 let ascii = ['\x00'-'\x7f']
-let ascii_no_nl = ['\x00'-'\x09''\x0b'-'\x7f']
+let ascii_no_nl = ascii # '\x0a'
 let utf8cont = ['\x80'-'\xbf']
 let utf8enc =
     ['\xc2'-'\xdf'] utf8cont
@@ -104,7 +105,7 @@ let name = idchar+
 let id = '$' name
 
 let keyword = ['a'-'z'] (letter | digit | '_' | '.' | ':')+
-let reserved = name | ',' | ';' | '[' | ']' | '{' | '}'
+let reserved = (idchar | string)+ | ',' | ';' | '[' | ']' | '{' | '}'
 
 let ixx = "i" ("32" | "64")
 let fxx = "f" ("32" | "64")
@@ -713,13 +714,14 @@ rule token = parse
   | eof { EOF }
 
   | reserved { unknown lexbuf }
-  | utf8 { error lexbuf "malformed operator" }
+  | control { error lexbuf "misplaced control character" }
+  | utf8enc { error lexbuf "misplaced unicode character" }
   | _ { error lexbuf "malformed UTF-8 encoding" }
 
 and comment start = parse
   | ";)" { () }
   | "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; comment start lexbuf }
   | '\n' { Lexing.new_line lexbuf; comment start lexbuf }
+  | utf8_no_nl { comment start lexbuf }
   | eof { error_nest start lexbuf "unclosed comment" }
-  | utf8 { comment start lexbuf }
   | _ { error lexbuf "malformed UTF-8 encoding" }
diff --git a/test/core/tokens.wast b/test/core/tokens.wast
@@ -0,0 +1,274 @@
+;; Tokens can be delimited by parentheses
+
+(module
+  (func(nop))
+)
+(module
+  (func (nop)nop)
+)
+(module
+  (func nop(nop))
+)
+(module
+  (func(nop)(nop))
+)
+(module
+  (func $f(nop))
+)
+(module
+  (func br 0(nop))
+)
+(module
+  (table 1 funcref)
+  (func)
+  (elem (i32.const 0)0)
+)
+(module
+  (table 1 funcref)
+  (func $f)
+  (elem (i32.const 0)$f)
+)
+(module
+  (memory 1)
+  (data (i32.const 0)"a")
+)
+(module
+  (import "spectest" "print"(func))
+)
+
+
+;; Tokens can be delimited by comments
+
+(module
+  (func;;bla
+  )
+)
+(module
+  (func (nop);;bla
+  )
+)
+(module
+  (func nop;;bla
+  )
+)
+(module
+  (func $f;;bla
+  )
+)
+(module
+  (func br 0;;bla
+  )
+)
+(module
+  (data "a";;bla
+  )
+)
+
+
+;; Space required between symbols and non-parenthesis tokens
+
+(module
+  (func (block $l (i32.const 0) (br_table 0 $l)))
+)
+(assert_malformed
+  (module quote
+    "(func (block $l (i32.const 0) (br_table 0$l)))"
+  )
+  "unknown operator"
+)
+
+(module
+  (func (block $l (i32.const 0) (br_table $l 0)))
+)
+(assert_malformed
+  (module quote
+    "(func (block $l (i32.const 0) (br_table $l0)))"
+  )
+  "unknown label"
+)
+
+(module
+  (func (block $l (i32.const 0) (br_table $l $l)))
+)
+(assert_malformed
+  (module quote
+    "(func (block $l (i32.const 0) (br_table $l$l)))"
+  )
+  "unknown label"
+)
+
+(module
+  (func (block $l0 (i32.const 0) (br_table $l0)))
+)
+(module
+  (func (block $l$l (i32.const 0) (br_table $l$l)))
+)
+
+
+;; Space required between strings and non-parenthesis tokens
+
+(module
+  (data "a")
+)
+(assert_malformed
+  (module quote
+    "(data\"a\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data $l "a")
+)
+(assert_malformed
+  (module quote
+    "(data $l\"a\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data $l " a")
+)
+(assert_malformed
+  (module quote
+    "(data $l\" a\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data $l "a ")
+)
+(assert_malformed
+  (module quote
+    "(data $l\"a \")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data $l "a " "b")
+)
+(assert_malformed
+  (module quote
+    "(data $l\"a \"\"b\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data $l "")
+)
+(assert_malformed
+  (module quote
+    "(data $l\"\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data $l " ")
+)
+(assert_malformed
+  (module quote
+    "(data $l\" \")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data $l " ")
+)
+(assert_malformed
+  (module quote
+    "(data $l\" \")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data "a" "b")
+)
+(assert_malformed
+  (module quote
+    "(data \"a\"\"b\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data "a" " b")
+)
+(assert_malformed
+  (module quote
+    "(data \"a\"\" b\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data "a " "b")
+)
+(assert_malformed
+  (module quote
+    "(data \"a \"\"b\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data "" "")
+)
+(assert_malformed
+  (module quote
+    "(data \"\"\"\")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data "" " ")
+)
+(assert_malformed
+  (module quote
+    "(data \"\"\" \")"
+  )
+  "unknown operator"
+)
+
+(module
+  (data " " "")
+)
+(assert_malformed
+  (module quote
+    "(data \" \"\"\")"
+  )
+  "unknown operator"
+)
+
+
+(assert_malformed
+  (module quote
+    "(func \"a\"x)"
+  )
+  "unknown operator"
+)
+(assert_malformed
+  (module quote
+    "(func \"a\"0)"
+  )
+  "unknown operator"
+)
+(assert_malformed
+  (module quote
+    "(func 0\"a\")"
+  )
+  "unknown operator"
+)
+(assert_malformed
+  (module quote
+    "(func \"a\"$x)"
+  )
+  "unknown operator"
+)