Skip to content

Commit

Permalink
[spec] Tweak tokenisation for text format (WebAssembly#1499)
Browse files Browse the repository at this point in the history
  • Loading branch information
rossberg committed Aug 2, 2022
1 parent c502b7a commit 11d1a53
Show file tree
Hide file tree
Showing 3 changed files with 284 additions and 8 deletions.
8 changes: 4 additions & 4 deletions document/core/text/lexical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ The character stream in the source text is divided, from left to right, into a s
(\text{a} ~|~ \dots ~|~ \text{z})~\Tidchar^\ast
\qquad (\mbox{if occurring as a literal terminal in the grammar}) \\
\production{reserved} & \Treserved &::=&
\Tidchar^+ \\
(\Tidchar ~|~ \Tstring)^+ \\
\end{array}
Tokens are formed from the input character stream according to the *longest match* rule.
Expand All @@ -63,9 +63,9 @@ The set of *keyword* tokens is defined implicitly, by all occurrences of a :ref:
Any token that does not fall into any of the other categories is considered *reserved*, and cannot occur in source text.

.. note::
The effect of defining the set of reserved tokens is that all tokens must be separated by either parentheses or :ref:`white space <text-space>`.
For example, :math:`\text{0\$x}` is a single reserved token.
Consequently, it is not recognized as two separate tokens :math:`\text{0}` and :math:`\text{\$x}`, but instead disallowed.
The effect of defining the set of reserved tokens is that all tokens must be separated by either parentheses, :ref:`white space <text-space>`, or :ref:`comments <text-comment>`.
For example, :math:`\text{0\$x}` is a single reserved token, as is :math:`\text{"a""b"}`.
Consequently, they are not recognized as two separate tokens :math:`\text{0}` and :math:`\text{\$x}`, or :math:`\text{"a"}` and :math:`\text{"b"}`, respectively, but instead disallowed.
This property of tokenization is not affected by the fact that the definition of reserved tokens overlaps with other token classes.


Expand Down
10 changes: 6 additions & 4 deletions interpreter/text/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ let symbol =
['+''-''*''/''\\''^''~''=''<''>''!''?''@''#''$''%''&''|'':''`''.''\'']

let space = [' ''\t''\n''\r']
let control = ['\x00'-'\x1f'] # space
let ascii = ['\x00'-'\x7f']
let ascii_no_nl = ['\x00'-'\x09''\x0b'-'\x7f']
let ascii_no_nl = ascii # '\x0a'
let utf8cont = ['\x80'-'\xbf']
let utf8enc =
['\xc2'-'\xdf'] utf8cont
Expand Down Expand Up @@ -104,7 +105,7 @@ let name = idchar+
let id = '$' name

let keyword = ['a'-'z'] (letter | digit | '_' | '.' | ':')+
let reserved = name | ',' | ';' | '[' | ']' | '{' | '}'
let reserved = (idchar | string)+ | ',' | ';' | '[' | ']' | '{' | '}'

let ixx = "i" ("32" | "64")
let fxx = "f" ("32" | "64")
Expand Down Expand Up @@ -713,13 +714,14 @@ rule token = parse
| eof { EOF }

| reserved { unknown lexbuf }
| utf8 { error lexbuf "malformed operator" }
| control { error lexbuf "misplaced control character" }
| utf8enc { error lexbuf "misplaced unicode character" }
| _ { error lexbuf "malformed UTF-8 encoding" }

and comment start = parse
| ";)" { () }
| "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; comment start lexbuf }
| '\n' { Lexing.new_line lexbuf; comment start lexbuf }
| utf8_no_nl { comment start lexbuf }
| eof { error_nest start lexbuf "unclosed comment" }
| utf8 { comment start lexbuf }
| _ { error lexbuf "malformed UTF-8 encoding" }
274 changes: 274 additions & 0 deletions test/core/tokens.wast
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
;; Tokens can be delimited by parentheses
;;
;; Every module in this section puts a parenthesis directly next to another
;; token with no white space in between. None of them is wrapped in
;; assert_malformed, so each one is expected to be accepted as written.

;; keyword immediately followed by "("
(module
(func(nop))
)
;; ")" immediately followed by a keyword
(module
(func (nop)nop)
)
;; keyword immediately followed by "(" inside a function body
(module
(func nop(nop))
)
;; keyword, ")" and "(" all adjacent
(module
(func(nop)(nop))
)
;; identifier immediately followed by "("
(module
(func $f(nop))
)
;; number immediately followed by "("
(module
(func br 0(nop))
)
;; ")" immediately followed by a number
(module
(table 1 funcref)
(func)
(elem (i32.const 0)0)
)
;; ")" immediately followed by an identifier
(module
(table 1 funcref)
(func $f)
(elem (i32.const 0)$f)
)
;; ")" immediately followed by a string
(module
(memory 1)
(data (i32.const 0)"a")
)
;; string immediately followed by "("
(module
(import "spectest" "print"(func))
)


;; Tokens can be delimited by comments
;;
;; Every module in this section starts a line comment directly after a token
;; with no white space in between. The line comment runs to the end of the
;; line, which is why the closing parenthesis sits on the following line.
;; None of these modules is wrapped in assert_malformed, so each one is
;; expected to be accepted as written.

;; keyword immediately followed by a line comment
(module
(func;;bla
)
)
;; ")" immediately followed by a line comment
(module
(func (nop);;bla
)
)
;; instruction keyword immediately followed by a line comment
(module
(func nop;;bla
)
)
;; identifier immediately followed by a line comment
(module
(func $f;;bla
)
)
;; number immediately followed by a line comment
(module
(func br 0;;bla
)
)
;; string immediately followed by a line comment
(module
(data "a";;bla
)
)


;; Space required between symbols and non-parenthesis tokens
;;
;; Each well-formed module below is paired with an assert_malformed variant in
;; which the white space between two tokens has been removed. The expected
;; error message differs depending on what the fused text turns into:
;;   - "0$l" is not a number, keyword or identifier, so the expected failure is
;;     "unknown operator".
;;   - "$l0" and "$l$l" are each expected to fail with "unknown label"; the
;;     last two modules of this section show that both spellings are accepted
;;     as identifiers once a block declares them.

;; number followed by identifier: fine with a space ...
(module
(func (block $l (i32.const 0) (br_table 0 $l)))
)
;; ... but "0$l" without the space is rejected
(assert_malformed
(module quote
"(func (block $l (i32.const 0) (br_table 0$l)))"
)
"unknown operator"
)

;; identifier followed by number: fine with a space ...
(module
(func (block $l (i32.const 0) (br_table $l 0)))
)
;; ... but "$l0" is a different identifier that no block declares here
(assert_malformed
(module quote
"(func (block $l (i32.const 0) (br_table $l0)))"
)
"unknown label"
)

;; identifier followed by identifier: fine with a space ...
(module
(func (block $l (i32.const 0) (br_table $l $l)))
)
;; ... but "$l$l" is a different identifier that no block declares here
(assert_malformed
(module quote
"(func (block $l (i32.const 0) (br_table $l$l)))"
)
"unknown label"
)

;; The fused spellings are accepted when a block is declared under that name.
(module
(func (block $l0 (i32.const 0) (br_table $l0)))
)
(module
(func (block $l$l (i32.const 0) (br_table $l$l)))
)


;; Space required between strings and non-parenthesis tokens
;;
;; Each well-formed module below is paired with an assert_malformed variant in
;; which the white space in front of a string literal has been removed, so the
;; string is glued to the preceding keyword or identifier. Every such variant
;; is expected to fail with "unknown operator". The cases vary the string
;; contents (empty, leading/trailing space, a second adjacent string) to show
;; that white space inside a string does not count as a separator.

;; keyword directly followed by a string
(module
(data "a")
)
(assert_malformed
(module quote
"(data\"a\")"
)
"unknown operator"
)

;; identifier directly followed by a string
(module
(data $l "a")
)
(assert_malformed
(module quote
"(data $l\"a\")"
)
"unknown operator"
)

;; string content starts with a space
(module
(data $l " a")
)
(assert_malformed
(module quote
"(data $l\" a\")"
)
"unknown operator"
)

;; string content ends with a space
(module
(data $l "a ")
)
(assert_malformed
(module quote
"(data $l\"a \")"
)
"unknown operator"
)

;; identifier followed by two strings, all glued together in the malformed case
(module
(data $l "a " "b")
)
(assert_malformed
(module quote
"(data $l\"a \"\"b\")"
)
"unknown operator"
)

;; empty string
(module
(data $l "")
)
(assert_malformed
(module quote
"(data $l\"\")"
)
"unknown operator"
)

;; string consisting only of white space
(module
(data $l " ")
)
(assert_malformed
(module quote
"(data $l\" \")"
)
"unknown operator"
)

;; NOTE(review): as shown here this pair is identical to the previous one;
;; presumably the two originally differed in the white space inside the
;; string -- confirm against the upstream file.
(module
(data $l " ")
)
(assert_malformed
(module quote
"(data $l\" \")"
)
"unknown operator"
)

;; Two string literals in a row need a separator between them.
;;
;; Each well-formed module below lists two strings separated by a space; the
;; paired assert_malformed variant removes that space so the closing quote of
;; the first string touches the opening quote of the second. Every such
;; variant is expected to fail with "unknown operator".

;; two non-empty strings
(module
(data "a" "b")
)
(assert_malformed
(module quote
"(data \"a\"\"b\")"
)
"unknown operator"
)

;; second string starts with a space
(module
(data "a" " b")
)
(assert_malformed
(module quote
"(data \"a\"\" b\")"
)
"unknown operator"
)

;; first string ends with a space
(module
(data "a " "b")
)
(assert_malformed
(module quote
"(data \"a \"\"b\")"
)
"unknown operator"
)

;; two empty strings
(module
(data "" "")
)
(assert_malformed
(module quote
"(data \"\"\"\")"
)
"unknown operator"
)

;; empty string followed by a white-space-only string
(module
(data "" " ")
)
(assert_malformed
(module quote
"(data \"\"\" \")"
)
"unknown operator"
)

;; white-space-only string followed by an empty string
(module
(data " " "")
)
(assert_malformed
(module quote
"(data \" \"\"\")"
)
"unknown operator"
)


;; Strings glued to keywords, numbers, or identifiers
;;
;; These cases have no well-formed counterpart in this section: each one only
;; checks that a string literal touching a neighbouring non-parenthesis token
;; is expected to be rejected with "unknown operator".

;; string directly followed by a keyword-like name
(assert_malformed
(module quote
"(func \"a\"x)"
)
"unknown operator"
)
;; string directly followed by a number
(assert_malformed
(module quote
"(func \"a\"0)"
)
"unknown operator"
)
;; number directly followed by a string
(assert_malformed
(module quote
"(func 0\"a\")"
)
"unknown operator"
)
;; string directly followed by an identifier
(assert_malformed
(module quote
"(func \"a\"$x)"
)
"unknown operator"
)

0 comments on commit 11d1a53

Please sign in to comment.