diff --git a/document/core/text/lexical.rst b/document/core/text/lexical.rst index a3b4529d35..0c40a58d15 100644 --- a/document/core/text/lexical.rst +++ b/document/core/text/lexical.rst @@ -50,7 +50,7 @@ The character stream in the source text is divided, from left to right, into a s (\text{a} ~|~ \dots ~|~ \text{z})~\Tidchar^\ast \qquad (\mbox{if occurring as a literal terminal in the grammar}) \\ \production{reserved} & \Treserved &::=& - \Tidchar^+ \\ + (\Tidchar ~|~ \Tstring)^+ \\ \end{array} Tokens are formed from the input character stream according to the *longest match* rule. @@ -63,9 +63,9 @@ The set of *keyword* tokens is defined implicitly, by all occurrences of a :ref: Any token that does not fall into any of the other categories is considered *reserved*, and cannot occur in source text. .. note:: - The effect of defining the set of reserved tokens is that all tokens must be separated by either parentheses or :ref:`white space <text-space>`. - For example, :math:`\text{0\$x}` is a single reserved token. - Consequently, it is not recognized as two separate tokens :math:`\text{0}` and :math:`\text{\$x}`, but instead disallowed. + The effect of defining the set of reserved tokens is that all tokens must be separated by either parentheses, :ref:`white space <text-space>`, or :ref:`comments <text-comment>`. + For example, :math:`\text{0\$x}` is a single reserved token, as is :math:`\text{"a""b"}`. + Consequently, they are not recognized as two separate tokens :math:`\text{0}` and :math:`\text{\$x}`, or :math:`\text{"a"}` and :math:`\text{"b"}`, respectively, but instead disallowed. This property of tokenization is not affected by the fact that the definition of reserved tokens overlaps with other token classes. 
diff --git a/interpreter/text/lexer.mll b/interpreter/text/lexer.mll index 161520e0a4..d9a12b5d21 100644 --- a/interpreter/text/lexer.mll +++ b/interpreter/text/lexer.mll @@ -62,8 +62,9 @@ let symbol = ['+''-''*''/''\\''^''~''=''<''>''!''?''@''#''$''%''&''|'':''`''.''\''] let space = [' ''\t''\n''\r'] +let control = ['\x00'-'\x1f'] # space let ascii = ['\x00'-'\x7f'] -let ascii_no_nl = ['\x00'-'\x09''\x0b'-'\x7f'] +let ascii_no_nl = ascii # '\x0a' let utf8cont = ['\x80'-'\xbf'] let utf8enc = ['\xc2'-'\xdf'] utf8cont @@ -103,7 +104,7 @@ let name = idchar+ let id = '$' name let keyword = ['a'-'z'] (letter | digit | '_' | '.' | ':')+ -let reserved = name | ',' | ';' | '[' | ']' | '{' | '}' +let reserved = (idchar | string)+ | ',' | ';' | '[' | ']' | '{' | '}' let ixx = "i" ("32" | "64") let fxx = "f" ("32" | "64") @@ -705,13 +706,14 @@ rule token = parse | eof { EOF } | reserved { unknown lexbuf } - | utf8 { error lexbuf "malformed operator" } + | control { error lexbuf "misplaced control character" } + | utf8enc { error lexbuf "misplaced unicode character" } | _ { error lexbuf "malformed UTF-8 encoding" } and comment start = parse | ";)" { () } | "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; comment start lexbuf } | '\n' { Lexing.new_line lexbuf; comment start lexbuf } + | utf8_no_nl { comment start lexbuf } | eof { error_nest start lexbuf "unclosed comment" } - | utf8 { comment start lexbuf } | _ { error lexbuf "malformed UTF-8 encoding" } diff --git a/test/core/tokens.wast b/test/core/tokens.wast new file mode 100644 index 0000000000..4e785154e9 --- /dev/null +++ b/test/core/tokens.wast @@ -0,0 +1,274 @@ +;; Tokens can be delimited by parentheses + +(module + (func(nop)) +) +(module + (func (nop)nop) +) +(module + (func nop(nop)) +) +(module + (func(nop)(nop)) +) +(module + (func $f(nop)) +) +(module + (func br 0(nop)) +) +(module + (table 1 funcref) + (func) + (elem (i32.const 0)0) +) +(module + (table 1 funcref) + (func $f) + (elem (i32.const 0)$f) +) 
+(module + (memory 1) + (data (i32.const 0)"a") +) +(module + (import "spectest" "print"(func)) +) + + +;; Tokens can be delimited by comments + +(module + (func;;bla + ) +) +(module + (func (nop);;bla + ) +) +(module + (func nop;;bla + ) +) +(module + (func $f;;bla + ) +) +(module + (func br 0;;bla + ) +) +(module + (data "a";;bla + ) +) + + +;; Space required between symbols and non-parenthesis tokens + +(module + (func (block $l (i32.const 0) (br_table 0 $l))) +) +(assert_malformed + (module quote + "(func (block $l (i32.const 0) (br_table 0$l)))" + ) + "unknown operator" +) + +(module + (func (block $l (i32.const 0) (br_table $l 0))) +) +(assert_malformed + (module quote + "(func (block $l (i32.const 0) (br_table $l0)))" + ) + "unknown label" +) + +(module + (func (block $l (i32.const 0) (br_table $l $l))) +) +(assert_malformed + (module quote + "(func (block $l (i32.const 0) (br_table $l$l)))" + ) + "unknown label" +) + +(module + (func (block $l0 (i32.const 0) (br_table $l0))) +) +(module + (func (block $l$l (i32.const 0) (br_table $l$l))) +) + + +;; Space required between strings and non-parenthesis tokens + +(module + (data "a") +) +(assert_malformed + (module quote + "(data\"a\")" + ) + "unknown operator" +) + +(module + (data $l "a") +) +(assert_malformed + (module quote + "(data $l\"a\")" + ) + "unknown operator" +) + +(module + (data $l " a") +) +(assert_malformed + (module quote + "(data $l\" a\")" + ) + "unknown operator" +) + +(module + (data $l "a ") +) +(assert_malformed + (module quote + "(data $l\"a \")" + ) + "unknown operator" +) + +(module + (data $l "a " "b") +) +(assert_malformed + (module quote + "(data $l\"a \"\"b\")" + ) + "unknown operator" +) + +(module + (data $l "") +) +(assert_malformed + (module quote + "(data $l\"\")" + ) + "unknown operator" +) + +(module + (data $l " ") +) +(assert_malformed + (module quote + "(data $l\" \")" + ) + "unknown operator" +) + +(module + (data $l " ") +) +(assert_malformed + (module quote + 
"(data $l\" \")" + ) + "unknown operator" +) + +(module + (data "a" "b") +) +(assert_malformed + (module quote + "(data \"a\"\"b\")" + ) + "unknown operator" +) + +(module + (data "a" " b") +) +(assert_malformed + (module quote + "(data \"a\"\" b\")" + ) + "unknown operator" +) + +(module + (data "a " "b") +) +(assert_malformed + (module quote + "(data \"a \"\"b\")" + ) + "unknown operator" +) + +(module + (data "" "") +) +(assert_malformed + (module quote + "(data \"\"\"\")" + ) + "unknown operator" +) + +(module + (data "" " ") +) +(assert_malformed + (module quote + "(data \"\"\" \")" + ) + "unknown operator" +) + +(module + (data " " "") +) +(assert_malformed + (module quote + "(data \" \"\"\")" + ) + "unknown operator" +) + + +(assert_malformed + (module quote + "(func \"a\"x)" + ) + "unknown operator" +) +(assert_malformed + (module quote + "(func \"a\"0)" + ) + "unknown operator" +) +(assert_malformed + (module quote + "(func 0\"a\")" + ) + "unknown operator" +) +(assert_malformed + (module quote + "(func \"a\"$x)" + ) + "unknown operator" +)