Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[spec] Tweak tokenisation for text format #1499

Merged
merged 2 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions document/core/text/lexical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ The character stream in the source text is divided, from left to right, into a s
(\text{a} ~|~ \dots ~|~ \text{z})~\Tidchar^\ast
\qquad (\mbox{if occurring as a literal terminal in the grammar}) \\
\production{reserved} & \Treserved &::=&
\Tidchar^+ \\
(\Tidchar ~|~ \Tstring)^+ \\
\end{array}

Tokens are formed from the input character stream according to the *longest match* rule.
Expand All @@ -63,9 +63,9 @@ The set of *keyword* tokens is defined implicitly, by all occurrences of a :ref:
Any token that does not fall into any of the other categories is considered *reserved*, and cannot occur in source text.

.. note::
The effect of defining the set of reserved tokens is that all tokens must be separated by either parentheses or :ref:`white space <text-space>`.
For example, :math:`\text{0\$x}` is a single reserved token.
Consequently, it is not recognized as two separate tokens :math:`\text{0}` and :math:`\text{\$x}`, but instead disallowed.
The effect of defining the set of reserved tokens is that all tokens must be separated by either parentheses, :ref:`white space <text-space>`, or :ref:`comments <text-comment>`.
For example, :math:`\text{0\$x}` is a single reserved token, as is :math:`\text{"a""b"}`.
Consequently, they are not recognized as two separate tokens :math:`\text{0}` and :math:`\text{\$x}`, or :math:`\text{"a"}` and :math:`\text{"b"}`, respectively, but instead disallowed.
This property of tokenization is not affected by the fact that the definition of reserved tokens overlaps with other token classes.


Expand Down
10 changes: 6 additions & 4 deletions interpreter/text/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ let symbol =
['+''-''*''/''\\''^''~''=''<''>''!''?''@''#''$''%''&''|'':''`''.''\'']

let space = [' ''\t''\n''\r']
let control = ['\x00'-'\x1f'] # space
let ascii = ['\x00'-'\x7f']
let ascii_no_nl = ['\x00'-'\x09''\x0b'-'\x7f']
let ascii_no_nl = ascii # '\x0a'
let utf8cont = ['\x80'-'\xbf']
let utf8enc =
['\xc2'-'\xdf'] utf8cont
Expand Down Expand Up @@ -103,7 +104,7 @@ let name = idchar+
let id = '$' name

let keyword = ['a'-'z'] (letter | digit | '_' | '.' | ':')+
let reserved = name | ',' | ';' | '[' | ']' | '{' | '}'
let reserved = (idchar | string)+ | ',' | ';' | '[' | ']' | '{' | '}'

let ixx = "i" ("32" | "64")
let fxx = "f" ("32" | "64")
Expand Down Expand Up @@ -705,13 +706,14 @@ rule token = parse
| eof { EOF }

| reserved { unknown lexbuf }
| utf8 { error lexbuf "malformed operator" }
| control { error lexbuf "misplaced control character" }
| utf8enc { error lexbuf "misplaced unicode character" }
| _ { error lexbuf "malformed UTF-8 encoding" }

and comment start = parse
| ";)" { () }
| "(;" { comment (Lexing.lexeme_start_p lexbuf) lexbuf; comment start lexbuf }
| '\n' { Lexing.new_line lexbuf; comment start lexbuf }
| utf8_no_nl { comment start lexbuf }
| eof { error_nest start lexbuf "unclosed comment" }
| utf8 { comment start lexbuf }
| _ { error lexbuf "malformed UTF-8 encoding" }
274 changes: 274 additions & 0 deletions test/core/tokens.wast
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
;; Tokens can be delimited by parentheses
;;
;; Every module in this group is expected to parse: in each one, some token
;; sits directly next to "(" or ")" with no white space in between.

;; keyword directly followed by "("
(module
(func(nop))
)
;; ")" directly followed by a keyword
(module
(func (nop)nop)
)
;; instruction keyword directly followed by "("
(module
(func nop(nop))
)
;; ")" directly followed by "("
(module
(func(nop)(nop))
)
;; identifier directly followed by "("
(module
(func $f(nop))
)
;; number directly followed by "("
(module
(func br 0(nop))
)
;; ")" directly followed by a number
(module
(table 1 funcref)
(func)
(elem (i32.const 0)0)
)
;; ")" directly followed by an identifier
(module
(table 1 funcref)
(func $f)
(elem (i32.const 0)$f)
)
;; ")" directly followed by a string
(module
(memory 1)
(data (i32.const 0)"a")
)
;; string directly followed by "("
(module
(import "spectest" "print"(func))
)


;; Tokens can be delimited by comments
;;
;; Every module in this group is expected to parse: in each one, a line
;; comment starts immediately after a token, with no white space before ";;".

;; comment directly after a keyword
(module
(func;;bla
)
)
;; comment directly after ")"
(module
(func (nop);;bla
)
)
;; comment directly after an instruction keyword
(module
(func nop;;bla
)
)
;; comment directly after an identifier
(module
(func $f;;bla
)
)
;; comment directly after a number
(module
(func br 0;;bla
)
)
;; comment directly after a string
(module
(data "a";;bla
)
)


;; Space required between symbols and non-parenthesis tokens
;;
;; Each valid module is followed by the same text with the space between the
;; two br_table operands removed; the fused text must be rejected.

;; "0 $l" is accepted; "0$l" is expected to fail with "unknown operator"
(module
(func (block $l (i32.const 0) (br_table 0 $l)))
)
(assert_malformed
(module quote
"(func (block $l (i32.const 0) (br_table 0$l)))"
)
"unknown operator"
)

;; "$l 0" is accepted; "$l0" is expected to fail with "unknown label"
;; (no label named $l0 is declared in that module)
(module
(func (block $l (i32.const 0) (br_table $l 0)))
)
(assert_malformed
(module quote
"(func (block $l (i32.const 0) (br_table $l0)))"
)
"unknown label"
)

;; "$l $l" is accepted; "$l$l" is expected to fail with "unknown label"
;; (no label named $l$l is declared in that module)
(module
(func (block $l (i32.const 0) (br_table $l $l)))
)
(assert_malformed
(module quote
"(func (block $l (i32.const 0) (br_table $l$l)))"
)
"unknown label"
)

;; The same spellings are accepted when a label with that exact name is
;; declared: $l0 and $l$l are each used here as one identifier.
(module
(func (block $l0 (i32.const 0) (br_table $l0)))
)
(module
(func (block $l$l (i32.const 0) (br_table $l$l)))
)


;; Space required between strings and non-parenthesis tokens

(module
(data "a")
)
(assert_malformed
(module quote
"(data\"a\")"
)
"unknown operator"
)

(module
(data $l "a")
)
(assert_malformed
(module quote
"(data $l\"a\")"
)
"unknown operator"
)

(module
(data $l " a")
)
(assert_malformed
(module quote
"(data $l\" a\")"
)
"unknown operator"
)

(module
(data $l "a ")
)
(assert_malformed
(module quote
"(data $l\"a \")"
)
"unknown operator"
)

(module
(data $l "a " "b")
)
(assert_malformed
(module quote
"(data $l\"a \"\"b\")"
)
"unknown operator"
)

(module
(data $l "")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, apparently, GH can't display the Unicode characters used here and below, at least not on my browser.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see two little box things, fwiw. What character is it supposed to be?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I don't know. I just copied it from comments.wast, but my editor only shows boxes as well. :)

)
(assert_malformed
(module quote
"(data $l\"\")"
)
"unknown operator"
)

(module
(data $l " ")
)
(assert_malformed
(module quote
"(data $l\" \")"
)
"unknown operator"
)

(module
(data $l " ")
)
(assert_malformed
(module quote
"(data $l\" \")"
)
"unknown operator"
)

(module
(data "a" "b")
)
(assert_malformed
(module quote
"(data \"a\"\"b\")"
)
"unknown operator"
)

(module
(data "a" " b")
)
(assert_malformed
(module quote
"(data \"a\"\" b\")"
)
"unknown operator"
)

(module
(data "a " "b")
)
(assert_malformed
(module quote
"(data \"a \"\"b\")"
)
"unknown operator"
)

(module
(data "" "")
)
(assert_malformed
(module quote
"(data \"\"\"\")"
)
"unknown operator"
)

(module
(data "" " ")
)
(assert_malformed
(module quote
"(data \"\"\" \")"
)
"unknown operator"
)

(module
(data " " "")
)
(assert_malformed
(module quote
"(data \" \"\"\")"
)
"unknown operator"
)


;; A string written directly next to another non-parenthesis token, with no
;; separator, must be rejected; every case expects "unknown operator".

;; string directly followed by a name
(assert_malformed
(module quote
"(func \"a\"x)"
)
"unknown operator"
)
;; string directly followed by a number
(assert_malformed
(module quote
"(func \"a\"0)"
)
"unknown operator"
)
;; number directly followed by a string
(assert_malformed
(module quote
"(func 0\"a\")"
)
"unknown operator"
)
;; string directly followed by an identifier
(assert_malformed
(module quote
"(func \"a\"$x)"
)
"unknown operator"
)