From 6420f4107e856bc0d2e913c216d286f5eb80eb4c Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Fri, 13 Sep 2024 14:26:42 -0700 Subject: [PATCH 1/2] Require string-style identifiers to be UTF-8 In the WebAssembly text format, strings can generally be arbitrary bytes, but identifiers must be valid UTF-8. Check for UTF-8 validity when parsing string-style identifiers in the lexer. Update StringLowering to generate valid UTF-8 global names even for strings that may not be valid UTF-8 and test that text round tripping works correctly after StringLowering. Fixes #6937. --- src/parser/lexer.cpp | 10 ++++++ src/passes/StringLowering.cpp | 7 ++-- test/lit/parse-bad-identifier.wast | 8 +++++ test/lit/passes/string-gathering.wast | 36 +++++++++---------- test/lit/passes/string-lowering-imports.wast | 22 ++++++------ .../passes/string-lowering-instructions.wast | 8 ++--- 6 files changed, 57 insertions(+), 34 deletions(-) create mode 100644 test/lit/parse-bad-identifier.wast diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp index bb6428e875f..44aecdc2bc1 100644 --- a/src/parser/lexer.cpp +++ b/src/parser/lexer.cpp @@ -280,6 +280,13 @@ struct LexStrResult : LexResult { // Allocate a string only if there are escape sequences, otherwise just use // the original string_view. std::optional str; + + std::string_view getStr() { + if (str) { + return *str; + } + return span; + } }; struct LexStrCtx : LexCtx { @@ -860,6 +867,9 @@ std::optional ident(std::string_view in) { return {}; } if (auto s = str(ctx.next())) { + if (!String::isUTF8(s->getStr())) { + return {}; + } ctx.isStr = true; ctx.str = s->str; ctx.take(*s); diff --git a/src/passes/StringLowering.cpp b/src/passes/StringLowering.cpp index 349ba8cd00f..081db606876 100644 --- a/src/passes/StringLowering.cpp +++ b/src/passes/StringLowering.cpp @@ -153,9 +153,12 @@ struct StringGathering : public Pass { [[maybe_unused]] bool valid = String::convertWTF16ToWTF8(wtf8, string.str); assert(valid); - // TODO: Use wtf8.view() once we have C++20. + // Then escape it because identifiers must be valid UTF-8. + // TODO: Use wtf8.view() and escaped.view() once we have C++20. + std::stringstream escaped; + String::printEscaped(escaped, wtf8.str()); auto name = Names::getValidGlobalName( - *module, std::string("string.const_") + std::string(wtf8.str())); + *module, std::string("string.const_") + std::string(escaped.str())); globalName = name; newNames.insert(name); auto* stringConst = builder.makeStringConst(string); diff --git a/test/lit/parse-bad-identifier.wast b/test/lit/parse-bad-identifier.wast new file mode 100644 index 00000000000..984e7e2300e --- /dev/null +++ b/test/lit/parse-bad-identifier.wast @@ -0,0 +1,8 @@ +;; RUN: not wasm-opt %s -S -o - 2>&1 | filecheck %s + +;; CHECK: 7:10: error: expected valtype + +(module + ;; String identifiers must still be UTF-8. + (global $"\ff" i32 (i32.const 0)) +) diff --git a/test/lit/passes/string-gathering.wast b/test/lit/passes/string-gathering.wast index 524c6a1f0c1..a6243c2386b 100644 --- a/test/lit/passes/string-gathering.wast +++ b/test/lit/passes/string-gathering.wast @@ -17,14 +17,14 @@ ;; CHECK: (type $0 (func)) - ;; CHECK: (global $string.const_bar (ref string) (string.const "bar")) + ;; CHECK: (global $"string.const_\"bar\"" (ref string) (string.const "bar")) - ;; CHECK: (global $string.const_other (ref string) (string.const "other")) + ;; CHECK: (global $"string.const_\"other\"" (ref string) (string.const "other")) ;; CHECK: (global $global (ref string) (string.const "foo")) (global $global (ref string) (string.const "foo")) - ;; CHECK: (global $global2 stringref (global.get $string.const_bar)) + ;; CHECK: (global $global2 stringref (global.get $"string.const_\"bar\"")) ;; LOWER: (type $0 (array (mut i16))) ;; LOWER: (type $1 (func)) @@ -45,9 +45,9 @@ ;; LOWER: (type $9 (func (param externref i32 i32) (result (ref extern)))) - ;; LOWER: (import "string.const" "0" (global $string.const_bar (ref extern))) + ;; LOWER: (import "string.const" "0" (global $"string.const_\"bar\"" (ref extern))) - ;; LOWER: (import "string.const" "1" (global $string.const_other (ref extern))) + ;; LOWER: (import "string.const" "1" (global $"string.const_\"other\"" (ref extern))) ;; LOWER: (import "string.const" "2" (global $global (ref extern))) @@ -69,12 +69,12 @@ ;; LOWER: (import "wasm:js-string" "substring" (func $substring (type $9) (param externref i32 i32) (result (ref extern)))) - ;; LOWER: (global $global2 externref (global.get $string.const_bar)) + ;; LOWER: (global $global2 externref (global.get $"string.const_\"bar\"")) (global $global2 (ref null string) (string.const "bar")) ;; CHECK: (func $a (type $0) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $string.const_bar) + ;; CHECK-NEXT: (global.get $"string.const_\"bar\"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop ;; CHECK-NEXT: (global.get $global) @@ -82,7 +82,7 @@ ;; CHECK-NEXT: ) ;; LOWER: (func $a (type $1) ;; LOWER-NEXT: (drop - ;; LOWER-NEXT: (global.get $string.const_bar) + ;; LOWER-NEXT: (global.get $"string.const_\"bar\"") ;; LOWER-NEXT: ) ;; LOWER-NEXT: (drop ;; LOWER-NEXT: (global.get $global) @@ -99,10 +99,10 @@ ;; CHECK: (func $b (type $0) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $string.const_bar) + ;; CHECK-NEXT: (global.get $"string.const_\"bar\"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $string.const_other) + ;; CHECK-NEXT: (global.get $"string.const_\"other\"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop ;; CHECK-NEXT: (global.get $global) @@ -113,10 +113,10 @@ ;; CHECK-NEXT: ) ;; LOWER: (func $b (type $1) ;; LOWER-NEXT: (drop - ;; LOWER-NEXT: (global.get $string.const_bar) + ;; LOWER-NEXT: (global.get $"string.const_\"bar\"") ;; LOWER-NEXT: ) ;; LOWER-NEXT: (drop - ;; LOWER-NEXT: (global.get $string.const_other) + ;; LOWER-NEXT: (global.get $"string.const_\"other\"") ;; LOWER-NEXT: ) ;; LOWER-NEXT: (drop ;; LOWER-NEXT: (global.get $global) @@ -217,9 +217,9 @@ (module ;; CHECK: (type $0 (func)) - ;; CHECK: (global $string.const_foo (ref string) (string.const "foo")) + ;; CHECK: (global $"string.const_\"foo\"" (ref string) (string.const "foo")) - ;; CHECK: (global $global (mut (ref string)) (global.get $string.const_foo)) + ;; CHECK: (global $global (mut (ref string)) (global.get $"string.const_\"foo\"")) ;; LOWER: (type $0 (array (mut i16))) ;; LOWER: (type $1 (func (param externref externref) (result i32))) @@ -240,7 +240,7 @@ ;; LOWER: (type $9 (func (param externref i32 i32) (result (ref extern)))) - ;; LOWER: (import "string.const" "0" (global $string.const_foo (ref extern))) + ;; LOWER: (import "string.const" "0" (global $"string.const_\"foo\"" (ref extern))) ;; LOWER: (import "wasm:js-string" "fromCharCodeArray" (func $fromCharCodeArray (type $3) (param (ref null $0) i32 i32) (result (ref extern)))) @@ -260,17 +260,17 @@ ;; LOWER: (import "wasm:js-string" "substring" (func $substring (type $9) (param externref i32 i32) (result (ref extern)))) - ;; LOWER: (global $global (mut (ref extern)) (global.get $string.const_foo)) + ;; LOWER: (global $global (mut (ref extern)) (global.get $"string.const_\"foo\"")) (global $global (mut (ref string)) (string.const "foo")) ;; CHECK: (func $a (type $0) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $string.const_foo) + ;; CHECK-NEXT: (global.get $"string.const_\"foo\"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) ;; LOWER: (func $a (type $2) ;; LOWER-NEXT: (drop - ;; LOWER-NEXT: (global.get $string.const_foo) + ;; LOWER-NEXT: (global.get $"string.const_\"foo\"") ;; LOWER-NEXT: ) ;; LOWER-NEXT: ) (func $a diff --git a/test/lit/passes/string-lowering-imports.wast b/test/lit/passes/string-lowering-imports.wast index 6a908139e04..597fe293f8b 100644 --- a/test/lit/passes/string-lowering-imports.wast +++ b/test/lit/passes/string-lowering-imports.wast @@ -1,38 +1,40 @@ ;; NOTE: Assertions have been generated by update_lit_checks.py --all-items and should not be edited. ;; RUN: wasm-opt %s -all --string-lowering-magic-imports --remove-unused-module-elements -S -o - | filecheck %s +;; RUN: wasm-opt %s -all --string-lowering-magic-imports --remove-unused-module-elements -S -o - | wasm-opt -all -S -o - | filecheck %s ;; RUN: wasm-opt %s -all --string-lowering-magic-imports --remove-unused-module-elements --roundtrip -S -o - | filecheck %s --check-prefix=RTRIP + (module ;; CHECK: (type $0 (func)) - ;; CHECK: (import "\'" "bar" (global $string.const_bar (ref extern))) + ;; CHECK: (import "\'" "bar" (global $"string.const_\"bar\"" (ref extern))) - ;; CHECK: (import "\'" "foo" (global $string.const_foo (ref extern))) + ;; CHECK: (import "\'" "foo" (global $"string.const_\"foo\"" (ref extern))) - ;; CHECK: (import "\'" "needs\tescaping\00.\'#%- .\r\n\\08\0c\n\r\t.\ea\99\ae" (global $"string.const_needs\tescaping\00.\'#%- .\r\n\\08\0c\n\r\t.\ea\99\ae" (ref extern))) + ;; CHECK: (import "\'" "needs\tescaping\00.\'#%- .\r\n\\08\0c\n\r\t.\ea\99\ae" (global $"string.const_\"needs\\tescaping\\00.\\\'#%- .\\r\\n\\\\08\\0c\\n\\r\\t.\\ea\\99\\ae\"" (ref extern))) - ;; CHECK: (import "string.const" "0" (global $"string.const_unpaired high surrogate \ed\a0\80 " (ref extern))) + ;; CHECK: (import "string.const" "0" (global $"string.const_\"unpaired high surrogate \\ed\\a0\\80 \"" (ref extern))) - ;; CHECK: (import "string.const" "1" (global $"string.const_unpaired low surrogate \ed\bd\88 " (ref extern))) + ;; CHECK: (import "string.const" "1" (global $"string.const_\"unpaired low surrogate \\ed\\bd\\88 \"" (ref extern))) ;; CHECK: (export "consts" (func $consts)) ;; CHECK: (func $consts (type $0) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $string.const_foo) + ;; CHECK-NEXT: (global.get $"string.const_\"foo\"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $string.const_bar) + ;; CHECK-NEXT: (global.get $"string.const_\"bar\"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $"string.const_needs\tescaping\00.\'#%- .\r\n\\08\0c\n\r\t.\ea\99\ae") + ;; CHECK-NEXT: (global.get $"string.const_\"needs\\tescaping\\00.\\\'#%- .\\r\\n\\\\08\\0c\\n\\r\\t.\\ea\\99\\ae\"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $"string.const_unpaired high surrogate \ed\a0\80 ") + ;; CHECK-NEXT: (global.get $"string.const_\"unpaired high surrogate \\ed\\a0\\80 \"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (global.get $"string.const_unpaired low surrogate \ed\bd\88 ") + ;; CHECK-NEXT: (global.get $"string.const_\"unpaired low surrogate \\ed\\bd\\88 \"") ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) ;; RTRIP: (type $0 (func)) diff --git a/test/lit/passes/string-lowering-instructions.wast b/test/lit/passes/string-lowering-instructions.wast index d8ba996b2f0..1fd470ff822 100644 --- a/test/lit/passes/string-lowering-instructions.wast +++ b/test/lit/passes/string-lowering-instructions.wast @@ -65,9 +65,9 @@ ;; CHECK: (type $25 (func (param externref i32 i32) (result (ref extern)))) - ;; CHECK: (import "string.const" "0" (global $string.const_exported (ref extern))) + ;; CHECK: (import "string.const" "0" (global $"string.const_\"exported\"" (ref extern))) - ;; CHECK: (import "string.const" "1" (global $string.const_value (ref extern))) + ;; CHECK: (import "string.const" "1" (global $"string.const_\"value\"" (ref extern))) ;; CHECK: (import "colliding" "name" (func $fromCodePoint (type $0))) (import "colliding" "name" (func $fromCodePoint)) @@ -269,7 +269,7 @@ ) ;; CHECK: (func $exported-string-returner (type $16) (result externref) - ;; CHECK-NEXT: (global.get $string.const_exported) + ;; CHECK-NEXT: (global.get $"string.const_\"exported\"") ;; CHECK-NEXT: ) (func $exported-string-returner (export "export.1") (result stringref) ;; We should update the signature of this function even though it is public @@ -368,7 +368,7 @@ ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop ;; CHECK-NEXT: (struct.new $struct-of-string - ;; CHECK-NEXT: (global.get $string.const_value) + ;; CHECK-NEXT: (global.get $"string.const_\"value\"") ;; CHECK-NEXT: (i32.const 10) ;; CHECK-NEXT: (ref.null none) ;; CHECK-NEXT: ) From 2d1547b631c23cee70f54c05cca8f44f1d5924c4 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Mon, 16 Sep 2024 14:36:10 -0700 Subject: [PATCH 2/2] comments --- test/lit/passes/string-lowering-imports.wast | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/lit/passes/string-lowering-imports.wast b/test/lit/passes/string-lowering-imports.wast index 597fe293f8b..1c6d62be72d 100644 --- a/test/lit/passes/string-lowering-imports.wast +++ b/test/lit/passes/string-lowering-imports.wast @@ -1,7 +1,11 @@ ;; NOTE: Assertions have been generated by update_lit_checks.py --all-items and should not be edited. ;; RUN: wasm-opt %s -all --string-lowering-magic-imports --remove-unused-module-elements -S -o - | filecheck %s + +;; Text round tripping ;; RUN: wasm-opt %s -all --string-lowering-magic-imports --remove-unused-module-elements -S -o - | wasm-opt -all -S -o - | filecheck %s + +;; Binary round tripping ;; RUN: wasm-opt %s -all --string-lowering-magic-imports --remove-unused-module-elements --roundtrip -S -o - | filecheck %s --check-prefix=RTRIP