std.zig.tokenizer: simplify

I pointed a fuzzer at the tokenizer and it crashed immediately. Upon inspection, I was dissatisfied with the implementation. This commit removes several mechanisms: * Removes the "invalid byte" compile error note. * Dramatically simplifies tokenizer recovery by making recovery always occur at newlines, and never otherwise. * Removes UTF-8 validation. * Moves some character validation logic to `std.zig.parseCharLiteral`. Removing UTF-8 validation is a regression of ziglang#663, however, the existing implementation was already buggy. When adding this functionality back, it must be fuzz-tested while checking the property that it matches an independent Unicode validation implementation on the same file. While we're at it, fuzzing should check the other properties of that proposal, such as no ASCII control characters existing inside the source code. Other changes included in this commit: * Deprecate `std.unicode.utf8Decode` and its WTF-8 counterpart. This function has an awkward API that is too easy to misuse. * Make `utf8Decode2` and friends use arrays as parameters, eliminating a runtime assertion in favor of using the type system. After this commit, the crash found by fuzzing, which was "\x07\xd5\x80\xc3=o\xda|a\xfc{\x9a\xec\x91\xdf\x0f\\\x1a^\xbe;\x8c\xbf\xee\xea" no longer causes a crash. However, I did not feel the need to add this test case because the simplified logic eradicates most crashes of this nature.
igor84 · Aug 11, 2024 · a053f01 · a053f01
1 parent 659056a
commit a053f01
Show file tree

Hide file tree

Showing 12 changed files with 234 additions and 392 deletions.
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
@@ -95,16 +95,13 @@ pub inline fn utf8EncodeComptime(comptime c: u21) [
 
 const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
 
-/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
-/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
-/// If you already know the length at comptime, you can call one of
-/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
+/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
 pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
     return switch (bytes.len) {
-        1 => @as(u21, bytes[0]),
-        2 => utf8Decode2(bytes),
-        3 => utf8Decode3(bytes),
-        4 => utf8Decode4(bytes),
+        1 => bytes[0],
+        2 => utf8Decode2(bytes[0..2].*),
+        3 => utf8Decode3(bytes[0..3].*),
+        4 => utf8Decode4(bytes[0..4].*),
         else => unreachable,
     };
 }
@@ -113,8 +110,7 @@ const Utf8Decode2Error = error{
     Utf8ExpectedContinuation,
     Utf8OverlongEncoding,
 };
-pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
-    assert(bytes.len == 2);
+pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 {
     assert(bytes[0] & 0b11100000 == 0b11000000);
     var value: u21 = bytes[0] & 0b00011111;
 
@@ -130,7 +126,7 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
 const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{
     Utf8EncodesSurrogateHalf,
 };
-pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
+pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 {
     const value = try utf8Decode3AllowSurrogateHalf(bytes);
 
     if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
@@ -142,8 +138,7 @@ const Utf8Decode3AllowSurrogateHalfError = error{
     Utf8ExpectedContinuation,
     Utf8OverlongEncoding,
 };
-pub fn utf8Decode3AllowSurrogateHalf(bytes: []const u8) Utf8Decode3AllowSurrogateHalfError!u21 {
-    assert(bytes.len == 3);
+pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 {
     assert(bytes[0] & 0b11110000 == 0b11100000);
     var value: u21 = bytes[0] & 0b00001111;
 
@@ -165,8 +160,7 @@ const Utf8Decode4Error = error{
     Utf8OverlongEncoding,
     Utf8CodepointTooLarge,
 };
-pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
-    assert(bytes.len == 4);
+pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 {
     assert(bytes[0] & 0b11111000 == 0b11110000);
     var value: u21 = bytes[0] & 0b00000111;
 
@@ -1637,12 +1631,13 @@ pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 {
 
 const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error;
 
+/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
 pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 {
     return switch (bytes.len) {
-        1 => @as(u21, bytes[0]),
-        2 => utf8Decode2(bytes),
-        3 => utf8Decode3AllowSurrogateHalf(bytes),
-        4 => utf8Decode4(bytes),
+        1 => bytes[0],
+        2 => utf8Decode2(bytes[0..2].*),
+        3 => utf8Decode3AllowSurrogateHalf(bytes[0..3].*),
+        4 => utf8Decode4(bytes[0..4].*),
         else => unreachable,
     };
 }

diff --git a/lib/std/zig/Ast.zig b/lib/std/zig/Ast.zig
@@ -69,7 +69,7 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
         const token = tokenizer.next();
         try tokens.append(gpa, .{
             .tag = token.tag,
-            .start = @as(u32, @intCast(token.loc.start)),
+            .start = @intCast(token.loc.start),
         });
         if (token.tag == .eof) break;
     }

diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig
@@ -11351,6 +11351,9 @@ fn failWithStrLitError(astgen: *AstGen, err: std.zig.string_literal.Error, token
                 .{raw_string[bad_index]},
             );
         },
+        .empty_char_literal => {
+            return astgen.failOff(token, offset, "empty character literal", .{});
+        },
     }
 }
 
@@ -13820,21 +13823,9 @@ fn lowerAstErrors(astgen: *AstGen) !void {
     var msg: std.ArrayListUnmanaged(u8) = .{};
     defer msg.deinit(gpa);
 
-    const token_starts = tree.tokens.items(.start);
-    const token_tags = tree.tokens.items(.tag);
-
     var notes: std.ArrayListUnmanaged(u32) = .{};
     defer notes.deinit(gpa);
 
-    const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
-    if (token_tags[tok] == .invalid) {
-        const bad_off: u32 = @intCast(tree.tokenSlice(tok).len);
-        const byte_abs = token_starts[tok] + bad_off;
-        try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{
-            std.zig.fmtEscapes(tree.source[byte_abs..][0..1]),
-        }));
-    }
-
     for (tree.errors[1..]) |note| {
         if (!note.is_note) break;
 

diff --git a/lib/std/zig/parser_test.zig b/lib/std/zig/parser_test.zig
@@ -6061,7 +6061,6 @@ test "recovery: invalid container members" {
     , &[_]Error{
         .expected_expr,
         .expected_comma_after_field,
-        .expected_type_expr,
         .expected_semi_after_stmt,
     });
 }

diff --git a/lib/std/zig/string_literal.zig b/lib/std/zig/string_literal.zig
@@ -1,6 +1,5 @@
 const std = @import("../std.zig");
 const assert = std.debug.assert;
-const utf8Decode = std.unicode.utf8Decode;
 const utf8Encode = std.unicode.utf8Encode;
 
 pub const ParseError = error{
@@ -37,12 +36,16 @@ pub const Error = union(enum) {
     expected_single_quote: usize,
     /// The character at this index cannot be represented without an escape sequence.
     invalid_character: usize,
+    /// `''`. Not returned for string literals.
+    empty_char_literal,
 };
 
-/// Only validates escape sequence characters.
-/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
+/// Asserts the slice starts and ends with single-quotes.
+/// Returns an error if there is not exactly one UTF-8 codepoint in between.
 pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
-    assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
+    if (slice.len < 3) return .{ .failure = .empty_char_literal };
+    assert(slice[0] == '\'');
+    assert(slice[slice.len - 1] == '\'');
 
     switch (slice[1]) {
         '\\' => {
@@ -55,7 +58,18 @@ pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
         },
         0 => return .{ .failure = .{ .invalid_character = 1 } },
         else => {
-            const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
+            const inner = slice[1 .. slice.len - 1];
+            const n = std.unicode.utf8ByteSequenceLength(inner[0]) catch return .{
+                .failure = .{ .invalid_unicode_codepoint = 1 },
+            };
+            if (inner.len > n) return .{ .failure = .{ .expected_single_quote = 1 + n } };
+            const codepoint = switch (n) {
+                1 => inner[0],
+                2 => std.unicode.utf8Decode2(inner[0..2].*),
+                3 => std.unicode.utf8Decode3(inner[0..3].*),
+                4 => std.unicode.utf8Decode4(inner[0..4].*),
+                else => unreachable,
+            } catch return .{ .failure = .{ .invalid_unicode_codepoint = 1 } };
             return .{ .success = codepoint };
         },
     }