Lazily compile the zig rc subcommand and use it during zig build-exe, etc #19174

Merged
7 commits merged on Mar 12, 2024
File renamed without changes.
File renamed without changes.
File renamed without changes.
142 changes: 105 additions & 37 deletions src/resinator/cli.zig → lib/compiler/resinator/cli.zig

Large diffs are not rendered by default.

@@ -279,6 +279,9 @@ pub const CodePage = enum(u16) {
pub const Utf8 = struct {
/// Implements decoding with rejection of ill-formed UTF-8 sequences based on section
/// D92 of Chapter 3 of the Unicode standard (Table 3-7 specifically).
///
/// Note: This does not match "U+FFFD Substitution of Maximal Subparts", but instead
/// matches the behavior of the Windows RC compiler.
pub const WellFormedDecoder = struct {
/// Like std.unicode.utf8ByteSequenceLength, but:
/// - Rejects non-well-formed first bytes, i.e. C0-C1, F5-FF
@@ -347,9 +350,6 @@ pub const Utf8 = struct {
// Only include the byte in the invalid sequence if it's in the range
// of a continuation byte. All other values should not be included in the
// invalid sequence.
//
// Note: This is how the Windows RC compiler handles this, this may not
// be the correct-as-according-to-the-Unicode-standard way to do it.
if (isContinuationByte(byte)) len += 1;
return .{ .value = Codepoint.invalid, .byte_len = len };
}
@@ -437,6 +437,19 @@ test "codepointAt invalid utf8" {
}, CodePage.utf8.codepointAt(1, invalid_utf8).?);
try std.testing.expectEqual(@as(?Codepoint, null), CodePage.windows1252.codepointAt(2, invalid_utf8));
}

{
// encoded high surrogate
const invalid_utf8 = "\xED\xA0\xBD";
try std.testing.expectEqual(Codepoint{
.value = Codepoint.invalid,
.byte_len = 2,
}, CodePage.utf8.codepointAt(0, invalid_utf8).?);
try std.testing.expectEqual(Codepoint{
.value = Codepoint.invalid,
.byte_len = 1,
}, CodePage.utf8.codepointAt(2, invalid_utf8).?);
}
}

test "codepointAt utf8 encoded" {
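The decoding policy described in the doc comments above can be shown with a small standalone Zig sketch. invalidLenAfterBadSecondByte is a hypothetical helper written for this illustration (it is not resinator's API) and it only covers the leading-0xED high-surrogate case exercised by the new test: a rejected byte is counted as part of the invalid sequence only when it is still in the continuation-byte range.

const std = @import("std");

// Hypothetical sketch of the rule described above, covering only a 3-byte
// sequence that starts with 0xED (the high-surrogate case from the test).
fn invalidLenAfterBadSecondByte(first: u8, second: u8) usize {
    std.debug.assert(first == 0xED);
    // For a leading 0xED, well-formed UTF-8 restricts the second byte to 0x80-0x9F;
    // 0xA0-0xBF would encode a surrogate, so it is rejected here.
    std.debug.assert(!(second >= 0x80 and second <= 0x9F));
    // The rejected byte is included in the invalid sequence only if it is still a
    // continuation byte (0x80-0xBF); anything else leaves the sequence at 1 byte.
    return if (second >= 0x80 and second <= 0xBF) 2 else 1;
}

test "ED A0 reports a 2-byte invalid sequence, ED 41 only 1" {
    try std.testing.expectEqual(@as(usize, 2), invalidLenAfterBadSecondByte(0xED, 0xA0));
    try std.testing.expectEqual(@as(usize, 1), invalidLenAfterBadSecondByte(0xED, 'A'));
}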
48 changes: 33 additions & 15 deletions src/resinator/comments.zig → lib/compiler/resinator/comments.zig
@@ -22,7 +22,7 @@ const formsLineEndingPair = @import("source_mapping.zig").formsLineEndingPair;

/// `buf` must be at least as long as `source`
/// In-place transformation is supported (i.e. `source` and `buf` can be the same slice)
pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMappings) []u8 {
pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMappings) ![]u8 {
std.debug.assert(buf.len >= source.len);
var result = UncheckedSliceWriter{ .slice = buf };
const State = enum {
@@ -85,7 +85,7 @@ pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMa
else => {},
},
.multiline_comment => switch (c) {
'\r' => handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings),
'\r' => try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings),
'\n' => {
_ = line_handler.incrementLineNumber(index);
result.write(c);
@@ -95,7 +95,7 @@ pub fn removeComments(source: []const u8, buf: []u8, source_mappings: ?*SourceMa
},
.multiline_comment_end => switch (c) {
'\r' => {
handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings);
try handleMultilineCarriageReturn(source, &line_handler, index, &result, source_mappings);
// We only want to treat this as a newline if it's part of a CRLF pair. If it's
// not, then we still want to stay in .multiline_comment_end, so that e.g. `*<\r>/` still
// functions as a `*/` comment ending. Kinda crazy, but that's how the Win32 implementation works.
@@ -184,13 +184,21 @@ inline fn handleMultilineCarriageReturn(
index: usize,
result: *UncheckedSliceWriter,
source_mappings: ?*SourceMappings,
) void {
) !void {
// This is a dumb way to go about this, but basically we want to determine
// if this is part of a distinct CRLF or LFCR pair. This function call will detect
// LFCR pairs correctly since the function we're in will only be called on CR,
// but will not detect CRLF pairs since it only looks at the line ending before the
// CR. So, we do a second (forward) check if the first fails to detect CRLF that is
// not part of another pair.
const is_lfcr_pair = line_handler.currentIndexFormsLineEndingPair(index);
const is_crlf_pair = !is_lfcr_pair and formsLineEndingPair(source, '\r', index + 1);
// Note: Bare \r within a multiline comment should *not* be treated as a line ending for the
// purposes of removing comments, but *should* be treated as a line ending for the
// purposes of line counting/source mapping
_ = line_handler.incrementLineNumber(index);
// So only write the \r if it's part of a CRLF pair
if (formsLineEndingPair(source, '\r', index + 1)) {
// So only write the \r if it's part of a CRLF/LFCR pair
if (is_lfcr_pair or is_crlf_pair) {
result.write('\r');
}
// And otherwise, we want to collapse the source mapping so that we can still know which
@@ -200,15 +208,15 @@
// the next collapse acts on the first of the collapsed line numbers
line_handler.line_number -= 1;
if (source_mappings) |mappings| {
mappings.collapse(line_handler.line_number, 1);
try mappings.collapse(line_handler.line_number, 1);
}
}
}

pub fn removeCommentsAlloc(allocator: Allocator, source: []const u8, source_mappings: ?*SourceMappings) ![]u8 {
const buf = try allocator.alloc(u8, source.len);
errdefer allocator.free(buf);
const result = removeComments(source, buf, source_mappings);
const result = try removeComments(source, buf, source_mappings);
return allocator.realloc(buf, result.len);
}

@@ -252,6 +260,16 @@ test "line comments retain newlines" {
try testRemoveComments("\r\n", "//comment\r\n");
}

test "unfinished multiline comment" {
try testRemoveComments(
\\unfinished
\\
,
\\unfinished/*
\\
);
}

test "crazy" {
try testRemoveComments(
\\blah"/*som*/\""BLAH
@@ -321,20 +339,20 @@ test "remove comments with mappings" {
var mut_source = "blah/*\rcommented line*\r/blah".*;
var mappings = SourceMappings{};
_ = try mappings.files.put(allocator, "test.rc");
try mappings.set(allocator, 1, .{ .start_line = 1, .end_line = 1, .filename_offset = 0 });
try mappings.set(allocator, 2, .{ .start_line = 2, .end_line = 2, .filename_offset = 0 });
try mappings.set(allocator, 3, .{ .start_line = 3, .end_line = 3, .filename_offset = 0 });
try mappings.set(1, 1, 0);
try mappings.set(2, 2, 0);
try mappings.set(3, 3, 0);
defer mappings.deinit(allocator);

const result = removeComments(&mut_source, &mut_source, &mappings);
const result = try removeComments(&mut_source, &mut_source, &mappings);

try std.testing.expectEqualStrings("blahblah", result);
try std.testing.expectEqual(@as(usize, 1), mappings.mapping.items.len);
try std.testing.expectEqual(@as(usize, 3), mappings.mapping.items[0].end_line);
try std.testing.expectEqual(@as(usize, 1), mappings.end_line);
try std.testing.expectEqual(@as(usize, 3), mappings.getCorrespondingSpan(1).?.end_line);
}

test "in place" {
var mut_source = "blah /* comment */ blah".*;
const result = removeComments(&mut_source, &mut_source, null);
const result = try removeComments(&mut_source, &mut_source, null);
try std.testing.expectEqualStrings("blah blah", result);
}
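The CRLF/LFCR handling that handleMultilineCarriageReturn explains above reduces to a backward check for an LFCR pair followed by a forward check for a CRLF pair. A simplified standalone sketch of that idea follows; crIsPartOfPair is a made-up helper for illustration and ignores the bookkeeping the real formsLineEndingPair/currentIndexFormsLineEndingPair calls do to avoid counting a newline that already belongs to another pair.

const std = @import("std");

// Simplified sketch: a '\r' is only preserved if it is half of a distinct
// LFCR pair (look backward) or CRLF pair (look forward).
fn crIsPartOfPair(source: []const u8, cr_index: usize) bool {
    std.debug.assert(source[cr_index] == '\r');
    const is_lfcr = cr_index > 0 and source[cr_index - 1] == '\n';
    const is_crlf = !is_lfcr and cr_index + 1 < source.len and source[cr_index + 1] == '\n';
    return is_lfcr or is_crlf;
}

test "bare CR inside a multiline comment is not a pair" {
    try std.testing.expect(!crIsPartOfPair("a*\rb", 2));
    try std.testing.expect(crIsPartOfPair("a\r\nb", 1));
    try std.testing.expect(crIsPartOfPair("a\n\rb", 2));
}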
107 changes: 78 additions & 29 deletions src/resinator/compile.zig → lib/compiler/resinator/compile.zig
@@ -321,10 +321,7 @@ pub const Compiler = struct {

return buf.toOwnedSlice();
},
else => {
std.debug.print("unexpected filename token type: {}\n", .{literal_node.token});
unreachable; // no other token types should be in a filename literal node
},
else => unreachable, // no other token types should be in a filename literal node
}
},
.binary_expression => {
@@ -404,6 +401,72 @@
return first_error orelse error.FileNotFound;
}

pub fn parseDlgIncludeString(self: *Compiler, token: Token) ![]u8 {
// For the purposes of parsing, we want to strip the L prefix
// if it exists since we want escaped integers to be limited to
// their ascii string range.
//
// We keep track of whether or not there was an L prefix, though,
// since there's more weirdness to come.
var bytes = self.sourceBytesForToken(token);
var was_wide_string = false;
if (bytes.slice[0] == 'L' or bytes.slice[0] == 'l') {
was_wide_string = true;
bytes.slice = bytes.slice[1..];
}

var buf = try std.ArrayList(u8).initCapacity(self.allocator, bytes.slice.len);
errdefer buf.deinit();

var iterative_parser = literals.IterativeStringParser.init(bytes, .{
.start_column = token.calculateColumn(self.source, 8, null),
.diagnostics = .{ .diagnostics = self.diagnostics, .token = token },
});

// No real idea what's going on here, but this matches the rc.exe behavior
while (try iterative_parser.next()) |parsed| {
const c = parsed.codepoint;
switch (was_wide_string) {
true => {
switch (c) {
0...0x7F, 0xA0...0xFF => try buf.append(@intCast(c)),
0x80...0x9F => {
if (windows1252.bestFitFromCodepoint(c)) |_| {
try buf.append(@intCast(c));
} else {
try buf.append('?');
}
},
else => {
if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
try buf.append(best_fit);
} else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
try buf.append('?');
} else {
try buf.appendSlice("??");
}
},
}
},
false => {
if (parsed.from_escaped_integer) {
try buf.append(@truncate(c));
} else {
if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
try buf.append(best_fit);
} else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
try buf.append('?');
} else {
try buf.appendSlice("??");
}
}
},
}
}

return buf.toOwnedSlice();
}

pub fn writeResourceExternal(self: *Compiler, node: *Node.ResourceExternal, writer: anytype) !void {
// Init header with data size zero for now, will need to fill it in later
var header = try self.resourceHeader(node.id, node.type, .{});
@@ -414,13 +477,16 @@ pub const Compiler = struct {
// DLGINCLUDE has special handling that doesn't actually need the file to exist
if (maybe_predefined_type != null and maybe_predefined_type.? == .DLGINCLUDE) {
const filename_token = node.filename.cast(.literal).?.token;
const parsed_filename = try self.parseQuotedStringAsAsciiString(filename_token);
const parsed_filename = try self.parseDlgIncludeString(filename_token);
defer self.allocator.free(parsed_filename);

// NUL within the parsed string acts as a terminator
const parsed_filename_terminated = std.mem.sliceTo(parsed_filename, 0);

header.applyMemoryFlags(node.common_resource_attributes, self.source);
header.data_size = @intCast(parsed_filename.len + 1);
header.data_size = @intCast(parsed_filename_terminated.len + 1);
try header.write(writer, .{ .diagnostics = self.diagnostics, .token = node.id });
try writer.writeAll(parsed_filename);
try writer.writeAll(parsed_filename_terminated);
try writer.writeByte(0);
try writeDataPadding(writer, header.data_size);
return;
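A quick standalone illustration of the termination rule noted above, using a made-up filename: everything from the first NUL onward is dropped, and the reported data size is the truncated length plus one for the terminator that gets written back out.

const std = @import("std");

test "NUL truncates a DLGINCLUDE filename" {
    const parsed = "include.h\x00garbage";
    // Slice up to (not including) the first NUL.
    const terminated = std.mem.sliceTo(parsed, 0);
    try std.testing.expectEqualStrings("include.h", terminated);
    // The data size counts the NUL terminator that is written after the string.
    const data_size: u32 = @intCast(terminated.len + 1);
    try std.testing.expectEqual(@as(u32, 10), data_size);
}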
@@ -1141,21 +1207,15 @@ pub const Compiler = struct {
errdefer self.allocator.free(parsed_string);
return .{ .wide_string = parsed_string };
},
else => {
std.debug.print("unexpected token in literal node: {}\n", .{literal_node.token});
unreachable; // no other token types should be in a data literal node
},
else => unreachable, // no other token types should be in a data literal node
}
},
.binary_expression, .grouped_expression => {
const result = evaluateNumberExpression(expression_node, self.source, self.input_code_pages);
return .{ .number = result };
},
.not_expression => unreachable,
else => {
std.debug.print("{}\n", .{expression_node.id});
@panic("TODO: evaluateDataExpression");
},
else => unreachable,
}
}

@@ -1669,6 +1729,7 @@ pub const Compiler = struct {
};
}

// We know the data_buffer len is limited to u32 max.
const data_size: u32 = @intCast(data_buffer.items.len);
var header = try self.resourceHeader(node.id, node.type, .{
.data_size = data_size,
@@ -1966,6 +2027,7 @@ pub const Compiler = struct {
try data_writer.writeInt(u16, 1, .little);
try data_writer.writeInt(u16, button_width.asWord(), .little);
try data_writer.writeInt(u16, button_height.asWord(), .little);
// Number of buttons is guaranteed by the parser to be within maxInt(u16).
try data_writer.writeInt(u16, @as(u16, @intCast(node.buttons.len)), .little);

for (node.buttons) |button_or_sep| {
@@ -2806,19 +2868,6 @@ pub const Compiler = struct {
);
}

/// Helper that calls parseQuotedStringAsAsciiString with the relevant context
/// Resulting slice is allocated by `self.allocator`.
pub fn parseQuotedStringAsAsciiString(self: *Compiler, token: Token) ![]u8 {
return literals.parseQuotedStringAsAsciiString(
self.allocator,
self.sourceBytesForToken(token),
.{
.start_column = token.calculateColumn(self.source, 8, null),
.diagnostics = .{ .diagnostics = self.diagnostics, .token = token },
},
);
}

fn addErrorDetails(self: *Compiler, details: ErrorDetails) Allocator.Error!void {
try self.diagnostics.append(details);
}
@@ -3356,7 +3405,7 @@ test "StringTable" {
}
break :ids buf;
};
var prng = std.Random.DefaultPrng.init(0);
var prng = std.rand.DefaultPrng.init(0);
var random = prng.random();
random.shuffle(u16, &ids);

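For reference, the fallback ladder used for wide strings in parseDlgIncludeString above (best-fit map to Windows-1252 where possible, '?' for unmappable BMP codepoints, "??" for codepoints outside the BMP) can be sketched standalone. bestFitFromCodepoint below is a tiny hypothetical stand-in for resinator's windows1252.bestFitFromCodepoint, and the special 0x80-0x9F handling from the real code is omitted.

const std = @import("std");

// Hypothetical stand-in: map a few codepoints to a visually close
// Windows-1252 byte, everything else is unmapped.
fn bestFitFromCodepoint(c: u21) ?u8 {
    return switch (c) {
        0x2018, 0x2019 => '\'', // curly single quotes
        0x201C, 0x201D => '"', // curly double quotes
        else => null,
    };
}

// Simplified sketch of the wide-string fallback ladder described above.
fn appendBestFit(buf: *std.ArrayList(u8), c: u21) !void {
    switch (c) {
        0x00...0x7F, 0xA0...0xFF => try buf.append(@intCast(c)),
        else => {
            if (bestFitFromCodepoint(c)) |best_fit| {
                try buf.append(best_fit);
            } else if (c < 0x10000) {
                try buf.append('?');
            } else {
                try buf.appendSlice("??");
            }
        },
    }
}

test "best-fit fallback ladder" {
    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    try appendBestFit(&buf, 'A'); // ASCII passes through
    try appendBestFit(&buf, 0x2019); // right single quote best-fits to '
    try appendBestFit(&buf, 0x4E2D); // BMP codepoint with no best fit -> ?
    try appendBestFit(&buf, 0x1F600); // outside the BMP -> ??
    try std.testing.expectEqualStrings("A'???", buf.items);
}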