Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reader: Fix multi-line string literal escaping bug #1634

Merged
merged 1 commit into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sources/dfmc/reader/lexer-transitions.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ define constant $initial-state :: <state>
state(#"3string", #f, // seen """
#('"' . #"close-double-quote"),
#('\\' . #"3string-escape"),
#(" !#-[]-~\r\n" . #"3string"),
#(" !#-[]-~\r\n" . #"3string"), // Ranges #-[ and ]-~ exclude backslash
pair($ascii-8-bit-extensions, #"3string")),
state(#"3string-escape", #f,
#("\\'\"abefnrt0" . #"3string"),
Expand Down
294 changes: 151 additions & 143 deletions sources/dfmc/reader/lexer.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -837,8 +837,9 @@ define method hex-escape-character
(source-location :: <lexer-source-location>, start :: <integer>)
=> (char :: <character>, end-pos :: <integer>)
let (code, epos)
= parse-integer(source-location, radix: 16, start: start,
stop-at-non-digit?: #t);
= parse-integer(source-location,
source-location.source-location-record.contents,
radix: 16, start: start, stop-at-non-digit?: #t);
if (code > $max-lexer-code)
note(<character-code-too-large>,
source-location:
Expand All @@ -853,161 +854,173 @@ define method hex-escape-character
end
end method hex-escape-character;

// Convert a string literal to its internal representation by processing escape
// codes and line endings. Canonicalize CRLF and CR to a single LF. Works for
// both one-line and multi-line strings because the lexer state transitions
// disallow CR and LF in one-line strings in the first place. If escapes? is
// true, process escape codes.
// Convert a string literal to its internal representation by removing the prefix (if
// any), processing escape codes if escapes? is true, and canonicalizing line endings to
// just \n. Works for both one-line and multi-line strings because the lexer state
// transitions disallow CR and LF in one-line strings in the first place. bpos points to
// just after the start delimiter (" or """) and epos points to the first character of
// the end delimiter.
define method decode-string
(source-location :: <lexer-source-location>, bpos :: <integer>,
epos :: <integer>, escapes? :: <boolean>)
=> (string :: <byte-string>, multi-line? :: <boolean>)
let contents = source-location.source-location-record.contents;
let multi-line? = #f;
epos :: <integer>, escapes? :: <boolean>, triple-quoted? :: <boolean>)
=> (string :: <byte-string>)
local
method skip-hex-escape (pos)
// TODO(cgay): signal better error if '>' not found.
if (contents[pos] == as(<integer>, '>'))
pos + 1
method fail (format-string, #rest format-args)
note(<invalid-multi-line-string-literal>,
source-location: source-location,
token-string: extract-string(source-location),
detail: apply(format-to-string,
concatenate("invalid multi-line string literal: ",
format-string),
format-args));
end,
method whitespace-code? (c)
c == $space-code | c == $tab-code
end,
method find-line-break (seq, bpos, epos)
if (bpos < epos)
select (seq[bpos])
$newline-code =>
values(bpos, bpos + 1);
$carriage-return-code =>
if (bpos + 1 < epos & seq[bpos + 1] == $newline-code)
values(bpos, bpos + 2)
else
values(bpos, bpos + 1)
end;
otherwise =>
find-line-break(seq, bpos + 1, epos);
end
end
end,
method remove-prefix (prefix, line)
if (~prefix | empty?(prefix))
line
else
skip-hex-escape(pos + 1)
for (c in line, p in prefix)
if (c ~== p)
fail("each line must begin with the same whitespace prefix, got %=, want %=",
as(<string>, line), as(<string>, prefix))
end;
end;
copy-sequence(line, start: prefix.size)
end
end method,
method loop (pos :: <integer>, len :: <integer>, prev-was-cr? :: <boolean>,
string :: false-or(<string>))
=> (len :: <integer>)
if (pos >= epos)
len
end,
// Can't use hex-escape-character because we don't know the correct offset from the
// beginning of the literal due to using split/join.
method parse-hex-escape (line, start) => (char, epos)
let (code, epos)
= parse-integer(source-location, line,
radix: 16, start: start, stop-at-non-digit?: #t);
assert(epos <= line.size, "epos out of bounds: %d", epos);
assert(line[epos] == as(<integer>, '>'),
"hex escape must end with '>', got %=", line[epos]);
if (code > $max-lexer-code)
note(<character-code-too-large>,
source-location: record-position-as-location
(source-location.source-location-record,
source-location.source-location-source-position),
token-string: extract-string(source-location));
values(0, epos) // If forced, continue with NUL...
else
let code = contents[pos];
select (code)
as(<integer>, '\\') =>
if (~escapes?)
string & (string[len] := '\\');
loop(pos + 1, len + 1, #f, string)
else
let escape-char = as(<character>, contents[pos + 1]);
let new-position
= if (escape-char == '<')
if (string)
let (char, epos)
= hex-escape-character(source-location, pos + 2);
string[len] := char;
epos + 1
else
skip-hex-escape(pos + 2)
end
else
string & (string[len] := escape-character(escape-char));
pos + 2
end;
loop(new-position, len + 1, #f, string);
end if;
as(<integer>, '\r') =>
multi-line? := #t;
string & (string[len] := '\n');
loop(pos + 1, len + 1, #t, string);
as(<integer>, '\n') =>
multi-line? := #t;
let increment = if (prev-was-cr?)
0 // already stored a LF
else
string & (string[len] := '\n');
1
end;
loop(pos + 1, len + increment, #f, string);
otherwise =>
string & (string[len] := as(<character>, code));
loop(pos + 1, len + 1, #f, string);
end select
end if
end method;
let length = loop(bpos, 0, #f, #f);
let string = make(<string>, size: length);
loop(bpos, 0, #f, string);
values(string, multi-line?)
end method decode-string;

// https://opendylan.org/proposals/dep-0012-string-literals.html#the-rectangle-rule
//
// When this is called `string` is known to contain at least one literal newline
// character, the EOL sequence has already been canonicalized to just '\n', escape
// sequences have been processed, and the start/end delimiters have been removed.
define function trim-multi-line-prefix
(string :: <string>, source-location) => (maybe-trimmed :: <string>)
let lines = split(string, '\n');
let junk = first(lines);
let prefix = last(lines);
if (~empty?(junk) & ~whitespace?(junk))
note(<invalid-multi-line-string-literal>,
source-location: source-location,
token-string: extract-string(source-location),
detail:
"only whitespace may follow the start delimiter \"\"\" on the same line");
end;
if (~empty?(prefix) & ~whitespace?(prefix))
note(<invalid-multi-line-string-literal>,
source-location: source-location,
token-string: extract-string(source-location),
detail:
"only whitespace may precede the end delimiter \"\"\" on the same line");
end;
local method remove-prefix (line)
if (line = "")
line
elseif (~starts-with?(line, prefix))
note(<invalid-multi-line-string-literal>,
source-location: source-location,
token-string: extract-string(source-location),
detail:
format-to-string
("each line must begin with the same whitespace that precedes the end"
" delimiter (got %=, want %=)",
copy-sequence(line, end: prefix.size), prefix));
values(code, epos + 1)
end
end,
method process-escapes (line)
let len = line.size;
let new = make(<stretchy-vector>);
iterate loop (pos = 0, escaped? = #f)
if (pos >= len)
as(<byte-vector>, new)
else
let code = line[pos];
if (escaped?)
let new-position
= if (code == as(<integer>, '<'))
let (code, epos) = parse-hex-escape(line, pos + 1);
add!(new, code);
epos
else
add!(new, as(<integer>, escape-character(as(<character>, code))));
pos + 1
end;
loop(new-position, #f)
elseif (code == $escape-code)
loop(pos + 1, #t)
else
copy-sequence(line, start: prefix.size)
add!(new, code);
loop(pos + 1, #f)
end
end method;
select (lines.size)
1 => error("compiler bug while trimming multi-line string prefix");
2 => "";
otherwise =>
let keep = copy-sequence(lines, start: 1, end: lines.size - 1);
let trimmed = map(remove-prefix, keep);
if (every?(empty?, trimmed))
// If all lines are empty the last line needs to be handled specially because of
// the exceptional case of ``abc\n"""`` (where we don't want the final newline)
// vs ``\n\n"""`` (where we do want the final newline).
join(concatenate(trimmed, #("")), "\n")
else
join(trimmed, "\n")
end
end select
end function;
end
end iterate
end,
method process-line (prefix, line)
if (~empty?(line))
if (prefix & ~empty?(prefix))
line := remove-prefix(prefix, line);
end;
if (escapes? & member?($escape-code, line))
line := process-escapes(line);
end;
end;
line
end;
let contents = source-location.source-location-record.contents;
let parts = split(contents, find-line-break, start: bpos, end: epos);
if (parts.size == 1)
as(<string>, process-line(#f, parts[0])) // e.g., """abc"""
else
let prefix = parts.last;
if (~every?(whitespace-code?, prefix))
fail("prefix must be all whitespace, got %=", as(<string>, prefix));
end;
if (~every?(whitespace-code?, parts.first))
fail("only whitespace may follow the open delimiter \"\"\" on the"
" same line, got %=", parts.first);
end;
let parts = map(curry(process-line, prefix), parts);
// Deal with this oddity in our spec:
// """\n
// abc\n => LF excluded, end is before '\n'
// """
// """\n
// \n => LF included, end is after '\n'
// """
as(<string>,
join(copy-sequence(parts,
start: 1,
end: if (empty?(parts[parts.size - 2]))
parts.size
else
parts.size - 1
end),
make(<byte-vector>, size: 1, fill: $newline-code)))
end if
end method decode-string;

// Make a <literal-token> when confronted with the #"foo" syntax.
// These are referred to as "unique strings" in the DRM Lexical Syntax.
//
define method %make-quoted-symbol
(lexer :: <lexer>, source-location :: <lexer-source-location>,
start-offset :: <integer>, end-offset :: <integer>)
start-offset :: <integer>, end-offset :: <integer>, multi-line? :: <boolean>)
=> (res :: <symbol-syntax-symbol-fragment>)
let sym = as(<symbol>,
decode-string(source-location,
source-location.start-posn + start-offset,
source-location.end-posn - end-offset,
#t));
#t, multi-line?));
make(<symbol-syntax-symbol-fragment>,
record: source-location.source-location-record,
source-position: source-location.source-location-source-position,
value: as-fragment-value(sym));
end method;

define constant make-quoted-symbol
= rcurry(%make-quoted-symbol, 2, 1);
= rcurry(%make-quoted-symbol, 2, 1, #f);

define constant make-multi-line-quoted-symbol
= rcurry(%make-quoted-symbol, 4, 3);
= rcurry(%make-quoted-symbol, 4, 3, #t);

// Make a <literal-token> when confronted with the foo: syntax.
//
Expand Down Expand Up @@ -1035,14 +1048,12 @@ define constant $underscore_code :: <integer> = as(<integer>, '_');
// Parse and return an integer in the supplied radix.
//
define method parse-integer
(source-location :: <lexer-source-location>,
(source-location :: <lexer-source-location>, contents :: <byte-vector>,
#key radix :: <integer> = 10,
start :: <integer> = source-location.start-posn,
end: finish :: <integer> = source-location.end-posn,
stop-at-non-digit? = #f)
=> (res :: <abstract-integer>, end-pos :: <integer>)
let contents :: <byte-vector>
= source-location.source-location-record.contents;
// We do our working in negative integers to avoid representation
// overflow until absolutely necessary.
local method repeat (posn :: <integer>, result :: <abstract-integer>)
Expand Down Expand Up @@ -1126,7 +1137,7 @@ define method parse-integer-literal
end if;
end if;

let int = parse-integer(source-location, radix: radix, start: posn);
let int = parse-integer(source-location, contents, radix: radix, start: posn);

if (~extended &
(int < runtime-$minimum-integer
Expand Down Expand Up @@ -1175,15 +1186,11 @@ end method make-character-literal;
define method %make-string-literal
(lexer :: <lexer>, source-location :: <lexer-source-location>,
start-offset :: <integer>, end-offset :: <integer>,
allow-escapes? :: <boolean>)
allow-escapes? :: <boolean>, multi-line? :: <boolean>)
=> (res :: <string-fragment>)
let bpos = source-location.start-posn + start-offset;
let epos = source-location.end-posn - end-offset;
let (string, multi-line?)
= decode-string(source-location, bpos, epos, allow-escapes?);
if (multi-line?)
string := trim-multi-line-prefix(string, source-location);
end;
let string = decode-string(source-location, bpos, epos, allow-escapes?, multi-line?);
make(<string-fragment>,
record: source-location.source-location-record,
source-position: source-location.source-location-source-position,
Expand All @@ -1192,16 +1199,16 @@ define method %make-string-literal
end method;

define constant make-string-literal // "..."
= rcurry(%make-string-literal, 1, 1, #t);
= rcurry(%make-string-literal, 1, 1, #t, #f);

define constant make-multi-line-string-literal // """..."""
= rcurry(%make-string-literal, 3, 3, #t);
= rcurry(%make-string-literal, 3, 3, #t, #t);

define constant make-raw-string-literal // #r"..."
= rcurry(%make-string-literal, 3, 1, #f);
= rcurry(%make-string-literal, 3, 1, #f, #f);

define constant make-multi-line-raw-string-literal // #r"""..."""
= rcurry(%make-string-literal, 5, 3, #f);
= rcurry(%make-string-literal, 5, 3, #f, #t);

define method parse-ratio-literal
(lexer :: <lexer>, source-location :: <lexer-source-location>)
Expand Down Expand Up @@ -1603,6 +1610,7 @@ end method parse-conditional;
// TODO: CORRECTNESS: Multiplatform newline sequence handling.

define constant $space-code = as(<integer>, ' ');
define constant $carriage-return-code = as(<integer>, '\r');
define constant $newline-code = as(<integer>, '\n');
define constant $tab-code = as(<integer>, '\t');

Expand Down
2 changes: 1 addition & 1 deletion sources/dfmc/reader/tests/dfmc-reader-test-suite-app.dylan
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ Module: dfmc-reader-test-suite-app
License: See License.txt in this distribution for details.


run-test-application(dfmc-reader-test-suite);
run-test-application();
Loading
Loading