Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parser translator ast for heredoc with written newlines #3347

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions lib/prism/translation/parser/compiler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2079,27 +2079,49 @@ def visit_heredoc(node)

escaped_lengths = []
normalized_lengths = []
# Keeps track of where an unescaped line should start a new token. An unescaped
# \n would otherwise be indistinguishable from the actual newline at the end of
# of the line. The parser gem only emits a new string node at "real" newlines,
# line line continuations don't start a new node as well.
do_next_tokens = []

if node.opening.end_with?("'")
escaped.each do |line|
escaped_lengths << line.bytesize
normalized_lengths << chomped_bytesize(line)
do_next_tokens << true
end
else
escaped
.chunk_while { |before, after| before.match?(/(?<!\\)\\\r?\n$/) }
.chunk_while { |before, after| before[/(\\*)\r?\n$/, 1].length.odd? }
.each do |lines|
escaped_lengths << lines.sum(&:bytesize)
normalized_lengths << lines.sum { |line| chomped_bytesize(line) }
unescaped_lines_count = lines.sum do |line|
line.scan(/(\\*)n/).count { |(backslashes)| backslashes.length.odd? }
end
do_next_tokens.push(*Array.new(unescaped_lines_count + 1, false))
do_next_tokens[-1] = true
end
end

start_offset = part.location.start_offset

unescaped.map.with_index do |unescaped_line, index|
inner_part = builder.string_internal([unescaped_line, srange_offsets(start_offset, start_offset + normalized_lengths.fetch(index, 0))])
start_offset += escaped_lengths.fetch(index, 0)
inner_part
current_line = +""
current_normalized_length = 0

unescaped.filter_map.with_index do |unescaped_line, index|
current_line << unescaped_line
current_normalized_length += normalized_lengths.fetch(index, 0)

if do_next_tokens[index]
inner_part = builder.string_internal([current_line, srange_offsets(start_offset, start_offset + current_normalized_length)])
start_offset += escaped_lengths.fetch(index, 0)
current_line = +""
current_normalized_length = 0
inner_part
else
nil
end
end
else
[visit(part)]
Expand Down
176 changes: 149 additions & 27 deletions lib/prism/translation/parser/lexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,10 @@ class Lexer
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
]

private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
# Heredocs are complex and require us to keep track of a bit of info to refer to later
HeredocData = Struct.new(:identifier, :common_whitespace, :quote, keyword_init: true)

private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData

# The Parser::Source::Buffer that the tokens were lexed from.
attr_reader :source_buffer
Expand Down Expand Up @@ -230,7 +233,7 @@ def to_a
index = 0
length = lexed.length

heredoc_identifier_stack = []
heredoc_stack = Array.new

while index < length
token, state = lexed[index]
Expand Down Expand Up @@ -299,9 +302,6 @@ def to_a
when :tSPACE
value = nil
when :tSTRING_BEG
if token.type == :HEREDOC_START
heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
end
next_token = lexed[index][0]
next_next_token = lexed[index + 1][0]
basic_quotes = ["\"", "'"].include?(value)
Expand All @@ -316,43 +316,81 @@ def to_a
# the parser gem doesn't simplify strings when its value ends in a newline
unless (string_value = next_token.value).end_with?("\n")
next_location = token.location.join(next_next_token.location)
value = string_value.gsub("\\\\", "\\")
value = unescape_string(string_value)
type = :tSTRING
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 2
end
elsif value.start_with?("<<")
elsif token.type == :HEREDOC_START
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
heredoc = HeredocData.new(
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
common_whitespace: 0,
quote: quote,
)

if quote == "`"
type = :tXSTRING_BEG
value = "<<`"
end

# The parser gem trims whitespace from squiggly heredocs. We must record
# the most common whitespace to later remove.
if heredoc_type == "~" || heredoc_type == "`"
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
end

if quote == "'" || quote == '"' || quote == "`"
value = "<<#{quote}"
else
value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
value = '<<"'
end

heredoc_stack.push(heredoc)
end
when :tSTRING_CONTENT
unless (lines = token.value.lines).one?
if (lines = token.value.lines).one?
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
# The parser gem only removes indentation when the heredoc is not nested
not_nested = heredoc_stack.size == 1
current_heredoc = heredoc_stack.last
if is_first_token_on_line && not_nested && current_heredoc.common_whitespace > 0
value = trim_heredoc_whitespace(value, heredoc)
end
if current_heredoc
value = unescape_heredoc(value, heredoc)
end
else
# When the parser gem encounters a line continuation inside of a multiline string,
# it emits a single string node. The backslash (and remaining newline) is removed.
current_line = +""
adjustment = 0
start_offset = offset_cache[token.location.start_offset]
lines.map do |line|
newline = line.end_with?("\r\n") ? "\r\n" : "\n"
emit = false

lines.each.with_index do |line, index|
chomped_line = line.chomp
if match = chomped_line.match(/(?<backslashes>\\+)\z/)
adjustment = match[:backslashes].size / 2
adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
if match[:backslashes].size.odd?
adjusted_line.delete_suffix!("\\")
adjustment += 2
else
adjusted_line << newline
end

# When the line ends with an odd number of backslashes, it must be a line continuation.
if chomped_line[/\\{1,}\z/]&.length&.odd?
chomped_line.delete_suffix!("\\")
current_line << chomped_line
adjustment += 2
# If the string ends with a line continuation emit the remainder
emit = index == lines.count - 1
else
adjusted_line = line
adjustment = 0
current_line << line
emit = true
end

end_offset = start_offset + adjusted_line.bytesize + adjustment
tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
start_offset = end_offset
if emit
end_offset = start_offset + current_line.bytesize + adjustment
tokens << [:tSTRING_CONTENT, [unescape_string(current_line), Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
start_offset = end_offset
current_line = +""
adjustment = 0
end
end
next
end
Expand All @@ -361,7 +399,7 @@ def to_a
when :tSTRING_END
if token.type == :HEREDOC_END && value.end_with?("\n")
newline_length = value.end_with?("\r\n") ? 2 : 1
value = heredoc_identifier_stack.pop
value = heredoc_stack.pop.identifier
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
elsif token.type == :REGEXP_END
value = value[0]
Expand Down Expand Up @@ -443,6 +481,90 @@ def parse_rational(value)
rescue ArgumentError
0r
end

# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
def calculate_heredoc_whitespace(heredoc_token_index)
next_token_index = heredoc_token_index
nesting_level = 0
previous_line = -1
result = Float::MAX

while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
next_token_index += 1
next_next_token = lexed[next_token_index] && lexed[next_token_index][0]

# String content inside nested heredocs and interpolation is ignored
if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
nesting_level += 1
elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
nesting_level -= 1
# When we encountered the matching heredoc end, we can exit
break if nesting_level == -1
elsif next_token.type == :STRING_CONTENT && nesting_level == 0
common_whitespace = 0
next_token.value[/^\s*/].each_char do |char|
if char == "\t"
common_whitespace = (common_whitespace / 8 + 1) * 8;
else
common_whitespace += 1
end
end

is_first_token_on_line = next_token.location.start_line != previous_line
# Whitespace is significant if followed by interpolation
whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != next_token.location.start_line
if is_first_token_on_line && !whitespace_only && common_whitespace < result
result = common_whitespace
previous_line = next_token.location.start_line
end
end
end
result
end

# Wonky heredoc tab/spaces rules.
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
def trim_heredoc_whitespace(string, heredoc)
trimmed_whitespace = 0
trimmed_characters = 0
while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace
if string[trimmed_characters] == "\t"
trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8;
break if trimmed_whitespace > heredoc.common_whitespace
else
trimmed_whitespace += 1
end
trimmed_characters += 1
end

string[trimmed_characters..]
end

# Naive string escaping handling. Should be closer to the "unescape_heredoc" method
def unescape_string(string)
string.gsub("\\\\", "\\")
end

# Escape sequences that have special and should appear unescaped in the resulting string.
ESCAPES = {
"a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
"n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
"v" => "\v", "\\\\" => "\\"
}.freeze
private_constant :ESCAPES

# TODO: Does not handle "\u1234" and other longer-form escapes.
def unescape_heredoc(string, heredoc)
# In single-quoted heredocs, everything is taken literally.
return string if heredoc.quote == "'"

# When double-quoted, escape sequences can be written literally. For example, "\\n" becomes "\n",
# and "\\\\n" becomes "\\n". Unknown escapes sequences, like "\\o" simply become "o".
string.gsub(/\\./) do |match|
ESCAPES[match[1]] || match[1]
end
end
end
end
end
Expand Down
18 changes: 0 additions & 18 deletions test/prism/ruby/parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,12 @@ class ParserTest < TestCase
"seattlerb/heredoc_with_extra_carriage_returns_windows.txt",
"seattlerb/heredoc_with_only_carriage_returns_windows.txt",
"seattlerb/heredoc_with_only_carriage_returns.txt",
"seattlerb/parse_line_heredoc_hardnewline.txt",
"seattlerb/pctW_lineno.txt",
"seattlerb/regexp_esc_C_slash.txt",
"unparser/corpus/literal/literal.txt",
"unparser/corpus/semantic/dstr.txt",
"whitequark/dedenting_interpolating_heredoc_fake_line_continuation.txt",
"whitequark/parser_slash_slash_n_escaping_in_literals.txt",
"whitequark/ruby_bug_11989.txt"
]

# Not sure why these files are failing on JRuby, but skipping them for now.
Expand Down Expand Up @@ -102,20 +100,9 @@ class ParserTest < TestCase
"seattlerb/difficult6__7.txt",
"seattlerb/difficult6__8.txt",
"seattlerb/dsym_esc_to_sym.txt",
"seattlerb/heredoc__backslash_dos_format.txt",
"seattlerb/heredoc_backslash_nl.txt",
"seattlerb/heredoc_squiggly_blank_line_plus_interpolation.txt",
"seattlerb/heredoc_squiggly_blank_lines.txt",
"seattlerb/heredoc_squiggly_interp.txt",
"seattlerb/heredoc_squiggly_tabs_extra.txt",
"seattlerb/heredoc_squiggly_tabs.txt",
"seattlerb/heredoc_squiggly_visually_blank_lines.txt",
"seattlerb/heredoc_squiggly.txt",
"seattlerb/heredoc_unicode.txt",
"seattlerb/heredoc_with_carriage_return_escapes_windows.txt",
"seattlerb/heredoc_with_carriage_return_escapes.txt",
"seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt",
"seattlerb/heredoc_with_interpolation_and_carriage_return_escapes.txt",
"seattlerb/module_comments.txt",
"seattlerb/parse_line_block_inline_comment_leading_newlines.txt",
"seattlerb/parse_line_block_inline_comment.txt",
Expand All @@ -135,22 +122,17 @@ class ParserTest < TestCase
"seattlerb/str_newline_hash_line_number.txt",
"seattlerb/TestRubyParserShared.txt",
"unparser/corpus/literal/assignment.txt",
"unparser/corpus/literal/dstr.txt",
"unparser/corpus/semantic/opasgn.txt",
"whitequark/args.txt",
"whitequark/beginless_erange_after_newline.txt",
"whitequark/beginless_irange_after_newline.txt",
"whitequark/bug_ascii_8bit_in_literal.txt",
"whitequark/bug_def_no_paren_eql_begin.txt",
"whitequark/dedenting_heredoc.txt",
"whitequark/dedenting_non_interpolating_heredoc_line_continuation.txt",
"whitequark/forward_arg_with_open_args.txt",
"whitequark/lbrace_arg_after_command_args.txt",
"whitequark/multiple_pattern_matches.txt",
"whitequark/newline_in_hash_argument.txt",
"whitequark/parser_bug_640.txt",
"whitequark/parser_drops_truncated_parts_of_squiggly_heredoc.txt",
"whitequark/ruby_bug_11990.txt",
"whitequark/ruby_bug_14690.txt",
"whitequark/ruby_bug_9669.txt",
"whitequark/slash_newline_in_heredocs.txt",
Expand Down
Loading