Skip to content

Commit

Permalink
[xt but not xc] Delay string literal unescaping.
Browse files Browse the repository at this point in the history
Currently string literals are unescaped twice.

- Once during lexing in `cook_quoted`/`cook_c_string`/`cook_common`.
  This one just checks for errors.

- Again in `LitKind::from_token_lit`, which is called when lowering AST
  to HIR, and also in a few other places during expansion. This one
  actually constructs the unescaped string. It also has error checking
  code, but that part of the code is actually dead (and has several
  bugs) because the check during lexing catches all errors!

This commit removes the error-check-only unescaping during lexing, and
fixes up `LitKind::from_token_lit` so it properly does both checking and
construction. This is a backwards-compatible language change: some
programs now compile that previously did not. For example, it is now
possible for macros to consume "invalid" string literals like "\a\b\c".
This is a continuation of a trend of delaying semantic error checking of
literals to after expansion:
- rust-lang#102944 did this for some cases for numeric literals
- The detection of NUL chars in C string literals is already delayed in
  this way.

Notes about test changes:

- `ignore-block-help.rs`: this requires a parse error for the test to
  work. The error used was an unescaping error, which is now delayed to
  after parsing. So the commit changes it to an "unterminated character
  literal" error which still occurs during parsing.

- `tests/ui/lexer/error-stage.rs`: this shows the newly allowed cases,
  due to delayed literal unescaping.

- Several tests had unescaping errors combined with unterminated literal
  errors. The former are now delayed but the latter remain as lexing
  errors. So the unterminated literal part needed to be split into a
  separate test file otherwise compilation would end before the other
  errors were reported.

- issue-62913.rs: The structure and output changed a bit. Issue rust-lang#62913
  was about an ICE due to an unterminated string literal, so the new
  version should be good enough.

- literals-are-validated-before-expansion.rs: this tests exactly the
  behaviour that has been changed, and so was removed

- A couple of other test produce the same errors, just in a different
  order.
  • Loading branch information
nnethercote committed Dec 18, 2023
1 parent 90e436f commit 1e49114
Show file tree
Hide file tree
Showing 37 changed files with 615 additions and 444 deletions.
270 changes: 194 additions & 76 deletions compiler/rustc_ast/src/util/literal.rs

Large diffs are not rendered by default.

15 changes: 7 additions & 8 deletions compiler/rustc_ast_lowering/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use rustc_data_structures::stack::ensure_sufficient_stack;
use rustc_hir as hir;
use rustc_hir::def::{DefKind, Res};
use rustc_middle::span_bug;
use rustc_parse::parser::report_lit_error;
use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
use rustc_span::source_map::{respan, Spanned};
use rustc_span::symbol::{kw, sym, Ident, Symbol};
use rustc_span::DUMMY_SP;
Expand Down Expand Up @@ -119,13 +119,12 @@ impl<'hir> LoweringContext<'_, 'hir> {
hir::ExprKind::Unary(op, ohs)
}
ExprKind::Lit(token_lit) => {
let lit_kind = match LitKind::from_token_lit(*token_lit) {
Ok(lit_kind) => lit_kind,
Err(err) => {
report_lit_error(&self.tcx.sess.parse_sess, err, *token_lit, e.span);
LitKind::Err
}
};
let lit_kind = token_lit_to_lit_kind_and_report_errs(
&self.tcx.sess.parse_sess,
*token_lit,
e.span,
)
.unwrap_or(LitKind::Err);
let lit = self.arena.alloc(respan(self.lower_span(e.span), lit_kind));
hir::ExprKind::Lit(lit)
}
Expand Down
63 changes: 31 additions & 32 deletions compiler/rustc_builtin_macros/src/concat.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use rustc_ast as ast;
use rustc_ast::tokenstream::TokenStream;
use rustc_expand::base::{self, DummyResult};
use rustc_parse::parser::report_lit_error;
use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
use rustc_span::symbol::Symbol;

use crate::errors;
Expand All @@ -19,44 +19,43 @@ pub fn expand_concat(
let mut has_errors = false;
for e in es {
match e.kind {
ast::ExprKind::Lit(token_lit) => match ast::LitKind::from_token_lit(token_lit) {
Ok(ast::LitKind::Str(s, _) | ast::LitKind::Float(s, _)) => {
accumulator.push_str(s.as_str());
}
Ok(ast::LitKind::Char(c)) => {
accumulator.push(c);
}
Ok(ast::LitKind::Int(i, _)) => {
accumulator.push_str(&i.to_string());
}
Ok(ast::LitKind::Bool(b)) => {
accumulator.push_str(&b.to_string());
}
Ok(ast::LitKind::CStr(..)) => {
cx.emit_err(errors::ConcatCStrLit { span: e.span });
has_errors = true;
}
Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => {
cx.emit_err(errors::ConcatBytestr { span: e.span });
has_errors = true;
}
Ok(ast::LitKind::Err) => {
has_errors = true;
}
Err(err) => {
report_lit_error(&cx.sess.parse_sess, err, token_lit, e.span);
has_errors = true;
ast::ExprKind::Lit(token_lit) => {
match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, e.span)
{
Ok(ast::LitKind::Str(s, _) | ast::LitKind::Float(s, _)) => {
accumulator.push_str(s.as_str());
}
Ok(ast::LitKind::Char(c)) => {
accumulator.push(c);
}
Ok(ast::LitKind::Int(i, _)) => {
accumulator.push_str(&i.to_string());
}
Ok(ast::LitKind::Bool(b)) => {
accumulator.push_str(&b.to_string());
}
Ok(ast::LitKind::CStr(..)) => {
cx.emit_err(errors::ConcatCStrLit { span: e.span });
has_errors = true;
}
Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => {
cx.emit_err(errors::ConcatBytestr { span: e.span });
has_errors = true;
}
Ok(ast::LitKind::Err) | Err(()) => {
has_errors = true;
}
}
},
}
// We also want to allow negative numeric literals.
ast::ExprKind::Unary(ast::UnOp::Neg, ref expr)
if let ast::ExprKind::Lit(token_lit) = expr.kind =>
{
match ast::LitKind::from_token_lit(token_lit) {
match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, e.span)
{
Ok(ast::LitKind::Int(i, _)) => accumulator.push_str(&format!("-{i}")),
Ok(ast::LitKind::Float(f, _)) => accumulator.push_str(&format!("-{f}")),
Err(err) => {
report_lit_error(&cx.sess.parse_sess, err, token_lit, e.span);
Err(()) => {
has_errors = true;
}
_ => missing_literal.push(e.span),
Expand Down
8 changes: 3 additions & 5 deletions compiler/rustc_builtin_macros/src/concat_bytes.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use rustc_ast as ast;
use rustc_ast::{ptr::P, tokenstream::TokenStream};
use rustc_expand::base::{self, DummyResult};
use rustc_parse::parser::report_lit_error;
use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
use rustc_span::Span;

use crate::errors;
Expand All @@ -17,7 +17,7 @@ fn invalid_type_err(
ConcatBytesInvalid, ConcatBytesInvalidSuggestion, ConcatBytesNonU8, ConcatBytesOob,
};
let snippet = cx.sess.source_map().span_to_snippet(span).ok();
match ast::LitKind::from_token_lit(token_lit) {
match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, span) {
Ok(ast::LitKind::CStr(_, _)) => {
// Avoid ambiguity in handling of terminal `NUL` by refusing to
// concatenate C string literals as bytes.
Expand Down Expand Up @@ -60,9 +60,7 @@ fn invalid_type_err(
cx.emit_err(ConcatBytesNonU8 { span });
}
Ok(ast::LitKind::ByteStr(..) | ast::LitKind::Byte(_)) => unreachable!(),
Err(err) => {
report_lit_error(&cx.sess.parse_sess, err, token_lit, span);
}
Err(()) => {}
}
}

Expand Down
44 changes: 24 additions & 20 deletions compiler/rustc_expand/src/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1235,26 +1235,30 @@ pub fn expr_to_spanned_string<'a>(
let expr = cx.expander().fully_expand_fragment(AstFragment::Expr(expr)).make_expr();

Err(match expr.kind {
ast::ExprKind::Lit(token_lit) => match ast::LitKind::from_token_lit(token_lit) {
Ok(ast::LitKind::Str(s, style)) => return Ok((s, style, expr.span)),
Ok(ast::LitKind::ByteStr(..)) => {
let mut err = cx.struct_span_err(expr.span, err_msg);
let span = expr.span.shrink_to_lo();
err.span_suggestion(
span.with_hi(span.lo() + BytePos(1)),
"consider removing the leading `b`",
"",
Applicability::MaybeIncorrect,
);
Some((err, true))
}
Ok(ast::LitKind::Err) => None,
Err(err) => {
parser::report_lit_error(&cx.sess.parse_sess, err, token_lit, expr.span);
None
}
_ => Some((cx.struct_span_err(expr.span, err_msg), false)),
},
ast::ExprKind::Lit(token_lit) => {
let res = match parser::token_lit_to_lit_kind_and_report_errs(
&cx.sess.parse_sess,
token_lit,
expr.span,
) {
Ok(ast::LitKind::Str(s, style)) => return Ok((s, style, expr.span)),
Ok(ast::LitKind::ByteStr(..)) => {
let mut err = cx.struct_span_err(expr.span, err_msg);
let span = expr.span.shrink_to_lo();
err.span_suggestion(
span.with_hi(span.lo() + BytePos(1)),
"consider removing the leading `b`",
"",
Applicability::MaybeIncorrect,
);
Some((err, true))
}
Ok(ast::LitKind::Err) => None,
Err(()) => None,
_ => Some((cx.struct_span_err(expr.span, err_msg), false)),
};
res
}
ast::ExprKind::Err => None,
_ => Some((cx.struct_span_err(expr.span, err_msg), false)),
})
Expand Down
2 changes: 1 addition & 1 deletion compiler/rustc_lexer/src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ where
// them in the range computation.
while let Some(c) = chars.next() {
let start = src.len() - chars.as_str().len() - c.len_utf8();
let res = match c {
let res: Result<T, EscapeError> = match c {
'\\' => {
match chars.clone().next() {
Some('\n') => {
Expand Down
91 changes: 12 additions & 79 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use std::ops::Range;

use crate::errors;
use crate::lexer::unicode_chars::UNICODE_ARRAY;
use crate::make_unclosed_delims_error;
Expand All @@ -8,7 +6,6 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
use rustc_ast::tokenstream::TokenStream;
use rustc_ast::util::unicode::contains_text_flow_control_chars;
use rustc_errors::{error_code, Applicability, Diagnostic, DiagnosticBuilder, StashKey};
use rustc_lexer::unescape::{self, EscapeError, Mode};
use rustc_lexer::{Base, DocStyle, RawStrError};
use rustc_lexer::{Cursor, LiteralKind};
use rustc_session::lint::builtin::{
Expand All @@ -21,10 +18,10 @@ use rustc_span::{edition::Edition, BytePos, Pos, Span};

mod diagnostics;
mod tokentrees;
mod unescape_error_reporting;
pub(crate) mod unescape_error_reporting;
mod unicode_chars;

use unescape_error_reporting::{emit_unescape_error, escaped_char};
use unescape_error_reporting::escaped_char;

// This type is used a lot. Make sure it doesn't unintentionally get bigger.
//
Expand Down Expand Up @@ -409,7 +406,7 @@ impl<'a> StringReader<'a> {
error_code!(E0762),
)
}
self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
self.cook_quoted(token::Char, start, end, 1, 1) // ' '
}
rustc_lexer::LiteralKind::Byte { terminated } => {
if !terminated {
Expand All @@ -419,7 +416,7 @@ impl<'a> StringReader<'a> {
error_code!(E0763),
)
}
self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
self.cook_quoted(token::Byte, start, end, 2, 1) // b' '
}
rustc_lexer::LiteralKind::Str { terminated } => {
if !terminated {
Expand All @@ -429,7 +426,7 @@ impl<'a> StringReader<'a> {
error_code!(E0765),
)
}
self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
self.cook_quoted(token::Str, start, end, 1, 1) // " "
}
rustc_lexer::LiteralKind::ByteStr { terminated } => {
if !terminated {
Expand All @@ -439,7 +436,7 @@ impl<'a> StringReader<'a> {
error_code!(E0766),
)
}
self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
self.cook_quoted(token::ByteStr, start, end, 2, 1) // b" "
}
rustc_lexer::LiteralKind::CStr { terminated } => {
if !terminated {
Expand All @@ -449,13 +446,13 @@ impl<'a> StringReader<'a> {
error_code!(E0767),
)
}
self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
self.cook_quoted(token::CStr, start, end, 2, 1) // c" "
}
rustc_lexer::LiteralKind::RawStr { n_hashes } => {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
let kind = token::StrRaw(n_hashes);
self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
self.cook_quoted(kind, start, end, 2 + n, 1 + n) // r##" "##
} else {
self.report_raw_str_error(start, 1);
}
Expand All @@ -464,7 +461,7 @@ impl<'a> StringReader<'a> {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
let kind = token::ByteStrRaw(n_hashes);
self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
self.cook_quoted(kind, start, end, 3 + n, 1 + n) // br##" "##
} else {
self.report_raw_str_error(start, 2);
}
Expand All @@ -473,7 +470,7 @@ impl<'a> StringReader<'a> {
if let Some(n_hashes) = n_hashes {
let n = u32::from(n_hashes);
let kind = token::CStrRaw(n_hashes);
self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
self.cook_quoted(kind, start, end, 3 + n, 1 + n) // cr##" "##
} else {
self.report_raw_str_error(start, 2);
}
Expand Down Expand Up @@ -693,82 +690,18 @@ impl<'a> StringReader<'a> {
self.sess.emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
}

fn cook_common(
fn cook_quoted(
&self,
kind: token::LitKind,
mode: Mode,
start: BytePos,
end: BytePos,
prefix_len: u32,
postfix_len: u32,
unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
) -> (token::LitKind, Symbol) {
let mut has_fatal_err = false;
let content_start = start + BytePos(prefix_len);
let content_end = end - BytePos(postfix_len);
let lit_content = self.str_from_to(content_start, content_end);
unescape(lit_content, mode, &mut |range, result| {
// Here we only check for errors. The actual unescaping is done later.
if let Err(err) = result {
let span_with_quotes = self.mk_sp(start, end);
let (start, end) = (range.start as u32, range.end as u32);
let lo = content_start + BytePos(start);
let hi = lo + BytePos(end - start);
let span = self.mk_sp(lo, hi);
if err.is_fatal() {
has_fatal_err = true;
}
emit_unescape_error(
&self.sess.span_diagnostic,
lit_content,
span_with_quotes,
span,
mode,
range,
err,
);
}
});

// We normally exclude the quotes for the symbol, but for errors we
// include it because it results in clearer error messages.
if !has_fatal_err {
(kind, Symbol::intern(lit_content))
} else {
(token::Err, self.symbol_from_to(start, end))
}
}

fn cook_quoted(
&self,
kind: token::LitKind,
mode: Mode,
start: BytePos,
end: BytePos,
prefix_len: u32,
postfix_len: u32,
) -> (token::LitKind, Symbol) {
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
unescape::unescape_literal(src, mode, &mut |span, result| {
callback(span, result.map(drop))
})
})
}

fn cook_c_string(
&self,
kind: token::LitKind,
mode: Mode,
start: BytePos,
end: BytePos,
prefix_len: u32,
postfix_len: u32,
) -> (token::LitKind, Symbol) {
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
unescape::unescape_c_string(src, mode, &mut |span, result| {
callback(span, result.map(drop))
})
})
(kind, Symbol::intern(lit_content))
}
}

Expand Down
Loading

0 comments on commit 1e49114

Please sign in to comment.