Skip to content

Commit

Permalink
Rollup merge of #123951 - pitaj:reserve-guarded-strings, r=traviscross
Browse files Browse the repository at this point in the history
Reserve guarded string literals (RFC 3593)

Implementation for RFC 3593, including:
- lexer / parser changes
- diagnostics
- migration lint
- tests

We reserve `#"`, `##"`, `###"`, `####`, and any other string of four or more repeated `#`. This avoids infinite lookahead in the lexer, though we still use infinite lookahead in the parser to provide better forward compatibility diagnostics.

This PR does not implement any special lexing of the string internals:
- strings preceded by one or more `#` are denied
- regardless of the number of trailing `#`
- string contents are lexed as if it was just a bare `"string"`

Tracking issue: #123735
RFC: rust-lang/rfcs#3593
  • Loading branch information
matthiaskrgr authored Oct 9, 2024
2 parents a1eceec + 321a5db commit b41e939
Show file tree
Hide file tree
Showing 23 changed files with 1,514 additions and 9 deletions.
93 changes: 85 additions & 8 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ pub enum TokenKind {
/// for emoji identifier recovery, as those are not meant to be ever accepted.
InvalidPrefix,

/// Guarded string literal prefix: `#"` or `##`.
///
/// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
/// Split into the component tokens on older editions.
GuardedStrPrefix,

/// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
/// suffix, but may be present here on string and float literals. Users of
/// this type will need to check for and reject that case.
Expand Down Expand Up @@ -191,30 +197,41 @@ pub enum DocStyle {
/// `rustc_ast::ast::LitKind`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
/// "12_u8", "0o100", "0b120i99", "1f32".
/// `12_u8`, `0o100`, `0b120i99`, `1f32`.
Int { base: Base, empty_int: bool },
/// "12.34f32", "1e3", but not "1f32".
/// `12.34f32`, `1e3`, but not `1f32`.
Float { base: Base, empty_exponent: bool },
/// "'a'", "'\\'", "'''", "';"
/// `'a'`, `'\\'`, `'''`, `';`
Char { terminated: bool },
/// "b'a'", "b'\\'", "b'''", "b';"
/// `b'a'`, `b'\\'`, `b'''`, `b';`
Byte { terminated: bool },
/// ""abc"", ""abc"
/// `"abc"`, `"abc`
Str { terminated: bool },
/// "b"abc"", "b"abc"
/// `b"abc"`, `b"abc`
ByteStr { terminated: bool },
/// `c"abc"`, `c"abc`
CStr { terminated: bool },
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates
/// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`. `None` indicates
/// an invalid literal.
RawStr { n_hashes: Option<u8> },
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None`
/// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`. `None`
/// indicates an invalid literal.
RawByteStr { n_hashes: Option<u8> },
/// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal.
RawCStr { n_hashes: Option<u8> },
}

/// `#"abc"#`, `##"a"` (fewer closing), or even `#"a` (unterminated).
///
/// Can capture fewer closing hashes than starting hashes,
/// for more efficient lexing and better backwards diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct GuardedStr {
pub n_hashes: u32,
pub terminated: bool,
pub token_len: u32,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
/// Non `#` characters exist between `r` and `"`, e.g. `r##~"abcde"##`
Expand Down Expand Up @@ -403,6 +420,12 @@ impl Cursor<'_> {
TokenKind::Literal { kind: literal_kind, suffix_start }
}

// Guarded string literal prefix: `#"` or `##`
'#' if matches!(self.first(), '"' | '#') => {
self.bump();
TokenKind::GuardedStrPrefix
}

// One-symbol tokens.
';' => Semi,
',' => Comma,
Expand Down Expand Up @@ -780,6 +803,60 @@ impl Cursor<'_> {
false
}

/// Attempt to lex for a guarded string literal.
///
/// Used by `rustc_parse::lexer` to lex for guarded strings
/// conditionally based on edition.
///
/// Note: this will not reset the `Cursor` when a
/// guarded string is not found. It is the caller's
/// responsibility to do so.
pub fn guarded_double_quoted_string(&mut self) -> Option<GuardedStr> {
debug_assert!(self.prev() != '#');

let mut n_start_hashes: u32 = 0;
while self.first() == '#' {
n_start_hashes += 1;
self.bump();
}

if self.first() != '"' {
return None;
}
self.bump();
debug_assert!(self.prev() == '"');

// Lex the string itself as a normal string literal
// so we can recover that for older editions later.
let terminated = self.double_quoted_string();
if !terminated {
let token_len = self.pos_within_token();
self.reset_pos_within_token();

return Some(GuardedStr { n_hashes: n_start_hashes, terminated: false, token_len });
}

// Consume closing '#' symbols.
// Note that this will not consume extra trailing `#` characters:
// `###"abcde"####` is lexed as a `GuardedStr { n_end_hashes: 3, .. }`
// followed by a `#` token.
let mut n_end_hashes = 0;
while self.first() == '#' && n_end_hashes < n_start_hashes {
n_end_hashes += 1;
self.bump();
}

// Reserved syntax, always an error, so it doesn't matter if
// `n_start_hashes != n_end_hashes`.

self.eat_literal_suffix();

let token_len = self.pos_within_token();
self.reset_pos_within_token();

Some(GuardedStr { n_hashes: n_start_hashes, terminated: true, token_len })
}

/// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
fn raw_double_quoted_string(&mut self, prefix_len: u32) -> Result<u8, RawStrError> {
// Wrap the actual function to handle the error with too many hashes.
Expand Down
3 changes: 3 additions & 0 deletions compiler/rustc_lint/messages.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,9 @@ lint_reserved_prefix = prefix `{$prefix}` is unknown
.label = unknown prefix
.suggestion = insert whitespace here to avoid this being parsed as a prefix in Rust 2021
lint_reserved_string = will be parsed as a guarded string in Rust 2024
.suggestion = insert whitespace here to avoid this being parsed as a guarded string in Rust 2024
lint_shadowed_into_iter =
this method call resolves to `<&{$target} as IntoIterator>::into_iter` (due to backwards compatibility), but will resolve to `<{$target} as IntoIterator>::into_iter` in Rust {$edition}
.use_iter_suggestion = use `.iter()` instead of `.into_iter()` to avoid ambiguity
Expand Down
3 changes: 3 additions & 0 deletions compiler/rustc_lint/src/context/diagnostics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ pub(super) fn decorate_lint(sess: &Session, diagnostic: BuiltinLintDiag, diag: &
lints::RawPrefix { label: label_span, suggestion: label_span.shrink_to_hi() }
.decorate_lint(diag);
}
BuiltinLintDiag::ReservedString(suggestion) => {
lints::ReservedString { suggestion }.decorate_lint(diag);
}
BuiltinLintDiag::UnusedBuiltinAttribute { attr_name, macro_name, invoc_span } => {
lints::UnusedBuiltinAttribute { invoc_span, attr_name, macro_name }.decorate_lint(diag);
}
Expand Down
7 changes: 7 additions & 0 deletions compiler/rustc_lint/src/lints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3053,3 +3053,10 @@ pub(crate) enum MutRefSugg {
#[derive(LintDiagnostic)]
#[diag(lint_unqualified_local_imports)]
pub(crate) struct UnqualifiedLocalImportsDiag {}

#[derive(LintDiagnostic)]
#[diag(lint_reserved_string)]
pub(crate) struct ReservedString {
#[suggestion(code = " ", applicability = "machine-applicable")]
pub suggestion: Span,
}
41 changes: 41 additions & 0 deletions compiler/rustc_lint_defs/src/builtin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ declare_lint_pass! {
RUST_2021_INCOMPATIBLE_OR_PATTERNS,
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
RUST_2021_PRELUDE_COLLISIONS,
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
RUST_2024_INCOMPATIBLE_PAT,
RUST_2024_PRELUDE_COLLISIONS,
SELF_CONSTRUCTOR_FROM_OUTER_ITEM,
Expand Down Expand Up @@ -4996,3 +4997,43 @@ declare_lint! {
Warn,
"detects pointer to integer transmutes in const functions and associated constants",
}

declare_lint! {
/// The `rust_2024_guarded_string_incompatible_syntax` lint detects `#` tokens
/// that will be parsed as part of a guarded string literal in Rust 2024.
///
/// ### Example
///
/// ```rust,edition2021,compile_fail
/// #![deny(rust_2024_guarded_string_incompatible_syntax)]
///
/// macro_rules! m {
/// (# $x:expr #) => ();
/// (# $x:expr) => ();
/// }
///
/// m!(#"hey"#);
/// m!(#"hello");
/// ```
///
/// {{produces}}
///
/// ### Explanation
///
/// Prior to Rust 2024, `#"hey"#` is three tokens: the first `#`
/// followed by the string literal `"hey"` then the final `#`.
/// In Rust 2024, the whole sequence is considered a single token.
///
/// This lint suggests to add whitespace between the leading `#`
/// and the string to keep them separated in Rust 2024.
// Allow this lint -- rustdoc doesn't yet support threading edition into this lint's parser.
#[allow(rustdoc::invalid_rust_codeblocks)]
pub RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
Allow,
"will be parsed as a guarded string in Rust 2024",
@future_incompatible = FutureIncompatibleInfo {
reason: FutureIncompatibilityReason::EditionError(Edition::Edition2024),
reference: "issue #123735 <https://github.com/rust-lang/rust/issues/123735>",
};
crate_level_only
}
2 changes: 2 additions & 0 deletions compiler/rustc_lint_defs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,8 @@ pub enum BuiltinLintDiag {
ReservedPrefix(Span, String),
/// `'r#` in edition < 2021.
RawPrefix(Span),
/// `##` or `#"` is edition < 2024.
ReservedString(Span),
TrailingMacro(bool, Ident),
BreakWithLabelAndLoop(Span),
UnicodeTextFlow(Span, String),
Expand Down
4 changes: 4 additions & 0 deletions compiler/rustc_parse/messages.ftl
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,10 @@ parse_require_colon_after_labeled_expression = labeled expression must be follow
.label = the label
.suggestion = add `:` after the label
parse_reserved_string = invalid string literal
.note = unprefixed guarded string literals are reserved for future use since Rust 2024
.suggestion_whitespace = consider inserting whitespace here
parse_return_types_use_thin_arrow = return types are denoted using `->`
.suggestion = use `->` instead
Expand Down
18 changes: 18 additions & 0 deletions compiler/rustc_parse/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2110,6 +2110,24 @@ pub(crate) enum UnknownPrefixSugg {
},
}

#[derive(Diagnostic)]
#[diag(parse_reserved_string)]
#[note]
pub(crate) struct ReservedString {
#[primary_span]
pub span: Span,
#[subdiagnostic]
pub sugg: Option<GuardedStringSugg>,
}
#[derive(Subdiagnostic)]
#[suggestion(
parse_suggestion_whitespace,
code = " ",
applicability = "maybe-incorrect",
style = "verbose"
)]
pub(crate) struct GuardedStringSugg(#[primary_span] pub Span);

#[derive(Diagnostic)]
#[diag(parse_too_many_hashes)]
pub(crate) struct TooManyHashes {
Expand Down
84 changes: 83 additions & 1 deletion compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ use rustc_lexer::unescape::{self, EscapeError, Mode};
use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError};
use rustc_session::lint::BuiltinLintDiag;
use rustc_session::lint::builtin::{
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
};
use rustc_session::parse::ParseSess;
use rustc_span::symbol::Symbol;
Expand Down Expand Up @@ -251,6 +252,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let prefix_span = self.mk_sp(start, lit_start);
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
}
rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
Expand Down Expand Up @@ -781,6 +783,86 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
}
}

/// Detect guarded string literal syntax
///
/// RFC 3598 reserved this syntax for future use. As of Rust 2024,
/// using this syntax produces an error. In earlier editions, however, it
/// only results in an (allowed by default) lint, and is treated as
/// separate tokens.
fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
let span = self.mk_sp(start, self.pos);
let edition2024 = span.edition().at_least_rust_2024();

let space_pos = start + BytePos(1);
let space_span = self.mk_sp(space_pos, space_pos);

let mut cursor = Cursor::new(str_before);

let (span, unterminated) = match cursor.guarded_double_quoted_string() {
Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
let end = start + BytePos(token_len);
let span = self.mk_sp(start, end);
let str_start = start + BytePos(n_hashes);

if edition2024 {
self.cursor = cursor;
self.pos = end;
}

let unterminated = if terminated { None } else { Some(str_start) };

(span, unterminated)
}
_ => {
// We should only get here in the `##+` case.
debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");

(span, None)
}
};
if edition2024 {
if let Some(str_start) = unterminated {
// Only a fatal error if string is unterminated.
self.dcx()
.struct_span_fatal(
self.mk_sp(str_start, self.pos),
"unterminated double quote string",
)
.with_code(E0765)
.emit()
}

let sugg = if span.from_expansion() {
None
} else {
Some(errors::GuardedStringSugg(space_span))
};

// In Edition 2024 and later, emit a hard error.
let err = self.dcx().emit_err(errors::ReservedString { span, sugg });

token::Literal(token::Lit {
kind: token::Err(err),
symbol: self.symbol_from_to(start, self.pos),
suffix: None,
})
} else {
// Before Rust 2024, only emit a lint for migration.
self.psess.buffer_lint(
RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
span,
ast::CRATE_NODE_ID,
BuiltinLintDiag::ReservedString(space_span),
);

// For backwards compatibility, roll back to after just the first `#`
// and return the `Pound` token.
self.pos = start + BytePos(1);
self.cursor = Cursor::new(&str_before[1..]);
token::Pound
}
}

fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
}
Expand Down
1 change: 1 addition & 0 deletions src/librustdoc/html/highlight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,7 @@ impl<'src> Classifier<'src> {
// Number literals.
LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number,
},
TokenKind::GuardedStrPrefix => return no_highlight(sink),
TokenKind::Ident | TokenKind::RawIdent if lookahead == Some(TokenKind::Bang) => {
self.in_macro = true;
sink(Highlight::EnterSpan { class: Class::Macro(self.new_span(before, text)) });
Expand Down
6 changes: 6 additions & 0 deletions src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,12 @@ impl<'a> Converter<'a> {
}

rustc_lexer::TokenKind::RawIdent => IDENT,

rustc_lexer::TokenKind::GuardedStrPrefix => {
err = "Invalid string literal (reserved syntax)";
ERROR
},

rustc_lexer::TokenKind::Literal { kind, .. } => {
self.extend_literal(token_text.len(), kind);
return;
Expand Down
Loading

0 comments on commit b41e939

Please sign in to comment.