From 1322e476bf5eecfad98f0b200f15c1b46a0d46d2 Mon Sep 17 00:00:00 2001 From: Nilstrieb <48135649+Nilstrieb@users.noreply.github.com> Date: Tue, 27 Dec 2022 21:24:35 +0100 Subject: [PATCH 1/3] Improve debug logs of `find_width_of_character_at_span` --- compiler/rustc_span/src/source_map.rs | 29 +++++++++++---------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/compiler/rustc_span/src/source_map.rs b/compiler/rustc_span/src/source_map.rs index d9c87ac0ba82b..fa09b4faa441f 100644 --- a/compiler/rustc_span/src/source_map.rs +++ b/compiler/rustc_span/src/source_map.rs @@ -964,45 +964,40 @@ impl SourceMap { /// Finds the width of the character, either before or after the end of provided span, /// depending on the `forwards` parameter. + #[instrument(skip(self, sp))] fn find_width_of_character_at_span(&self, sp: Span, forwards: bool) -> u32 { let sp = sp.data(); if sp.lo == sp.hi && !forwards { - debug!("find_width_of_character_at_span: early return empty span"); + debug!("early return empty span"); return 1; } let local_begin = self.lookup_byte_offset(sp.lo); let local_end = self.lookup_byte_offset(sp.hi); - debug!( - "find_width_of_character_at_span: local_begin=`{:?}`, local_end=`{:?}`", - local_begin, local_end - ); + debug!("local_begin=`{:?}`, local_end=`{:?}`", local_begin, local_end); if local_begin.sf.start_pos != local_end.sf.start_pos { - debug!("find_width_of_character_at_span: begin and end are in different files"); + debug!("begin and end are in different files"); return 1; } let start_index = local_begin.pos.to_usize(); let end_index = local_end.pos.to_usize(); - debug!( - "find_width_of_character_at_span: start_index=`{:?}`, end_index=`{:?}`", - start_index, end_index - ); + debug!("start_index=`{:?}`, end_index=`{:?}`", start_index, end_index); // Disregard indexes that are at the start or end of their spans, they can't fit bigger // characters. 
if (!forwards && end_index == usize::MIN) || (forwards && start_index == usize::MAX) { - debug!("find_width_of_character_at_span: start or end of span, cannot be multibyte"); + debug!("start or end of span, cannot be multibyte"); return 1; } let source_len = (local_begin.sf.end_pos - local_begin.sf.start_pos).to_usize(); - debug!("find_width_of_character_at_span: source_len=`{:?}`", source_len); + debug!("source_len=`{:?}`", source_len); // Ensure indexes are also not malformed. if start_index > end_index || end_index > source_len - 1 { - debug!("find_width_of_character_at_span: source indexes are malformed"); + debug!("source indexes are malformed"); return 1; } @@ -1017,10 +1012,10 @@ impl SourceMap { } else { return 1; }; - debug!("find_width_of_character_at_span: snippet=`{:?}`", snippet); + debug!("snippet=`{:?}`", snippet); let mut target = if forwards { end_index + 1 } else { end_index - 1 }; - debug!("find_width_of_character_at_span: initial target=`{:?}`", target); + debug!("initial target=`{:?}`", target); while !snippet.is_char_boundary(target - start_index) && target < source_len { target = if forwards { @@ -1033,9 +1028,9 @@ impl SourceMap { } } }; - debug!("find_width_of_character_at_span: target=`{:?}`", target); + debug!("target=`{:?}`", target); } - debug!("find_width_of_character_at_span: final target=`{:?}`", target); + debug!("final target=`{:?}`", target); if forwards { (target - end_index) as u32 } else { (end_index - target) as u32 } } From e6c02aad9345925cfed74f86b414c4d0715d381b Mon Sep 17 00:00:00 2001 From: Nilstrieb <48135649+Nilstrieb@users.noreply.github.com> Date: Tue, 27 Dec 2022 22:15:25 +0100 Subject: [PATCH 2/3] Improve heuristics whether `format_args` string is a source literal Previously, it only checked whether there was _a_ literal at the span of the first argument, not whether the literal actually matched up. This caused issues when a proc macro was generating a different literal with the same span. 
This requires an annoying special case for literals ending in `\n` because otherwise `println` wouldn't give detailed diagnostics anymore which would be bad. --- compiler/rustc_parse_format/src/lib.rs | 37 ++++++++++++++++++- .../fmt/auxiliary/format-string-proc-macro.rs | 14 ++++++- .../ui/fmt/respanned-literal-issue-106191.rs | 10 +++++ .../fmt/respanned-literal-issue-106191.stderr | 19 ++++++++++ 4 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 src/test/ui/fmt/respanned-literal-issue-106191.rs create mode 100644 src/test/ui/fmt/respanned-literal-issue-106191.stderr diff --git a/compiler/rustc_parse_format/src/lib.rs b/compiler/rustc_parse_format/src/lib.rs index ab0463045faf9..84243e53fafe0 100644 --- a/compiler/rustc_parse_format/src/lib.rs +++ b/compiler/rustc_parse_format/src/lib.rs @@ -20,6 +20,7 @@ pub use Flag::*; pub use Piece::*; pub use Position::*; +use rustc_lexer::unescape; use std::iter; use std::str; use std::string; @@ -306,7 +307,7 @@ impl<'a> Parser<'a> { append_newline: bool, mode: ParseMode, ) -> Parser<'a> { - let (width_map, is_literal) = find_width_map_from_snippet(snippet, style); + let (width_map, is_literal) = find_width_map_from_snippet(s, snippet, style); Parser { mode, input: s, @@ -844,6 +845,7 @@ impl<'a> Parser<'a> { /// written code (code snippet) and the `InternedString` that gets processed in the `Parser` /// in order to properly synthesise the intra-string `Span`s for error diagnostics. fn find_width_map_from_snippet( + input: &str, snippet: Option, str_style: Option, ) -> (Vec, bool) { @@ -856,8 +858,27 @@ fn find_width_map_from_snippet( return (vec![], true); } + // Strip quotes. let snippet = &snippet[1..snippet.len() - 1]; + // Macros like `println` add a newline at the end. That technically doesn't make them "literals" anymore, but it's fine + // since we will never need to point our spans there, so we lie about it here by ignoring it. 
+ // Since there might actually be newlines in the source code, we need to normalize away all trailing newlines. + // If we only trimmed it off the input, `format!("\n")` would cause a mismatch as here they actually match up. + // Alternatively, we could just count the trailing newlines and only trim one from the input if they don't match up. + let input_no_nl = input.trim_end_matches('\n'); + let Ok(unescaped) = unescape_string(snippet) else { + return (vec![], false); + }; + + let unescaped_no_nl = unescaped.trim_end_matches('\n'); + + if unescaped_no_nl != input_no_nl { + // The source string that we're pointing at isn't our input, so spans pointing at it will be incorrect. + // This can for example happen with proc macros that respan generated literals. + return (vec![], false); + } + let mut s = snippet.char_indices(); let mut width_mappings = vec![]; while let Some((pos, c)) = s.next() { @@ -936,9 +957,23 @@ fn find_width_map_from_snippet( _ => {} } } + (width_mappings, true) } +fn unescape_string(string: &str) -> Result { + let mut buf = string::String::new(); + let mut error = Ok(()); + unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(err) => error = Err(err), + } + }); + + error.map(|_| buf) +} + // Assert a reasonable size for `Piece` #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))] rustc_data_structures::static_assert_size!(Piece<'_>, 16); diff --git a/src/test/ui/fmt/auxiliary/format-string-proc-macro.rs b/src/test/ui/fmt/auxiliary/format-string-proc-macro.rs index e44a84776bc69..539c8fb27b3b0 100644 --- a/src/test/ui/fmt/auxiliary/format-string-proc-macro.rs +++ b/src/test/ui/fmt/auxiliary/format-string-proc-macro.rs @@ -5,7 +5,8 @@ extern crate proc_macro; -use proc_macro::{Literal, Span, TokenStream, TokenTree}; +use proc_macro::{Delimiter, Group, Ident, Literal, Punct, Spacing, Span, TokenStream, TokenTree}; +use std::iter::FromIterator; 
#[proc_macro] pub fn foo_with_input_span(input: TokenStream) -> TokenStream { @@ -26,3 +27,14 @@ pub fn err_with_input_span(input: TokenStream) -> TokenStream { TokenStream::from(TokenTree::Literal(lit)) } + +#[proc_macro] +pub fn respan_to_invalid_format_literal(input: TokenStream) -> TokenStream { + let mut s = Literal::string("{"); + s.set_span(input.into_iter().next().unwrap().span()); + TokenStream::from_iter([ + TokenTree::from(Ident::new("format", Span::call_site())), + TokenTree::from(Punct::new('!', Spacing::Alone)), + TokenTree::from(Group::new(Delimiter::Parenthesis, TokenTree::from(s).into())), + ]) +} diff --git a/src/test/ui/fmt/respanned-literal-issue-106191.rs b/src/test/ui/fmt/respanned-literal-issue-106191.rs new file mode 100644 index 0000000000000..44642a10fc076 --- /dev/null +++ b/src/test/ui/fmt/respanned-literal-issue-106191.rs @@ -0,0 +1,10 @@ +// aux-build:format-string-proc-macro.rs + +extern crate format_string_proc_macro; + +fn main() { + format_string_proc_macro::respan_to_invalid_format_literal!("¡"); + //~^ ERROR invalid format string: expected `'}'` but string was terminated + format_args!(r#concat!("¡ {")); + //~^ ERROR invalid format string: expected `'}'` but string was terminated +} diff --git a/src/test/ui/fmt/respanned-literal-issue-106191.stderr b/src/test/ui/fmt/respanned-literal-issue-106191.stderr new file mode 100644 index 0000000000000..73a3af65a3849 --- /dev/null +++ b/src/test/ui/fmt/respanned-literal-issue-106191.stderr @@ -0,0 +1,19 @@ +error: invalid format string: expected `'}'` but string was terminated + --> $DIR/respanned-literal-issue-106191.rs:6:65 + | +LL | format_string_proc_macro::respan_to_invalid_format_literal!("¡"); + | ^^^ expected `'}'` in format string + | + = note: if you intended to print `{`, you can escape it using `{{` + +error: invalid format string: expected `'}'` but string was terminated + --> $DIR/respanned-literal-issue-106191.rs:8:18 + | +LL | format_args!(r#concat!("¡ {")); + | 
^^^^^^^^^^^^^^^^^^^^^^^ expected `'}'` in format string + | + = note: if you intended to print `{`, you can escape it using `{{` + = note: this error originates in the macro `concat` (in Nightly builds, run with -Z macro-backtrace for more info) + +error: aborting due to 2 previous errors + From 31b490d8ba8ff60b9d9ee3ccca522629429d9a3f Mon Sep 17 00:00:00 2001 From: Nilstrieb <48135649+Nilstrieb@users.noreply.github.com> Date: Tue, 27 Dec 2022 23:00:03 +0100 Subject: [PATCH 3/3] Add enum for `find_width_map_from_snippet` This makes the relationship between the vec and the boolean clearer. --- compiler/rustc_parse_format/src/lib.rs | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/compiler/rustc_parse_format/src/lib.rs b/compiler/rustc_parse_format/src/lib.rs index 84243e53fafe0..9f2aaca0acffe 100644 --- a/compiler/rustc_parse_format/src/lib.rs +++ b/compiler/rustc_parse_format/src/lib.rs @@ -57,6 +57,13 @@ impl InnerWidthMapping { } } +/// Whether the input string is a literal. If yes, it contains the inner width mappings. +#[derive(Clone, PartialEq, Eq)] +enum InputStringKind { + NotALiteral, + Literal { width_mappings: Vec }, +} + /// The type of format string that we are parsing. 
#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum ParseMode { @@ -307,7 +314,11 @@ impl<'a> Parser<'a> { append_newline: bool, mode: ParseMode, ) -> Parser<'a> { - let (width_map, is_literal) = find_width_map_from_snippet(s, snippet, style); + let input_string_kind = find_width_map_from_snippet(s, snippet, style); + let (width_map, is_literal) = match input_string_kind { + InputStringKind::Literal { width_mappings } => (width_mappings, true), + InputStringKind::NotALiteral => (Vec::new(), false), + }; Parser { mode, input: s, @@ -848,14 +859,14 @@ fn find_width_map_from_snippet( input: &str, snippet: Option, str_style: Option, -) -> (Vec, bool) { +) -> InputStringKind { let snippet = match snippet { Some(ref s) if s.starts_with('"') || s.starts_with("r\"") || s.starts_with("r#") => s, - _ => return (vec![], false), + _ => return InputStringKind::NotALiteral, }; if str_style.is_some() { - return (vec![], true); + return InputStringKind::Literal { width_mappings: Vec::new() }; } // Strip quotes. @@ -868,7 +879,7 @@ fn find_width_map_from_snippet( // Alternatively, we could just count the trailing newlines and only trim one from the input if they don't match up. let input_no_nl = input.trim_end_matches('\n'); let Ok(unescaped) = unescape_string(snippet) else { - return (vec![], false); + return InputStringKind::NotALiteral; }; let unescaped_no_nl = unescaped.trim_end_matches('\n'); @@ -876,7 +887,7 @@ fn find_width_map_from_snippet( if unescaped_no_nl != input_no_nl { // The source string that we're pointing at isn't our input, so spans pointing at it will be incorrect. // This can for example happen with proc macros that respan generated literals. - return (vec![], false); + return InputStringKind::NotALiteral; } let mut s = snippet.char_indices(); @@ -958,7 +969,7 @@ fn find_width_map_from_snippet( } } - (width_mappings, true) + InputStringKind::Literal { width_mappings } } fn unescape_string(string: &str) -> Result {