From 8ff3903643b530c9029e8f2c6c6956fda8f21d77 Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Sun, 5 Mar 2023 15:03:22 +0000 Subject: [PATCH 01/10] initial step towards implementing C string literals --- compiler/rustc_ast/src/ast.rs | 3 + compiler/rustc_ast/src/token.rs | 7 + compiler/rustc_ast/src/util/literal.rs | 55 ++++- compiler/rustc_ast_pretty/src/pprust/state.rs | 2 + compiler/rustc_builtin_macros/src/concat.rs | 4 + .../rustc_builtin_macros/src/concat_bytes.rs | 4 + .../rustc_expand/src/proc_macro_server.rs | 4 + compiler/rustc_hir/src/lang_items.rs | 1 + .../rustc_hir_typeck/src/fn_ctxt/checks.rs | 5 + compiler/rustc_lexer/src/lib.rs | 30 +++ compiler/rustc_lexer/src/unescape.rs | 199 +++++++++++------- compiler/rustc_parse/src/lexer/mod.rs | 64 ++++++ library/core/src/ffi/c_str.rs | 1 + src/librustdoc/html/highlight.rs | 4 +- .../src/matches/match_same_arms.rs | 1 + .../clippy/clippy_lints/src/utils/author.rs | 5 + src/tools/clippy/clippy_utils/src/consts.rs | 1 + 17 files changed, 310 insertions(+), 80 deletions(-) diff --git a/compiler/rustc_ast/src/ast.rs b/compiler/rustc_ast/src/ast.rs index ea04ba4f66e46..fb22e4640647d 100644 --- a/compiler/rustc_ast/src/ast.rs +++ b/compiler/rustc_ast/src/ast.rs @@ -1814,6 +1814,8 @@ pub enum LitKind { /// A byte string (`b"foo"`). Not stored as a symbol because it might be /// non-utf8, and symbols only allow utf8 strings. ByteStr(Lrc<[u8]>, StrStyle), + /// A C String (`c"foo"`). + CStr(Lrc<[u8]>, StrStyle), /// A byte char (`b'f'`). Byte(u8), /// A character literal (`'a'`). @@ -1868,6 +1870,7 @@ impl LitKind { // unsuffixed variants LitKind::Str(..) | LitKind::ByteStr(..) + | LitKind::CStr(..) | LitKind::Byte(..) | LitKind::Char(..) | LitKind::Int(_, LitIntType::Unsuffixed) diff --git a/compiler/rustc_ast/src/token.rs b/compiler/rustc_ast/src/token.rs index f947ae4d05732..42b843482a32b 100644 --- a/compiler/rustc_ast/src/token.rs +++ b/compiler/rustc_ast/src/token.rs @@ -74,6 +74,8 @@ pub enum LitKind { StrRaw(u8), // raw string delimited by `n` hash symbols ByteStr, ByteStrRaw(u8), // raw byte string delimited by `n` hash symbols + CStr, + CStrRaw(u8), Err, } @@ -141,6 +143,10 @@ impl fmt::Display for Lit { delim = "#".repeat(n as usize), string = symbol )?, + CStr => write!(f, "c\"{symbol}\"")?, + CStrRaw(n) => { + write!(f, "cr{delim}\"{symbol}\"{delim}", delim = "#".repeat(n as usize))? + } Integer | Float | Bool | Err => write!(f, "{symbol}")?, } @@ -170,6 +176,7 @@ impl LitKind { Float => "float", Str | StrRaw(..) => "string", ByteStr | ByteStrRaw(..) => "byte string", + CStr | CStrRaw(..) => "C string", Err => "error", } } diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 74b842ac96eac..8534011e18921 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -2,7 +2,10 @@ use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; use crate::token::{self, Token}; -use rustc_lexer::unescape::{byte_from_char, unescape_byte, unescape_char, unescape_literal, Mode}; +use rustc_lexer::unescape::{ + byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, + Mode, +}; use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::Span; use std::{ascii, fmt, str}; @@ -158,6 +161,52 @@ impl LitKind { LitKind::ByteStr(bytes.into(), StrStyle::Raw(n)) } + token::CStr => { + let s = symbol.as_str(); + let mut buf = Vec::with_capacity(s.len()); + let mut error = Ok(()); + unescape_c_string(s, Mode::CStr, &mut |span, c| match c { + Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => { + error = Err(LitError::NulInCStr(span)); + } + Ok(CStrUnit::Byte(b)) => buf.push(b), + Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8), + Ok(CStrUnit::Char(c)) => { + buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) + } + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); + } + } + }); + error?; + buf.push(b'\0'); + LitKind::CStr(buf.into(), StrStyle::Cooked) + } + token::CStrRaw(n) => { + let s = symbol.as_str(); + let mut buf = Vec::with_capacity(s.len()); + let mut error = Ok(()); + unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c { + Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => { + error = Err(LitError::NulInCStr(span)); + } + Ok(CStrUnit::Byte(b)) => buf.push(b), + Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8), + Ok(CStrUnit::Char(c)) => { + buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) + } + Err(err) => { + if err.is_fatal() { + error = Err(LitError::LexerError); + } + } + }); + error?; + buf.push(b'\0'); + LitKind::CStr(buf.into(), StrStyle::Raw(n)) + } token::Err => LitKind::Err, }) } @@ -191,6 +240,8 @@ impl fmt::Display for LitKind { string = symbol )?; } + // TODO need to reescape + LitKind::CStr(..) => todo!(), LitKind::Int(n, ty) => { write!(f, "{n}")?; match ty { @@ -237,6 +288,8 @@ impl MetaItemLit { LitKind::Str(_, ast::StrStyle::Raw(n)) => token::StrRaw(n), LitKind::ByteStr(_, ast::StrStyle::Cooked) => token::ByteStr, LitKind::ByteStr(_, ast::StrStyle::Raw(n)) => token::ByteStrRaw(n), + LitKind::CStr(_, ast::StrStyle::Cooked) => token::CStr, + LitKind::CStr(_, ast::StrStyle::Raw(n)) => token::CStrRaw(n), LitKind::Byte(_) => token::Byte, LitKind::Char(_) => token::Char, LitKind::Int(..) => token::Integer, diff --git a/compiler/rustc_ast_pretty/src/pprust/state.rs b/compiler/rustc_ast_pretty/src/pprust/state.rs index 849336c8669a1..535ac89e751d5 100644 --- a/compiler/rustc_ast_pretty/src/pprust/state.rs +++ b/compiler/rustc_ast_pretty/src/pprust/state.rs @@ -210,6 +210,8 @@ pub fn literal_to_string(lit: token::Lit) -> String { token::ByteStrRaw(n) => { format!("br{delim}\"{string}\"{delim}", delim = "#".repeat(n as usize), string = symbol) } + // TODO + token::CStr | token::CStrRaw(_) => todo!(), token::Integer | token::Float | token::Bool | token::Err => symbol.to_string(), }; diff --git a/compiler/rustc_builtin_macros/src/concat.rs b/compiler/rustc_builtin_macros/src/concat.rs index b92964d03e9f9..50e88ae2eeede 100644 --- a/compiler/rustc_builtin_macros/src/concat.rs +++ b/compiler/rustc_builtin_macros/src/concat.rs @@ -32,6 +32,10 @@ pub fn expand_concat( Ok(ast::LitKind::Bool(b)) => { accumulator.push_str(&b.to_string()); } + Ok(ast::LitKind::CStr(..)) => { + cx.span_err(e.span, "cannot concatenate a C string literal"); + has_errors = true; + } Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => { cx.emit_err(errors::ConcatBytestr { span: e.span }); has_errors = true; diff --git a/compiler/rustc_builtin_macros/src/concat_bytes.rs b/compiler/rustc_builtin_macros/src/concat_bytes.rs index ba639c0a9fe3c..ae674995e4294 100644 --- a/compiler/rustc_builtin_macros/src/concat_bytes.rs +++ b/compiler/rustc_builtin_macros/src/concat_bytes.rs @@ -18,6 +18,10 @@ fn invalid_type_err( }; let snippet = cx.sess.source_map().span_to_snippet(span).ok(); match ast::LitKind::from_token_lit(token_lit) { + Ok(ast::LitKind::CStr(_, _)) => { + // TODO + cx.span_err(span, "cannot concatenate C string litearls"); + } Ok(ast::LitKind::Char(_)) => { let sugg = snippet.map(|snippet| ConcatBytesInvalidSuggestion::CharLit { span, snippet }); diff --git a/compiler/rustc_expand/src/proc_macro_server.rs b/compiler/rustc_expand/src/proc_macro_server.rs index 1e7d07bc22d52..04bdea273ebd5 100644 --- a/compiler/rustc_expand/src/proc_macro_server.rs +++ b/compiler/rustc_expand/src/proc_macro_server.rs @@ -61,6 +61,8 @@ impl FromInternal for LitKind { token::StrRaw(n) => LitKind::StrRaw(n), token::ByteStr => LitKind::ByteStr, token::ByteStrRaw(n) => LitKind::ByteStrRaw(n), + // TODO + token::CStr | token::CStrRaw(_) => todo!(), token::Err => LitKind::Err, token::Bool => unreachable!(), } @@ -436,6 +438,8 @@ impl server::FreeFunctions for Rustc<'_, '_> { | token::LitKind::StrRaw(_) | token::LitKind::ByteStr | token::LitKind::ByteStrRaw(_) + | token::LitKind::CStr + | token::LitKind::CStrRaw(_) | token::LitKind::Err => return Err(()), token::LitKind::Integer | token::LitKind::Float => {} } diff --git a/compiler/rustc_hir/src/lang_items.rs b/compiler/rustc_hir/src/lang_items.rs index e1c030d3e198a..7ddafc9083a15 100644 --- a/compiler/rustc_hir/src/lang_items.rs +++ b/compiler/rustc_hir/src/lang_items.rs @@ -332,6 +332,7 @@ language_item_table! { RangeTo, sym::RangeTo, range_to_struct, Target::Struct, GenericRequirement::None; String, sym::String, string, Target::Struct, GenericRequirement::None; + CStr, sym::CStr, c_str, Target::Struct, GenericRequirement::None; } pub enum GenericRequirement { diff --git a/compiler/rustc_hir_typeck/src/fn_ctxt/checks.rs b/compiler/rustc_hir_typeck/src/fn_ctxt/checks.rs index f42c825d9e8b1..374266638d160 100644 --- a/compiler/rustc_hir_typeck/src/fn_ctxt/checks.rs +++ b/compiler/rustc_hir_typeck/src/fn_ctxt/checks.rs @@ -1300,6 +1300,11 @@ impl<'a, 'tcx> FnCtxt<'a, 'tcx> { opt_ty.unwrap_or_else(|| self.next_float_var()) } ast::LitKind::Bool(_) => tcx.types.bool, + ast::LitKind::CStr(_, _) => tcx.mk_imm_ref( + tcx.lifetimes.re_static, + tcx.type_of(tcx.require_lang_item(hir::LangItem::CStr, Some(lit.span))) + .skip_binder(), + ), ast::LitKind::Err => tcx.ty_error_misc(), } } diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index b3f4b5cd5e5a0..95cb1f93e3906 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -186,12 +186,16 @@ pub enum LiteralKind { Str { terminated: bool }, /// "b"abc"", "b"abc" ByteStr { terminated: bool }, + /// `c"abc"`, `c"abc` + CStr { terminated: bool }, /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a". `None` indicates /// an invalid literal. RawStr { n_hashes: Option }, /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None` /// indicates an invalid literal. RawByteStr { n_hashes: Option }, + /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` is invalid. + RawCStr { n_hashes: Option }, } #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -391,6 +395,32 @@ impl Cursor<'_> { _ => self.ident_or_unknown_prefix(), }, + // TODO deduplicate this code + // c-string literal, raw c-string literal or identifier. + 'c' => match (self.first(), self.second()) { + ('"', _) => { + self.bump(); + let terminated = self.double_quoted_string(); + let suffix_start = self.pos_within_token(); + if terminated { + self.eat_literal_suffix(); + } + let kind = CStr { terminated }; + Literal { kind, suffix_start } + } + ('r', '"') | ('r', '#') => { + self.bump(); + let res = self.raw_double_quoted_string(2); + let suffix_start = self.pos_within_token(); + if res.is_ok() { + self.eat_literal_suffix(); + } + let kind = RawCStr { n_hashes: res.ok() }; + Literal { kind, suffix_start } + } + _ => self.ident_or_unknown_prefix(), + }, + // Identifier (this should be checked after other variant that can // start as identifier). c if is_id_start(c) => self.ident_or_unknown_prefix(), diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index bb4d91247b81d..4b707c9ec9619 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -90,6 +90,39 @@ where Mode::RawStr | Mode::RawByteStr => { unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) } + Mode::CStr | Mode::RawCStr => unreachable!(), + } +} + +pub enum CStrUnit { + Byte(u8), + Char(char), +} + +impl From for CStrUnit { + fn from(value: u8) -> Self { + CStrUnit::Byte(value) + } +} + +impl From for CStrUnit { + fn from(value: char) -> Self { + CStrUnit::Char(value) + } +} + +pub fn unescape_c_string(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + if mode == Mode::RawCStr { + unescape_raw_str_or_raw_byte_str( + src, + mode.characters_should_be_ascii(), + &mut |r, result| callback(r, result.map(CStrUnit::Char)), + ); + } else { + unescape_str_common(src, mode, callback); } } @@ -114,19 +147,26 @@ pub enum Mode { ByteStr, RawStr, RawByteStr, + CStr, + RawCStr, } impl Mode { pub fn in_double_quotes(self) -> bool { match self { - Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true, + Mode::Str + | Mode::ByteStr + | Mode::RawStr + | Mode::RawByteStr + | Mode::CStr + | Mode::RawCStr => true, Mode::Char | Mode::Byte => false, } } pub fn is_byte(self) -> bool { match self { - Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, + Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => true, Mode::Char | Mode::Str | Mode::RawStr => false, } } @@ -163,62 +203,63 @@ fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result { - // We've parsed '\u', now we have to parse '{..}'. + 'u' => scan_unicode(chars, is_byte)?, + _ => return Err(EscapeError::InvalidEscape), + }; + Ok(res) +} - if chars.next() != Some('{') { - return Err(EscapeError::NoBraceInUnicodeEscape); - } +fn scan_unicode(chars: &mut Chars<'_>, is_byte: bool) -> Result { + // We've parsed '\u', now we have to parse '{..}'. - // First character must be a hexadecimal digit. - let mut n_digits = 1; - let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { - '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), - '}' => return Err(EscapeError::EmptyUnicodeEscape), - c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, - }; - - // First character is valid, now parse the rest of the number - // and closing brace. - loop { - match chars.next() { - None => return Err(EscapeError::UnclosedUnicodeEscape), - Some('_') => continue, - Some('}') => { - if n_digits > 6 { - return Err(EscapeError::OverlongUnicodeEscape); - } - - // Incorrect syntax has higher priority for error reporting - // than unallowed value for a literal. - if is_byte { - return Err(EscapeError::UnicodeEscapeInByte); - } - - break std::char::from_u32(value).ok_or_else(|| { - if value > 0x10FFFF { - EscapeError::OutOfRangeUnicodeEscape - } else { - EscapeError::LoneSurrogateUnicodeEscape - } - })?; - } - Some(c) => { - let digit: u32 = - c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; - n_digits += 1; - if n_digits > 6 { - // Stop updating value since we're sure that it's incorrect already. - continue; - } - value = value * 16 + digit; + if chars.next() != Some('{') { + return Err(EscapeError::NoBraceInUnicodeEscape); + } + + // First character must be a hexadecimal digit. + let mut n_digits = 1; + let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { + '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), + '}' => return Err(EscapeError::EmptyUnicodeEscape), + c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, + }; + + // First character is valid, now parse the rest of the number + // and closing brace. + loop { + match chars.next() { + None => return Err(EscapeError::UnclosedUnicodeEscape), + Some('_') => continue, + Some('}') => { + if n_digits > 6 { + return Err(EscapeError::OverlongUnicodeEscape); + } + + // Incorrect syntax has higher priority for error reporting + // than unallowed value for a literal. + if is_byte { + return Err(EscapeError::UnicodeEscapeInByte); + } + + break std::char::from_u32(value).ok_or_else(|| { + if value > 0x10FFFF { + EscapeError::OutOfRangeUnicodeEscape + } else { + EscapeError::LoneSurrogateUnicodeEscape } - }; + }); } - } - _ => return Err(EscapeError::InvalidEscape), - }; - Ok(res) + Some(c) => { + let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; + n_digits += 1; + if n_digits > 6 { + // Stop updating value since we're sure that it's incorrect already. + continue; + } + value = value * 16 + digit; + } + }; + } } #[inline] @@ -266,7 +307,9 @@ where // if unescaped '\' character is followed by '\n'. // For details see [Rust language reference] // (https://doc.rust-lang.org/reference/tokens.html#string-literals). - skip_ascii_whitespace(&mut chars, start, callback); + skip_ascii_whitespace(&mut chars, start, &mut |range, err| { + callback(range, Err(err)) + }); continue; } _ => scan_escape(&mut chars, is_byte), @@ -281,32 +324,32 @@ where let end = src.len() - chars.as_str().len(); callback(start..end, res); } +} - fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) - where - F: FnMut(Range, Result), - { - let tail = chars.as_str(); - let first_non_space = tail - .bytes() - .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') - .unwrap_or(tail.len()); - if tail[1..first_non_space].contains('\n') { - // The +1 accounts for the escaping slash. - let end = start + first_non_space + 1; - callback(start..end, Err(EscapeError::MultipleSkippedLinesWarning)); - } - let tail = &tail[first_non_space..]; - if let Some(c) = tail.chars().nth(0) { - if c.is_whitespace() { - // For error reporting, we would like the span to contain the character that was not - // skipped. The +1 is necessary to account for the leading \ that started the escape. - let end = start + first_non_space + c.len_utf8() + 1; - callback(start..end, Err(EscapeError::UnskippedWhitespaceWarning)); - } +fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) +where + F: FnMut(Range, EscapeError), +{ + let tail = chars.as_str(); + let first_non_space = tail + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(tail.len()); + if tail[1..first_non_space].contains('\n') { + // The +1 accounts for the escaping slash. + let end = start + first_non_space + 1; + callback(start..end, EscapeError::MultipleSkippedLinesWarning); + } + let tail = &tail[first_non_space..]; + if let Some(c) = tail.chars().nth(0) { + if c.is_whitespace() { + // For error reporting, we would like the span to contain the character that was not + // skipped. The +1 is necessary to account for the leading \ that started the escape. + let end = start + first_non_space + c.len_utf8() + 1; + callback(start..end, EscapeError::UnskippedWhitespaceWarning); } - *chars = tail.chars(); } + *chars = tail.chars(); } /// Takes a contents of a string literal (without quotes) and produces a diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index a4a75fcb96995..b2050780309f8 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -415,6 +415,16 @@ impl<'a> StringReader<'a> { } self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " } + rustc_lexer::LiteralKind::CStr { terminated } => { + if !terminated { + self.sess.span_diagnostic.span_fatal_with_code( + self.mk_sp(start + BytePos(1), end), + "unterminated C string", + error_code!(E0767), + ) + } + self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" " + } rustc_lexer::LiteralKind::RawStr { n_hashes } => { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); @@ -433,6 +443,15 @@ impl<'a> StringReader<'a> { self.report_raw_str_error(start, 2); } } + rustc_lexer::LiteralKind::RawCStr { n_hashes } => { + if let Some(n_hashes) = n_hashes { + let n = u32::from(n_hashes); + let kind = token::CStrRaw(n_hashes); + self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "## + } else { + self.report_raw_str_error(start, 2); + } + } rustc_lexer::LiteralKind::Int { base, empty_int } => { if empty_int { let span = self.mk_sp(start, end); @@ -692,6 +711,51 @@ impl<'a> StringReader<'a> { (token::Err, self.symbol_from_to(start, end)) } } + + fn cook_c_string( + &self, + kind: token::LitKind, + mode: Mode, + start: BytePos, + end: BytePos, + prefix_len: u32, + postfix_len: u32, + ) -> (token::LitKind, Symbol) { + let mut has_fatal_err = false; + let content_start = start + BytePos(prefix_len); + let content_end = end - BytePos(postfix_len); + let lit_content = self.str_from_to(content_start, content_end); + unescape::unescape_c_string(lit_content, mode, &mut |range, result| { + // Here we only check for errors. The actual unescaping is done later. + if let Err(err) = result { + let span_with_quotes = self.mk_sp(start, end); + let (start, end) = (range.start as u32, range.end as u32); + let lo = content_start + BytePos(start); + let hi = lo + BytePos(end - start); + let span = self.mk_sp(lo, hi); + if err.is_fatal() { + has_fatal_err = true; + } + emit_unescape_error( + &self.sess.span_diagnostic, + lit_content, + span_with_quotes, + span, + mode, + range, + err, + ); + } + }); + + // We normally exclude the quotes for the symbol, but for errors we + // include it because it results in clearer error messages. + if !has_fatal_err { + (kind, Symbol::intern(lit_content)) + } else { + (token::Err, self.symbol_from_to(start, end)) + } + } } pub fn nfc_normalize(string: &str) -> Symbol { diff --git a/library/core/src/ffi/c_str.rs b/library/core/src/ffi/c_str.rs index bd2b2c36c4315..2ac679b6bc347 100644 --- a/library/core/src/ffi/c_str.rs +++ b/library/core/src/ffi/c_str.rs @@ -82,6 +82,7 @@ use crate::str; #[cfg_attr(not(test), rustc_diagnostic_item = "CStr")] #[stable(feature = "core_c_str", since = "1.64.0")] #[rustc_has_incoherent_inherent_impls] +#[cfg_attr(not(bootstrap), lang = "CStr")] // FIXME: // `fn from` in `impl From<&CStr> for Box` current implementation relies // on `CStr` being layout-compatible with `[u8]`. diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 946c85a205f5a..c94968b4817cb 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -811,7 +811,9 @@ impl<'src> Classifier<'src> { | LiteralKind::Str { .. } | LiteralKind::ByteStr { .. } | LiteralKind::RawStr { .. } - | LiteralKind::RawByteStr { .. } => Class::String, + | LiteralKind::RawByteStr { .. } + | LiteralKind::CStr { .. } + | LiteralKind::RawCStr { .. } => Class::String, // Number literals. LiteralKind::Float { .. } | LiteralKind::Int { .. } => Class::Number, }, diff --git a/src/tools/clippy/clippy_lints/src/matches/match_same_arms.rs b/src/tools/clippy/clippy_lints/src/matches/match_same_arms.rs index 158e6caa4de54..a48f4c77f857f 100644 --- a/src/tools/clippy/clippy_lints/src/matches/match_same_arms.rs +++ b/src/tools/clippy/clippy_lints/src/matches/match_same_arms.rs @@ -284,6 +284,7 @@ impl<'a> NormalizedPat<'a> { LitKind::Str(sym, _) => Self::LitStr(sym), LitKind::ByteStr(ref bytes, _) => Self::LitBytes(bytes), LitKind::Byte(val) => Self::LitInt(val.into()), + LitKind::CStr(ref bytes, _) => Self::LitBytes(bytes), LitKind::Char(val) => Self::LitInt(val.into()), LitKind::Int(val, _) => Self::LitInt(val), LitKind::Bool(val) => Self::LitBool(val), diff --git a/src/tools/clippy/clippy_lints/src/utils/author.rs b/src/tools/clippy/clippy_lints/src/utils/author.rs index 01927b6b5f10d..f75dff46624e4 100644 --- a/src/tools/clippy/clippy_lints/src/utils/author.rs +++ b/src/tools/clippy/clippy_lints/src/utils/author.rs @@ -304,6 +304,11 @@ impl<'a, 'tcx> PrintVisitor<'a, 'tcx> { kind!("ByteStr(ref {vec})"); chain!(self, "let [{:?}] = **{vec}", vec.value); }, + LitKind::CStr(ref vec, _) => { + bind!(self, vec); + kind!("CStr(ref {vec})"); + chain!(self, "let [{:?}] = **{vec}", vec.value); + } LitKind::Str(s, _) => { bind!(self, s); kind!("Str({s}, _)"); diff --git a/src/tools/clippy/clippy_utils/src/consts.rs b/src/tools/clippy/clippy_utils/src/consts.rs index 99bfc4b5717c8..7c7ec6d334d9b 100644 --- a/src/tools/clippy/clippy_utils/src/consts.rs +++ b/src/tools/clippy/clippy_utils/src/consts.rs @@ -211,6 +211,7 @@ pub fn lit_to_mir_constant(lit: &LitKind, ty: Option>) -> Constant { LitKind::Str(ref is, _) => Constant::Str(is.to_string()), LitKind::Byte(b) => Constant::Int(u128::from(b)), LitKind::ByteStr(ref s, _) => Constant::Binary(Lrc::clone(s)), + LitKind::CStr(ref s, _) => Constant::Binary(Lrc::clone(s)), LitKind::Char(c) => Constant::Char(c), LitKind::Int(n, _) => Constant::Int(n), LitKind::Float(ref is, LitFloatType::Suffixed(fty)) => match fty { From 76d1f93896fb642cd27cbe8ef481b66e974dbdf9 Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Mon, 6 Mar 2023 07:10:23 +0000 Subject: [PATCH 02/10] update and add a few tests --- compiler/rustc_ast/src/ast.rs | 2 +- compiler/rustc_ast/src/util/literal.rs | 4 ++-- compiler/rustc_ast_passes/src/feature_gate.rs | 1 + compiler/rustc_feature/src/active.rs | 2 ++ .../src/build/expr/as_constant.rs | 6 +++++ compiler/rustc_span/src/symbol.rs | 1 + .../rfcs/rfc-3348-c-string-literals/basic.rs | 7 ++++++ .../rfcs/rfc-3348-c-string-literals/gate.rs | 7 ++++++ .../rfc-3348-c-string-literals/gate.stderr | 21 ++++++++++++++++++ .../rfc-3348-c-string-literals/no-nuls.rs | Bin 0 -> 322 bytes .../rfc-3348-c-string-literals/no-nuls.stderr | Bin 0 -> 670 bytes 11 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs create mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/gate.rs create mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr create mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs create mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.stderr diff --git a/compiler/rustc_ast/src/ast.rs b/compiler/rustc_ast/src/ast.rs index fb22e4640647d..fb90d309fcde7 100644 --- a/compiler/rustc_ast/src/ast.rs +++ b/compiler/rustc_ast/src/ast.rs @@ -1814,7 +1814,7 @@ pub enum LitKind { /// A byte string (`b"foo"`). Not stored as a symbol because it might be /// non-utf8, and symbols only allow utf8 strings. ByteStr(Lrc<[u8]>, StrStyle), - /// A C String (`c"foo"`). + /// A C String (`c"foo"`). Guaranteed only have `\0` in the end. CStr(Lrc<[u8]>, StrStyle), /// A byte char (`b'f'`). Byte(u8), diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 8534011e18921..5fa0ea354550a 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -181,7 +181,7 @@ impl LitKind { } }); error?; - buf.push(b'\0'); + buf.push(0); LitKind::CStr(buf.into(), StrStyle::Cooked) } token::CStrRaw(n) => { @@ -204,7 +204,7 @@ impl LitKind { } }); error?; - buf.push(b'\0'); + buf.push(0); LitKind::CStr(buf.into(), StrStyle::Raw(n)) } token::Err => LitKind::Err, diff --git a/compiler/rustc_ast_passes/src/feature_gate.rs b/compiler/rustc_ast_passes/src/feature_gate.rs index 17bcd24ee39fd..9defc6603e837 100644 --- a/compiler/rustc_ast_passes/src/feature_gate.rs +++ b/compiler/rustc_ast_passes/src/feature_gate.rs @@ -1,3 +1,4 @@ +use ast::token; use rustc_ast as ast; use rustc_ast::visit::{self, AssocCtxt, FnCtxt, FnKind, Visitor}; use rustc_ast::{attr, AssocConstraint, AssocConstraintKind, NodeId}; diff --git a/compiler/rustc_feature/src/active.rs b/compiler/rustc_feature/src/active.rs index 6201e5b619b87..45f462a63ee8c 100644 --- a/compiler/rustc_feature/src/active.rs +++ b/compiler/rustc_feature/src/active.rs @@ -313,6 +313,8 @@ declare_features! ( (active, async_fn_in_trait, "1.66.0", Some(91611), None), /// Treat `extern "C"` function as nounwind. (active, c_unwind, "1.52.0", Some(74990), None), + /// Allows `c"foo"` literals. + (active, c_str_literals, "CURRENT_RUSTC_VERSION", Some(105723), None), /// Allows using C-variadics. (active, c_variadic, "1.34.0", Some(44930), None), /// Allows the use of `#[cfg(sanitize = "option")]`; set when -Zsanitizer is used. diff --git a/compiler/rustc_mir_build/src/build/expr/as_constant.rs b/compiler/rustc_mir_build/src/build/expr/as_constant.rs index fbcfd43372433..59549435233c5 100644 --- a/compiler/rustc_mir_build/src/build/expr/as_constant.rs +++ b/compiler/rustc_mir_build/src/build/expr/as_constant.rs @@ -146,6 +146,12 @@ pub(crate) fn lit_to_mir_constant<'tcx>( let id = tcx.allocate_bytes(data); ConstValue::Scalar(Scalar::from_pointer(id.into(), &tcx)) } + (ast::LitKind::CStr(data, _), ty::Ref(_, inner_ty, _)) if matches!(inner_ty.kind(), ty::Adt(def, _) if Some(def.did()) == tcx.lang_items().c_str()) => + { + let allocation = Allocation::from_bytes_byte_aligned_immutable(data as &[u8]); + let allocation = tcx.mk_const_alloc(allocation); + ConstValue::Slice { data: allocation, start: 0, end: data.len() } + } (ast::LitKind::Byte(n), ty::Uint(ty::UintTy::U8)) => { ConstValue::Scalar(Scalar::from_uint(*n, Size::from_bytes(1))) } diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 7969b848fd956..f83aa504b90e6 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -441,6 +441,7 @@ symbols! { bridge, bswap, c_str, + c_str_literals, c_unwind, c_variadic, call, diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs new file mode 100644 index 0000000000000..e4b07ab8108e0 --- /dev/null +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/basic.rs @@ -0,0 +1,7 @@ +// run-pass + +#![feature(c_str_literals)] + +fn main() { + assert_eq!(b"test\0", c"test".to_bytes_with_nul()); +} diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/gate.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.rs new file mode 100644 index 0000000000000..674b0c5e23351 --- /dev/null +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.rs @@ -0,0 +1,7 @@ +fn main() { + c"foo"; + //~^ ERROR: `c".."` literals are experimental + + m!(c"test"); + //~^ ERROR: `c".."` literals are experimental +} diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr new file mode 100644 index 0000000000000..bc0c537aada83 --- /dev/null +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.stderr @@ -0,0 +1,21 @@ +error[E0658]: `c".."` literals are experimental + --> $DIR/gate.rs:8:5 + | +LL | c"foo"; + | ^^^^^^ + | + = note: see issue #105723 for more information + = help: add `#![feature(c_str_literals)]` to the crate attributes to enable + +error[E0658]: `c".."` literals are experimental + --> $DIR/gate.rs:11:8 + | +LL | m!(c"test"); + | ^^^^^^^ + | + = note: see issue #105723 for more information + = help: add `#![feature(c_str_literals)]` to the crate attributes to enable + +error: aborting due to 2 previous errors + +For more information about this error, try `rustc --explain E0658`. diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs new file mode 100644 index 0000000000000000000000000000000000000000..f6c86a1ba8762358537827376cbffc5c3f4508e9 GIT binary patch literal 322 zcmY#Zj802UEGaEY)kuynE-8x7$t+1NO3W$NjOF4=%Tvfr%*@l!RH)`s0D@$t7y~71 zFjHT@E>6KUD9As^N+GW_Cr2SUBe5tk8K_qwGf%-;0cccaUOE>{TWPg{K`mKY8OYKV QQ(-`sON*39w6B&60N$!x!TeXXdftt2!GI~Yrix-cM3w;1ma$Enh(Pc#!UVj(ggLC5kOomYT5DR{C8&R_#& z`K>OKf1}P$Lfcgd?nWL;VFqm$4^_YPMWO$ VFr1x))mY+2^4@4?5MIMD_6Dmm#F_vA literal 0 HcmV?d00001 From a49570fd20a16c0a41b1dfdaf121ef69f60acd7e Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Mon, 6 Mar 2023 07:42:04 +0000 Subject: [PATCH 03/10] fix TODO comments --- compiler/rustc_ast/src/util/literal.rs | 10 +- compiler/rustc_ast_pretty/src/pprust/state.rs | 6 +- .../rustc_builtin_macros/src/concat_bytes.rs | 5 +- .../rustc_expand/src/proc_macro_server.rs | 6 +- compiler/rustc_feature/src/active.rs | 4 +- compiler/rustc_lexer/src/lib.rs | 108 +++++++++--------- library/proc_macro/src/bridge/mod.rs | 4 + .../rfcs/rfc-3348-c-string-literals/gate.rs | 6 + 8 files changed, 82 insertions(+), 67 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 5fa0ea354550a..cd3b163e3ac97 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -240,8 +240,14 @@ impl fmt::Display for LitKind { string = symbol )?; } - // TODO need to reescape - LitKind::CStr(..) => todo!(), + LitKind::CStr(ref bytes, StrStyle::Cooked) => { + write!(f, "c\"{}\"", escape_byte_str_symbol(bytes))? + } + LitKind::CStr(ref bytes, StrStyle::Raw(n)) => { + // This can only be valid UTF-8. + let symbol = str::from_utf8(bytes).unwrap(); + write!(f, "cr{delim}\"{symbol}\"{delim}", delim = "#".repeat(n as usize),)?; + } LitKind::Int(n, ty) => { write!(f, "{n}")?; match ty { diff --git a/compiler/rustc_ast_pretty/src/pprust/state.rs b/compiler/rustc_ast_pretty/src/pprust/state.rs index 535ac89e751d5..61b7863c686cf 100644 --- a/compiler/rustc_ast_pretty/src/pprust/state.rs +++ b/compiler/rustc_ast_pretty/src/pprust/state.rs @@ -210,8 +210,10 @@ pub fn literal_to_string(lit: token::Lit) -> String { token::ByteStrRaw(n) => { format!("br{delim}\"{string}\"{delim}", delim = "#".repeat(n as usize), string = symbol) } - // TODO - token::CStr | token::CStrRaw(_) => todo!(), + token::CStr => format!("c\"{symbol}\""), + token::CStrRaw(n) => { + format!("cr{delim}\"{symbol}\"{delim}", delim = "#".repeat(n as usize)) + } token::Integer | token::Float | token::Bool | token::Err => symbol.to_string(), }; diff --git a/compiler/rustc_builtin_macros/src/concat_bytes.rs b/compiler/rustc_builtin_macros/src/concat_bytes.rs index ae674995e4294..5ef35af0a059a 100644 --- a/compiler/rustc_builtin_macros/src/concat_bytes.rs +++ b/compiler/rustc_builtin_macros/src/concat_bytes.rs @@ -19,8 +19,9 @@ fn invalid_type_err( let snippet = cx.sess.source_map().span_to_snippet(span).ok(); match ast::LitKind::from_token_lit(token_lit) { Ok(ast::LitKind::CStr(_, _)) => { - // TODO - cx.span_err(span, "cannot concatenate C string litearls"); + // FIXME(c_str_literals): should concatenation of C string literals + // include the null bytes in the end? + cx.span_err(span, "cannot concatenate C string literals"); } Ok(ast::LitKind::Char(_)) => { let sugg = diff --git a/compiler/rustc_expand/src/proc_macro_server.rs b/compiler/rustc_expand/src/proc_macro_server.rs index 04bdea273ebd5..891e84a2f3071 100644 --- a/compiler/rustc_expand/src/proc_macro_server.rs +++ b/compiler/rustc_expand/src/proc_macro_server.rs @@ -61,8 +61,8 @@ impl FromInternal for LitKind { token::StrRaw(n) => LitKind::StrRaw(n), token::ByteStr => LitKind::ByteStr, token::ByteStrRaw(n) => LitKind::ByteStrRaw(n), - // TODO - token::CStr | token::CStrRaw(_) => todo!(), + token::CStr => LitKind::CStr, + token::CStrRaw(n) => LitKind::CStrRaw(n), token::Err => LitKind::Err, token::Bool => unreachable!(), } @@ -80,6 +80,8 @@ impl ToInternal for LitKind { LitKind::StrRaw(n) => token::StrRaw(n), LitKind::ByteStr => token::ByteStr, LitKind::ByteStrRaw(n) => token::ByteStrRaw(n), + LitKind::CStr => token::CStr, + LitKind::CStrRaw(n) => token::CStrRaw(n), LitKind::Err => token::Err, } } diff --git a/compiler/rustc_feature/src/active.rs b/compiler/rustc_feature/src/active.rs index 45f462a63ee8c..4e5eebd285bbb 100644 --- a/compiler/rustc_feature/src/active.rs +++ b/compiler/rustc_feature/src/active.rs @@ -311,10 +311,10 @@ declare_features! ( (active, async_closure, "1.37.0", Some(62290), None), /// Allows async functions to be declared, implemented, and used in traits. (active, async_fn_in_trait, "1.66.0", Some(91611), None), - /// Treat `extern "C"` function as nounwind. - (active, c_unwind, "1.52.0", Some(74990), None), /// Allows `c"foo"` literals. (active, c_str_literals, "CURRENT_RUSTC_VERSION", Some(105723), None), + /// Treat `extern "C"` function as nounwind. + (active, c_unwind, "1.52.0", Some(74990), None), /// Allows using C-variadics. (active, c_variadic, "1.34.0", Some(44930), None), /// Allows the use of `#[cfg(sanitize = "option")]`; set when -Zsanitizer is used. diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 95cb1f93e3906..ce8c9ebe7ce77 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -361,65 +361,18 @@ impl Cursor<'_> { }, // Byte literal, byte string literal, raw byte string literal or identifier. - 'b' => match (self.first(), self.second()) { - ('\'', _) => { - self.bump(); - let terminated = self.single_quoted_string(); - let suffix_start = self.pos_within_token(); - if terminated { - self.eat_literal_suffix(); - } - let kind = Byte { terminated }; - Literal { kind, suffix_start } - } - ('"', _) => { - self.bump(); - let terminated = self.double_quoted_string(); - let suffix_start = self.pos_within_token(); - if terminated { - self.eat_literal_suffix(); - } - let kind = ByteStr { terminated }; - Literal { kind, suffix_start } - } - ('r', '"') | ('r', '#') => { - self.bump(); - let res = self.raw_double_quoted_string(2); - let suffix_start = self.pos_within_token(); - if res.is_ok() { - self.eat_literal_suffix(); - } - let kind = RawByteStr { n_hashes: res.ok() }; - Literal { kind, suffix_start } - } - _ => self.ident_or_unknown_prefix(), - }, + 'b' => self.c_or_byte_string( + |terminated| ByteStr { terminated }, + |n_hashes| RawByteStr { n_hashes }, + Some(|terminated| Byte { terminated }), + ), - // TODO deduplicate this code // c-string literal, raw c-string literal or identifier. - 'c' => match (self.first(), self.second()) { - ('"', _) => { - self.bump(); - let terminated = self.double_quoted_string(); - let suffix_start = self.pos_within_token(); - if terminated { - self.eat_literal_suffix(); - } - let kind = CStr { terminated }; - Literal { kind, suffix_start } - } - ('r', '"') | ('r', '#') => { - self.bump(); - let res = self.raw_double_quoted_string(2); - let suffix_start = self.pos_within_token(); - if res.is_ok() { - self.eat_literal_suffix(); - } - let kind = RawCStr { n_hashes: res.ok() }; - Literal { kind, suffix_start } - } - _ => self.ident_or_unknown_prefix(), - }, + 'c' => self.c_or_byte_string( + |terminated| CStr { terminated }, + |n_hashes| RawCStr { n_hashes }, + None, + ), // Identifier (this should be checked after other variant that can // start as identifier). @@ -583,6 +536,47 @@ impl Cursor<'_> { } } + fn c_or_byte_string( + &mut self, + mk_kind: impl FnOnce(bool) -> LiteralKind, + mk_kind_raw: impl FnOnce(Option) -> LiteralKind, + single_quoted: Option LiteralKind>, + ) -> TokenKind { + match (self.first(), self.second(), single_quoted) { + ('\'', _, Some(mk_kind)) => { + self.bump(); + let terminated = self.single_quoted_string(); + let suffix_start = self.pos_within_token(); + if terminated { + self.eat_literal_suffix(); + } + let kind = mk_kind(terminated); + Literal { kind, suffix_start } + } + ('"', _, _) => { + self.bump(); + let terminated = self.double_quoted_string(); + let suffix_start = self.pos_within_token(); + if terminated { + self.eat_literal_suffix(); + } + let kind = mk_kind(terminated); + Literal { kind, suffix_start } + } + ('r', '"', _) | ('r', '#', _) => { + self.bump(); + let res = self.raw_double_quoted_string(2); + let suffix_start = self.pos_within_token(); + if res.is_ok() { + self.eat_literal_suffix(); + } + let kind = mk_kind_raw(res.ok()); + Literal { kind, suffix_start } + } + _ => self.ident_or_unknown_prefix(), + } + } + fn number(&mut self, first_digit: char) -> LiteralKind { debug_assert!('0' <= self.prev() && self.prev() <= '9'); let mut base = Base::Decimal; diff --git a/library/proc_macro/src/bridge/mod.rs b/library/proc_macro/src/bridge/mod.rs index 54b11c543f162..caecda1bc63fd 100644 --- a/library/proc_macro/src/bridge/mod.rs +++ b/library/proc_macro/src/bridge/mod.rs @@ -337,6 +337,8 @@ pub enum LitKind { StrRaw(u8), ByteStr, ByteStrRaw(u8), + CStr, + CStrRaw(u8), Err, } @@ -350,6 +352,8 @@ rpc_encode_decode!( StrRaw(n), ByteStr, ByteStrRaw(n), + CStr, + CStrRaw(n), Err, } ); diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/gate.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.rs index 674b0c5e23351..b27da26ed23bb 100644 --- a/tests/ui/rfcs/rfc-3348-c-string-literals/gate.rs +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/gate.rs @@ -1,3 +1,9 @@ +// gate-test-c_str_literals + +macro_rules! m { + ($t:tt) => {} +} + fn main() { c"foo"; //~^ ERROR: `c".."` literals are experimental From d5e7206ca674661a13d7bbe03284b81031e1ac33 Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Mon, 6 Mar 2023 14:09:28 +0000 Subject: [PATCH 04/10] rm diag item, use lang item --- library/core/src/ffi/c_str.rs | 1 - src/tools/clippy/clippy_lints/src/strlen_on_c_strings.rs | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/library/core/src/ffi/c_str.rs b/library/core/src/ffi/c_str.rs index 2ac679b6bc347..07b11814f965f 100644 --- a/library/core/src/ffi/c_str.rs +++ b/library/core/src/ffi/c_str.rs @@ -79,7 +79,6 @@ use crate::str; /// /// [str]: prim@str "str" #[derive(Hash)] -#[cfg_attr(not(test), rustc_diagnostic_item = "CStr")] #[stable(feature = "core_c_str", since = "1.64.0")] #[rustc_has_incoherent_inherent_impls] #[cfg_attr(not(bootstrap), lang = "CStr")] diff --git a/src/tools/clippy/clippy_lints/src/strlen_on_c_strings.rs b/src/tools/clippy/clippy_lints/src/strlen_on_c_strings.rs index 03324c66e8efc..2f2e84fa35a12 100644 --- a/src/tools/clippy/clippy_lints/src/strlen_on_c_strings.rs +++ b/src/tools/clippy/clippy_lints/src/strlen_on_c_strings.rs @@ -1,11 +1,11 @@ use clippy_utils::diagnostics::span_lint_and_sugg; use clippy_utils::source::snippet_with_context; -use clippy_utils::ty::is_type_diagnostic_item; +use clippy_utils::ty::{is_type_diagnostic_item, is_type_lang_item}; use clippy_utils::visitors::is_expr_unsafe; use clippy_utils::{get_parent_node, match_libc_symbol}; use if_chain::if_chain; use rustc_errors::Applicability; -use rustc_hir::{Block, BlockCheckMode, Expr, ExprKind, Node, UnsafeSource}; +use rustc_hir::{Block, BlockCheckMode, Expr, ExprKind, LangItem, Node, UnsafeSource}; use rustc_lint::{LateContext, LateLintPass}; use rustc_session::{declare_lint_pass, declare_tool_lint}; use rustc_span::symbol::sym; @@ -67,7 +67,7 @@ impl<'tcx> LateLintPass<'tcx> for StrlenOnCStrings { let val_name = snippet_with_context(cx, self_arg.span, ctxt, "..", &mut app).0; let method_name = if is_type_diagnostic_item(cx, ty, sym::cstring_type) { "as_bytes" - } else if is_type_diagnostic_item(cx, ty, sym::CStr) { + } else if is_type_lang_item(cx, ty, LangItem::CStr) { "to_bytes" } else { return; From 4c01d494b8233c930868be33cf4880b4267ede82 Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Mon, 6 Mar 2023 14:14:55 +0000 Subject: [PATCH 05/10] refactor unescape --- compiler/rustc_lexer/src/unescape.rs | 91 +++++++++++++------ .../src/lexer/unescape_error_reporting.rs | 22 +++-- 2 files changed, 75 insertions(+), 38 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 4b707c9ec9619..c9ad54d8d9806 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -86,7 +86,8 @@ where let res = unescape_char_or_byte(&mut chars, mode == Mode::Byte); callback(0..(src.len() - chars.as_str().len()), res); } - Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(src, mode == Mode::ByteStr, callback), + Mode::Str | Mode::ByteStr => unescape_str_common(src, mode, callback), + Mode::RawStr | Mode::RawByteStr => { unescape_raw_str_or_raw_byte_str(src, mode == Mode::RawByteStr, callback) } @@ -94,6 +95,7 @@ where } } +/// A unit within CStr. Must not be a nul character. pub enum CStrUnit { Byte(u8), Char(char), @@ -164,24 +166,52 @@ impl Mode { } } - pub fn is_byte(self) -> bool { + /// Non-byte literals should have `\xXX` escapes that are within the ASCII range. + pub fn ascii_escapes_should_be_ascii(self) -> bool { + match self { + Mode::Char | Mode::Str | Mode::RawStr => true, + Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => false, + } + } + + /// Whether characters within the literal must be within the ASCII range + pub fn characters_should_be_ascii(self) -> bool { + match self { + Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, + Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, + } + } + + /// Byte literals do not allow unicode escape. + pub fn is_unicode_escape_disallowed(self) -> bool { match self { - Mode::Byte | Mode::ByteStr | Mode::RawByteStr | Mode::CStr | Mode::RawCStr => true, - Mode::Char | Mode::Str | Mode::RawStr => false, + Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, + Mode::Char | Mode::Str | Mode::RawStr | Mode::CStr | Mode::RawCStr => false, + } + } + + pub fn prefix_noraw(self) -> &'static str { + match self { + Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b", + Mode::CStr | Mode::RawCStr => "c", + Mode::Char | Mode::Str | Mode::RawStr => "", } } } -fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result { +fn scan_escape + From>( + chars: &mut Chars<'_>, + mode: Mode, +) -> Result { // Previous character was '\\', unescape what follows. let res = match chars.next().ok_or(EscapeError::LoneSlash)? { - '"' => '"', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '0' => '\0', + '"' => b'"', + 'n' => b'\n', + 'r' => b'\r', + 't' => b'\t', + '\\' => b'\\', + '\'' => b'\'', + '0' => b'\0', 'x' => { // Parse hexadecimal character code. @@ -194,22 +224,23 @@ fn scan_escape(chars: &mut Chars<'_>, is_byte: bool) -> Result scan_unicode(chars, is_byte)?, + 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into), _ => return Err(EscapeError::InvalidEscape), }; - Ok(res) + Ok(res.into()) } -fn scan_unicode(chars: &mut Chars<'_>, is_byte: bool) -> Result { +fn scan_unicode( + chars: &mut Chars<'_>, + is_unicode_escape_disallowed: bool, +) -> Result { // We've parsed '\u', now we have to parse '{..}'. if chars.next() != Some('{') { @@ -237,7 +268,7 @@ fn scan_unicode(chars: &mut Chars<'_>, is_byte: bool) -> Result, is_byte: bool) -> Result Result { - if is_byte && !c.is_ascii() { +fn ascii_check(c: char, characters_should_be_ascii: bool) -> Result { + if characters_should_be_ascii && !c.is_ascii() { // Byte literal can't be a non-ascii character. Err(EscapeError::NonAsciiCharInByte) } else { @@ -275,7 +306,7 @@ fn ascii_check(c: char, is_byte: bool) -> Result { fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result { let c = chars.next().ok_or(EscapeError::ZeroChars)?; let res = match c { - '\\' => scan_escape(chars, is_byte), + '\\' => scan_escape(chars, if is_byte { Mode::Byte } else { Mode::Char }), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), _ => ascii_check(c, is_byte), @@ -288,9 +319,9 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, is_byte: bool) -> Result(src: &str, is_byte: bool, callback: &mut F) +fn unescape_str_common + From>(src: &str, mode: Mode, callback: &mut F) where - F: FnMut(Range, Result), + F: FnMut(Range, Result), { let mut chars = src.chars(); @@ -312,17 +343,17 @@ where }); continue; } - _ => scan_escape(&mut chars, is_byte), + _ => scan_escape::(&mut chars, mode), } } - '\n' => Ok('\n'), - '\t' => Ok('\t'), + '\n' => Ok(b'\n'.into()), + '\t' => Ok(b'\t'.into()), '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, is_byte), + _ => ascii_check(c, mode.characters_should_be_ascii()).map(Into::into), }; let end = src.len() - chars.as_str().len(); - callback(start..end, res); + callback(start..end, res.map(Into::into)); } } diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index 0d12ec6081d83..2e4c798ab2259 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -78,8 +78,7 @@ pub(crate) fn emit_unescape_error( } }; let sugg = sugg.unwrap_or_else(|| { - let is_byte = mode.is_byte(); - let prefix = if is_byte { "b" } else { "" }; + let prefix = mode.prefix_noraw(); let mut escaped = String::with_capacity(lit.len()); let mut chrs = lit.chars().peekable(); while let Some(first) = chrs.next() { @@ -97,7 +96,11 @@ pub(crate) fn emit_unescape_error( }; } let sugg = format!("{prefix}\"{escaped}\""); - MoreThanOneCharSugg::Quotes { span: span_with_quotes, is_byte, sugg } + MoreThanOneCharSugg::Quotes { + span: span_with_quotes, + is_byte: mode == Mode::Byte, + sugg, + } }); handler.emit_err(UnescapeError::MoreThanOneChar { span: span_with_quotes, @@ -112,7 +115,7 @@ pub(crate) fn emit_unescape_error( char_span, escaped_sugg: c.escape_default().to_string(), escaped_msg: escaped_char(c), - byte: mode.is_byte(), + byte: mode == Mode::Byte, }); } EscapeError::BareCarriageReturn => { @@ -126,12 +129,15 @@ pub(crate) fn emit_unescape_error( EscapeError::InvalidEscape => { let (c, span) = last_char(); - let label = - if mode.is_byte() { "unknown byte escape" } else { "unknown character escape" }; + let label = if mode == Mode::Byte || mode == Mode::ByteStr { + "unknown byte escape" + } else { + "unknown character escape" + }; let ec = escaped_char(c); let mut diag = handler.struct_span_err(span, &format!("{}: `{}`", label, ec)); diag.span_label(span, label); - if c == '{' || c == '}' && !mode.is_byte() { + if c == '{' || c == '}' && matches!(mode, Mode::Str | Mode::RawStr) { diag.help( "if used in a formatting string, curly braces are escaped with `{{` and `}}`", ); @@ -141,7 +147,7 @@ pub(crate) fn emit_unescape_error( version control settings", ); } else { - if !mode.is_byte() { + if mode == Mode::Str || mode == Mode::Char { diag.span_suggestion( span_with_quotes, "if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal", From 78e3455d375feb5d100a43110f78b405a8ff05f1 Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Mon, 6 Mar 2023 14:15:02 +0000 Subject: [PATCH 06/10] address comments --- compiler/rustc_ast/src/ast.rs | 2 +- compiler/rustc_lexer/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler/rustc_ast/src/ast.rs b/compiler/rustc_ast/src/ast.rs index fb90d309fcde7..8555ba3e7cee4 100644 --- a/compiler/rustc_ast/src/ast.rs +++ b/compiler/rustc_ast/src/ast.rs @@ -1814,7 +1814,7 @@ pub enum LitKind { /// A byte string (`b"foo"`). Not stored as a symbol because it might be /// non-utf8, and symbols only allow utf8 strings. ByteStr(Lrc<[u8]>, StrStyle), - /// A C String (`c"foo"`). Guaranteed only have `\0` in the end. + /// A C String (`c"foo"`). Guaranteed to only have `\0` at the end. CStr(Lrc<[u8]>, StrStyle), /// A byte char (`b'f'`). Byte(u8), diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index ce8c9ebe7ce77..c07dc19a0ac3a 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -194,7 +194,7 @@ pub enum LiteralKind { /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a". `None` /// indicates an invalid literal. RawByteStr { n_hashes: Option }, - /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` is invalid. + /// `cr"abc"`, "cr#"abc"#", `cr#"a`. `None` indicates an invalid literal. RawCStr { n_hashes: Option }, } From bf3ca5979e47774802e95623c11e71fb303e5ff3 Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Tue, 7 Mar 2023 05:09:19 +0000 Subject: [PATCH 07/10] try gating early, add non-ascii test --- compiler/rustc_ast_passes/src/feature_gate.rs | 1 - compiler/rustc_parse/src/parser/expr.rs | 1 + tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.rs | 10 ++++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.rs diff --git a/compiler/rustc_ast_passes/src/feature_gate.rs b/compiler/rustc_ast_passes/src/feature_gate.rs index 9defc6603e837..17bcd24ee39fd 100644 --- a/compiler/rustc_ast_passes/src/feature_gate.rs +++ b/compiler/rustc_ast_passes/src/feature_gate.rs @@ -1,4 +1,3 @@ -use ast::token; use rustc_ast as ast; use rustc_ast::visit::{self, AssocCtxt, FnCtxt, FnKind, Visitor}; use rustc_ast::{attr, AssocConstraint, AssocConstraintKind, NodeId}; diff --git a/compiler/rustc_parse/src/parser/expr.rs b/compiler/rustc_parse/src/parser/expr.rs index bff9de5c652f9..c89b4ca8d6f0f 100644 --- a/compiler/rustc_parse/src/parser/expr.rs +++ b/compiler/rustc_parse/src/parser/expr.rs @@ -1922,6 +1922,7 @@ impl<'a> Parser<'a> { let recovered = self.recover_after_dot(); let token = recovered.as_ref().unwrap_or(&self.token); let span = token.span; + token::Lit::from_token(token).map(|token_lit| { self.bump(); (token_lit, span) diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.rs new file mode 100644 index 0000000000000..82e8e2090d7db --- /dev/null +++ b/tests/ui/rfcs/rfc-3348-c-string-literals/non-ascii.rs @@ -0,0 +1,10 @@ +// run-pass + +#![feature(c_str_literals)] + +fn main() { + assert_eq!( + c"\xEF\x80🦀\u{1F980}".to_bytes_with_nul(), + &[0xEF, 0x80, 0xF0, 0x9F, 0xA6, 0x80, 0xF0, 0x9F, 0xA6, 0x80, 0x00], + ); +} From abb181dfd9b9df22908ab08d7cfb46509295e2e6 Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Fri, 10 Mar 2023 14:18:58 +0000 Subject: [PATCH 08/10] make it semantic error --- compiler/rustc_ast/src/util/literal.rs | 2 ++ compiler/rustc_ast_passes/src/feature_gate.rs | 1 + compiler/rustc_parse/src/lexer/mod.rs | 3 +++ compiler/rustc_session/messages.ftl | 2 ++ compiler/rustc_session/src/errors.rs | 15 ++++++++++++++- .../rfcs/rfc-3348-c-string-literals/no-nuls.rs | Bin 322 -> 570 bytes .../rfc-3348-c-string-literals/no-nuls.stderr | Bin 670 -> 674 bytes 7 files changed, 22 insertions(+), 1 deletion(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index cd3b163e3ac97..15a54fe13d0b7 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -8,6 +8,7 @@ use rustc_lexer::unescape::{ }; use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::Span; +use std::ops::Range; use std::{ascii, fmt, str}; // Escapes a string, represented as a symbol. Reuses the original symbol, @@ -38,6 +39,7 @@ pub enum LitError { InvalidFloatSuffix, NonDecimalFloat(u32), IntTooLarge(u32), + NulInCStr(Range), } impl LitKind { diff --git a/compiler/rustc_ast_passes/src/feature_gate.rs b/compiler/rustc_ast_passes/src/feature_gate.rs index 17bcd24ee39fd..c4578ec4af1ef 100644 --- a/compiler/rustc_ast_passes/src/feature_gate.rs +++ b/compiler/rustc_ast_passes/src/feature_gate.rs @@ -572,6 +572,7 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session) { } }; } + gate_all!(c_str_literals, "`c\"..\"` literals are experimental"); gate_all!( if_let_guard, "`if let` guards are experimental", diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index b2050780309f8..050f18986154a 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -204,6 +204,9 @@ impl<'a> StringReader<'a> { rustc_lexer::TokenKind::Literal { kind, suffix_start } => { let suffix_start = start + BytePos(suffix_start); let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); + if let token::LitKind::CStr | token::LitKind::CStrRaw(_) = kind { + self.sess.gated_spans.gate(sym::c_str_literals, self.mk_sp(start, self.pos)); + } let suffix = if suffix_start < self.pos { let string = self.str_from(suffix_start); if string == "_" { diff --git a/compiler/rustc_session/messages.ftl b/compiler/rustc_session/messages.ftl index ff53f22d43f93..2420857e739ed 100644 --- a/compiler/rustc_session/messages.ftl +++ b/compiler/rustc_session/messages.ftl @@ -93,3 +93,5 @@ session_invalid_int_literal_width = invalid width `{$width}` for integer literal .help = valid widths are 8, 16, 32, 64 and 128 session_optimization_fuel_exhausted = optimization-fuel-exhausted: {$msg} + +session_nul_in_c_str = null characters in C string literals are not supported diff --git a/compiler/rustc_session/src/errors.rs b/compiler/rustc_session/src/errors.rs index bd32adbbdbb54..22af74eb1d9e0 100644 --- a/compiler/rustc_session/src/errors.rs +++ b/compiler/rustc_session/src/errors.rs @@ -6,7 +6,7 @@ use rustc_ast::token; use rustc_ast::util::literal::LitError; use rustc_errors::{error_code, DiagnosticMessage, EmissionGuarantee, IntoDiagnostic, MultiSpan}; use rustc_macros::Diagnostic; -use rustc_span::{Span, Symbol}; +use rustc_span::{BytePos, Span, Symbol}; use rustc_target::spec::{SplitDebuginfo, StackProtector, TargetTriple}; #[derive(Diagnostic)] @@ -307,6 +307,13 @@ pub(crate) struct BinaryFloatLiteralNotSupported { pub span: Span, } +#[derive(Diagnostic)] +#[diag(session_nul_in_c_str)] +pub(crate) struct NulInCStr { + #[primary_span] + pub span: Span, +} + pub fn report_lit_error(sess: &ParseSess, err: LitError, lit: token::Lit, span: Span) { // Checks if `s` looks like i32 or u1234 etc. fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool { @@ -385,6 +392,12 @@ pub fn report_lit_error(sess: &ParseSess, err: LitError, lit: token::Lit, span: }; sess.emit_err(IntLiteralTooLarge { span, limit }); } + LitError::NulInCStr(range) => { + let lo = BytePos(span.lo().0 + range.start as u32 + 2); + let hi = BytePos(span.lo().0 + range.end as u32 + 2); + let span = span.with_lo(lo).with_hi(hi); + sess.emit_err(NulInCStr { span }); + } } } diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs index f6c86a1ba8762358537827376cbffc5c3f4508e9..e66519f294cd08f751f03b725967b5cd60dcee1d 100644 GIT binary patch literal 570 zcmbV}F>Avx5QRPKSDYOWQkQs6Xd#4jDiqw&PK^+9YLJy6ouS3JzdaYZZGy>C-z1*& z?w-C|#6_(oc209ud32R&P&;Y7*fUmJXk}x$fSv)BO3Ex*hvRbj{SLc4f6Z55J7Yk7 zNGBC}Jv0`!K)o7!V86bu&$3~jH=1WFKeKsZUGm?F17odJ?pV3bXdi=aPx$r2jHf(B zTp@}F0gq464{Jv67lo=1{CV^8wjoT$jeCafw@y%Qo3Q zPSCPEM;jBMN(!DFx@K1cWyzSFfW04Mhd1Y+KEI4HjfNE;5i#rWCql1B(8Y@nzCYRR fyP*I|{vI!G(MUg&)j`5rY|)T>DjOU+cR0qrx?#zt literal 670 zcmcJNI}5@v5XYVKDgG4KYQ#q+f{QLfC+AYAHE5wEa!Ew6es@F0SVV;0bT96AFG-|H zK)JR>eXXdftt2!GI~Yrix-cM3w;1ma$Enh(Pc#!UVj(ggLC5kOomYT5DR{C8&R_#& z`K>OKf1}P$Lfcgd?nWL;VFqm$4^_YPMWO$ VFr1x))mY+2^4@4?5MIMD_6Dmm#F_vA From 6d905a8cc14783bc577dc0534d2516d16ef3e43b Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Fri, 10 Mar 2023 14:29:26 +0000 Subject: [PATCH 09/10] fix tidy --- .../rfcs/rfc-3348-c-string-literals/no-nuls.rs | Bin 570 -> 565 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs b/tests/ui/rfcs/rfc-3348-c-string-literals/no-nuls.rs index e66519f294cd08f751f03b725967b5cd60dcee1d..7bc6097f124aabfded4a9e9791611cfe4fbf9f78 100644 GIT binary patch delta 7 OcmdnRvXy0nDH8w-N&;U1 delta 13 ScmdnWvWsPdDHAIf2mk;YE&`SS From d30c6681751b10a14265e09e5f74f39d2a32e641 Mon Sep 17 00:00:00 2001 From: Deadbeef Date: Fri, 28 Apr 2023 15:28:52 +0000 Subject: [PATCH 10/10] make cook generic --- compiler/rustc_parse/src/lexer/mod.rs | 64 +++++++++++---------------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 050f18986154a..9d0037b8a8083 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,3 +1,5 @@ +use std::ops::Range; + use crate::errors; use crate::lexer::unicode_chars::UNICODE_ARRAY; use crate::make_unclosed_delims_error; @@ -6,7 +8,7 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, Diagnostic, DiagnosticBuilder, StashKey}; -use rustc_lexer::unescape::{self, Mode}; +use rustc_lexer::unescape::{self, EscapeError, Mode}; use rustc_lexer::Cursor; use rustc_lexer::{Base, DocStyle, RawStrError}; use rustc_session::lint::builtin::{ @@ -670,7 +672,7 @@ impl<'a> StringReader<'a> { self.sess.emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num }); } - fn cook_quoted( + fn cook_common( &self, kind: token::LitKind, mode: Mode, @@ -678,12 +680,13 @@ impl<'a> StringReader<'a> { end: BytePos, prefix_len: u32, postfix_len: u32, + unescape: fn(&str, Mode, &mut dyn FnMut(Range, Result<(), EscapeError>)), ) -> (token::LitKind, Symbol) { let mut has_fatal_err = false; let content_start = start + BytePos(prefix_len); let content_end = end - BytePos(postfix_len); let lit_content = self.str_from_to(content_start, content_end); - unescape::unescape_literal(lit_content, mode, &mut |range, result| { + unescape(lit_content, mode, &mut |range, result| { // Here we only check for errors. The actual unescaping is done later. if let Err(err) = result { let span_with_quotes = self.mk_sp(start, end); @@ -715,7 +718,7 @@ impl<'a> StringReader<'a> { } } - fn cook_c_string( + fn cook_quoted( &self, kind: token::LitKind, mode: Mode, @@ -724,40 +727,27 @@ impl<'a> StringReader<'a> { prefix_len: u32, postfix_len: u32, ) -> (token::LitKind, Symbol) { - let mut has_fatal_err = false; - let content_start = start + BytePos(prefix_len); - let content_end = end - BytePos(postfix_len); - let lit_content = self.str_from_to(content_start, content_end); - unescape::unescape_c_string(lit_content, mode, &mut |range, result| { - // Here we only check for errors. The actual unescaping is done later. - if let Err(err) = result { - let span_with_quotes = self.mk_sp(start, end); - let (start, end) = (range.start as u32, range.end as u32); - let lo = content_start + BytePos(start); - let hi = lo + BytePos(end - start); - let span = self.mk_sp(lo, hi); - if err.is_fatal() { - has_fatal_err = true; - } - emit_unescape_error( - &self.sess.span_diagnostic, - lit_content, - span_with_quotes, - span, - mode, - range, - err, - ); - } - }); + self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { + unescape::unescape_literal(src, mode, &mut |span, result| { + callback(span, result.map(drop)) + }) + }) + } - // We normally exclude the quotes for the symbol, but for errors we - // include it because it results in clearer error messages. - if !has_fatal_err { - (kind, Symbol::intern(lit_content)) - } else { - (token::Err, self.symbol_from_to(start, end)) - } + fn cook_c_string( + &self, + kind: token::LitKind, + mode: Mode, + start: BytePos, + end: BytePos, + prefix_len: u32, + postfix_len: u32, + ) -> (token::LitKind, Symbol) { + self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { + unescape::unescape_c_string(src, mode, &mut |span, result| { + callback(span, result.map(drop)) + }) + }) } }