From bfa5f278472d0bad4e7db4a4259b2f1fa97ca0ab Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Thu, 25 Apr 2019 11:48:25 +0300 Subject: [PATCH 1/2] introduce unescape module Currently, we deal with escape sequences twice: once when we lex a string, and a second time when we unescape literals. This PR aims to remove this duplication, by introducing a new `unescape` module as a single source of truth for character escaping rules. --- src/librustc_errors/diagnostic_builder.rs | 2 +- src/libsyntax/parse/lexer/mod.rs | 585 +++++------------- src/libsyntax/parse/mod.rs | 262 ++------ src/libsyntax/parse/unescape.rs | 515 +++++++++++++++ .../parse/unescape_error_reporting.rs | 200 ++++++ src/test/ui/fmt/format-string-error-2.rs | 3 +- src/test/ui/fmt/format-string-error-2.stderr | 52 +- .../parser/ascii-only-character-escape.stderr | 12 +- src/test/ui/parser/byte-literals.stderr | 6 +- .../ui/parser/byte-string-literals.stderr | 4 +- .../ui/parser/issue-23620-invalid-escapes.rs | 7 +- .../parser/issue-23620-invalid-escapes.stderr | 60 +- .../ui/parser/lex-bad-char-literals-1.stderr | 8 +- .../ui/parser/lex-bad-char-literals-2.stderr | 4 + src/test/ui/parser/lex-bad-char-literals-4.rs | 2 +- .../ui/parser/lex-bad-char-literals-4.stderr | 4 +- .../ui/parser/lex-bad-char-literals-6.stderr | 12 + src/test/ui/parser/lex-bad-char-literals-7.rs | 14 + .../ui/parser/lex-bad-char-literals-7.stderr | 20 + ...literals-are-validated-before-expansion.rs | 10 + ...rals-are-validated-before-expansion.stderr | 18 + .../ui/parser/new-unicode-escapes-1.stderr | 4 +- .../ui/parser/new-unicode-escapes-2.stderr | 4 +- .../ui/parser/new-unicode-escapes-3.stderr | 8 +- 24 files changed, 1047 insertions(+), 769 deletions(-) create mode 100644 src/libsyntax/parse/unescape.rs create mode 100644 src/libsyntax/parse/unescape_error_reporting.rs create mode 100644 src/test/ui/parser/lex-bad-char-literals-7.rs create mode 100644 src/test/ui/parser/lex-bad-char-literals-7.stderr create mode 100644 
src/test/ui/parser/macro/literals-are-validated-before-expansion.rs create mode 100644 src/test/ui/parser/macro/literals-are-validated-before-expansion.stderr diff --git a/src/librustc_errors/diagnostic_builder.rs b/src/librustc_errors/diagnostic_builder.rs index c8d47339fb365..f74dcd6070c70 100644 --- a/src/librustc_errors/diagnostic_builder.rs +++ b/src/librustc_errors/diagnostic_builder.rs @@ -184,7 +184,7 @@ impl<'a> DiagnosticBuilder<'a> { ) -> &mut Self); forward!(pub fn warn(&mut self, msg: &str) -> &mut Self); forward!(pub fn span_warn>(&mut self, sp: S, msg: &str) -> &mut Self); - forward!(pub fn help(&mut self , msg: &str) -> &mut Self); + forward!(pub fn help(&mut self, msg: &str) -> &mut Self); forward!(pub fn span_help>(&mut self, sp: S, msg: &str, diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index cf8f8abe2ab50..3c342c2ff26a9 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -1,8 +1,10 @@ use crate::ast::{self, Ident}; use crate::parse::{token, ParseSess}; use crate::symbol::Symbol; +use crate::parse::unescape; +use crate::parse::unescape_error_reporting::{emit_unescape_error, push_escaped_char}; -use errors::{Applicability, FatalError, Diagnostic, DiagnosticBuilder}; +use errors::{FatalError, Diagnostic, DiagnosticBuilder}; use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION}; use core::unicode::property::Pattern_White_Space; @@ -334,25 +336,12 @@ impl<'a> StringReader<'a> { self.err_span(self.mk_sp(from_pos, to_pos), m) } - /// Pushes a character to a message string for error reporting - fn push_escaped_char_for_msg(m: &mut String, c: char) { - match c { - '\u{20}'..='\u{7e}' => { - // Don't escape \, ' or " for user-facing messages - m.push(c); - } - _ => { - m.extend(c.escape_default()); - } - } - } - /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an /// escaped character to the error message fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, 
m: &str, c: char) -> FatalError { let mut m = m.to_string(); m.push_str(": "); - Self::push_escaped_char_for_msg(&mut m, c); + push_escaped_char(&mut m, c); self.fatal_span_(from_pos, to_pos, &m[..]) } @@ -368,7 +357,7 @@ impl<'a> StringReader<'a> { { let mut m = m.to_string(); m.push_str(": "); - Self::push_escaped_char_for_msg(&mut m, c); + push_escaped_char(&mut m, c); self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..]) } @@ -378,29 +367,10 @@ impl<'a> StringReader<'a> { fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) { let mut m = m.to_string(); m.push_str(": "); - Self::push_escaped_char_for_msg(&mut m, c); + push_escaped_char(&mut m, c); self.err_span_(from_pos, to_pos, &m[..]); } - fn struct_err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) - -> DiagnosticBuilder<'a> - { - let mut m = m.to_string(); - m.push_str(": "); - Self::push_escaped_char_for_msg(&mut m, c); - - self.sess.span_diagnostic.struct_span_err(self.mk_sp(from_pos, to_pos), &m[..]) - } - - /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the - /// offending string to the error message - fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError { - m.push_str(": "); - m.push_str(&self.src[self.src_index(from_pos)..self.src_index(to_pos)]); - - self.fatal_span_(from_pos, to_pos, &m[..]) - } - /// Advance peek_tok and peek_span to refer to the next token, and /// possibly update the interner. fn advance_token(&mut self) -> Result<(), ()> { @@ -863,271 +833,6 @@ impl<'a> StringReader<'a> { } } - /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an - /// error if too many or too few digits are encountered. 
- fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool { - debug!("scanning {} digits until {:?}", n_digits, delim); - let start_bpos = self.pos; - let mut accum_int = 0; - - let mut valid = true; - for _ in 0..n_digits { - if self.is_eof() { - let last_bpos = self.pos; - self.fatal_span_(start_bpos, - last_bpos, - "unterminated numeric character escape").raise(); - } - if self.ch_is(delim) { - let last_bpos = self.pos; - self.err_span_(start_bpos, - last_bpos, - "numeric character escape is too short"); - valid = false; - break; - } - let c = self.ch.unwrap_or('\x00'); - accum_int *= 16; - accum_int += c.to_digit(16).unwrap_or_else(|| { - self.err_span_char(self.pos, - self.next_pos, - "invalid character in numeric character escape", - c); - - valid = false; - 0 - }); - self.bump(); - } - - if below_0x7f_only && accum_int >= 0x80 { - self.err_span_(start_bpos, - self.pos, - "this form of character escape may only be used with characters in \ - the range [\\x00-\\x7f]"); - valid = false; - } - - match char::from_u32(accum_int) { - Some(_) => valid, - None => { - let last_bpos = self.pos; - self.err_span_(start_bpos, last_bpos, "invalid numeric character escape"); - false - } - } - } - - /// Scan for a single (possibly escaped) byte or char - /// in a byte, (non-raw) byte string, char, or (non-raw) string literal. - /// `start` is the position of `first_source_char`, which is already consumed. - /// - /// Returns `true` if there was a valid char/byte. - fn scan_char_or_byte(&mut self, - start: BytePos, - first_source_char: char, - ascii_only: bool, - delim: char) - -> bool - { - match first_source_char { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = self.ch; - let escaped_pos = self.pos; - self.bump(); - match escaped { - None => {} // EOF here is an error that will be checked later. 
- Some(e) => { - return match e { - 'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true, - 'x' => self.scan_byte_escape(delim, !ascii_only), - 'u' => { - let valid = if self.ch_is('{') { - self.scan_unicode_escape(delim) && !ascii_only - } else { - let span = self.mk_sp(start, self.pos); - let mut suggestion = "\\u{".to_owned(); - let msg = "incorrect unicode escape sequence"; - let mut err = self.sess.span_diagnostic.struct_span_err( - span, - msg, - ); - let mut i = 0; - while let (Some(ch), true) = (self.ch, i < 6) { - if ch.is_digit(16) { - suggestion.push(ch); - self.bump(); - i += 1; - } else { - break; - } - } - if i != 0 { - suggestion.push('}'); - err.span_suggestion( - self.mk_sp(start, self.pos), - "format of unicode escape sequences uses braces", - suggestion, - Applicability::MaybeIncorrect, - ); - } else { - err.span_label(span, msg); - err.help( - "format of unicode escape sequences is `\\u{...}`", - ); - } - err.emit(); - false - }; - if ascii_only { - self.err_span_(start, - self.pos, - "unicode escape sequences cannot be used as a \ - byte or in a byte string"); - } - valid - - } - '\n' if delim == '"' => { - self.consume_whitespace(); - true - } - '\r' if delim == '"' && self.ch_is('\n') => { - self.consume_whitespace(); - true - } - c => { - let pos = self.pos; - let msg = if ascii_only { - "unknown byte escape" - } else { - "unknown character escape" - }; - let mut err = self.struct_err_span_char(escaped_pos, pos, msg, c); - err.span_label(self.mk_sp(escaped_pos, pos), msg); - if e == '\r' { - err.help( - "this is an isolated carriage return; consider checking \ - your editor and version control settings", - ); - } - if (e == '{' || e == '}') && !ascii_only { - err.help( - "if used in a formatting string, curly braces are escaped \ - with `{{` and `}}`", - ); - } - err.emit(); - false - } - } - } - } - } - '\t' | '\n' | '\r' | '\'' if delim == '\'' => { - let pos = self.pos; - self.err_span_char(start, - pos, - if ascii_only { - "byte constant 
must be escaped" - } else { - "character constant must be escaped" - }, - first_source_char); - return false; - } - '\r' => { - if self.ch_is('\n') { - self.bump(); - return true; - } else { - self.err_span_(start, - self.pos, - "bare CR not allowed in string, use \\r instead"); - return false; - } - } - _ => { - if ascii_only && first_source_char > '\x7F' { - let pos = self.pos; - self.err_span_(start, - pos, - "byte constant must be ASCII. Use a \\xHH escape for a \ - non-ASCII byte"); - return false; - } - } - } - true - } - - /// Scan over a `\u{...}` escape - /// - /// At this point, we have already seen the `\` and the `u`, the `{` is the current character. - /// We will read a hex number (with `_` separators), with 1 to 6 actual digits, - /// and pass over the `}`. - fn scan_unicode_escape(&mut self, delim: char) -> bool { - self.bump(); // past the { - let start_bpos = self.pos; - let mut valid = true; - - if let Some('_') = self.ch { - // disallow leading `_` - self.err_span_(self.pos, - self.next_pos, - "invalid start of unicode escape"); - valid = false; - } - - let count = self.scan_digits(16, 16); - - if count > 6 { - self.err_span_(start_bpos, - self.pos, - "overlong unicode escape (must have at most 6 hex digits)"); - valid = false; - } - - loop { - match self.ch { - Some('}') => { - if valid && count == 0 { - self.err_span_(start_bpos, - self.pos, - "empty unicode escape (must have at least 1 hex digit)"); - valid = false; - } - self.bump(); // past the ending `}` - break; - }, - Some(c) => { - if c == delim { - self.err_span_(self.pos, - self.pos, - "unterminated unicode escape (needed a `}`)"); - valid = false; - break; - } else if valid { - self.err_span_char(start_bpos, - self.pos, - "invalid character in unicode escape", - c); - valid = false; - } - }, - None => { - self.fatal_span_(start_bpos, - self.pos, - "unterminated unicode escape (found EOF)").raise(); - } - } - self.bump(); - } - - valid - } - /// Scan over a float exponent. 
fn scan_float_exponent(&mut self) { if self.ch_is('e') || self.ch_is('E') { @@ -1393,26 +1098,21 @@ impl<'a> StringReader<'a> { self.bump(); let start = self.pos; - // the eof will be picked up by the final `'` check below - let c2 = self.ch.unwrap_or('\x00'); - self.bump(); - // If the character is an ident start not followed by another single // quote, then this is a lifetime name: - if (ident_start(Some(c2)) || c2.is_numeric()) && !self.ch_is('\'') { + let starts_with_number = self.ch.unwrap_or('\x00').is_numeric(); + if (ident_start(self.ch) || starts_with_number) && !self.nextch_is('\'') { + self.bump(); while ident_continue(self.ch) { self.bump(); } // lifetimes shouldn't end with a single quote // if we find one, then this is an invalid character literal if self.ch_is('\'') { - self.err_span_( - start_with_quote, - self.next_pos, - "character literal may only contain one codepoint"); + let id = self.name_from(start); self.bump(); - return Ok(token::Literal(token::Err(Symbol::intern("??")), None)) - + self.validate_char_escape(start_with_quote); + return Ok(token::Literal(token::Char(id), None)) } // Include the leading `'` in the real identifier, for macro @@ -1422,7 +1122,7 @@ impl<'a> StringReader<'a> { self.mk_ident(lifetime_name) }); - if c2.is_numeric() { + if starts_with_number { // this is a recovered lifetime written `'1`, error but accept it self.err_span_( start_with_quote, @@ -1433,58 +1133,30 @@ impl<'a> StringReader<'a> { return Ok(token::Lifetime(ident)); } - - let valid = self.scan_char_or_byte(start, c2, /* ascii_only */ false, '\''); - - if !self.ch_is('\'') { - let pos = self.pos; - - loop { - self.bump(); - if self.ch_is('\'') { - let start = self.src_index(start); - let end = self.src_index(self.pos); - self.bump(); - let span = self.mk_sp(start_with_quote, self.pos); - self.sess.span_diagnostic - .struct_span_err(span, - "character literal may only contain one codepoint") - .span_suggestion( - span, - "if you meant to write a `str` 
literal, use double quotes", - format!("\"{}\"", &self.src[start..end]), - Applicability::MachineApplicable - ).emit(); - return Ok(token::Literal(token::Err(Symbol::intern("??")), None)) - } - if self.ch_is('\n') || self.is_eof() || self.ch_is('/') { - // Only attempt to infer single line string literals. If we encounter - // a slash, bail out in order to avoid nonsensical suggestion when - // involving comments. - break; - } - } - - self.fatal_span_verbose(start_with_quote, pos, - String::from("character literal may only contain one codepoint")).raise(); - } - - let id = if valid { - self.name_from(start) - } else { - Symbol::intern("0") - }; - - self.bump(); // advance ch past token + let msg = "unterminated character literal"; + let id = self.scan_single_quoted_string(start_with_quote, msg); + self.validate_char_escape(start_with_quote); let suffix = self.scan_optional_raw_name(); - Ok(token::Literal(token::Char(id), suffix)) } 'b' => { self.bump(); let lit = match self.ch { - Some('\'') => self.scan_byte(), - Some('"') => self.scan_byte_string(), + Some('\'') => { + let start_with_quote = self.pos; + self.bump(); + let msg = "unterminated byte constant"; + let id = self.scan_single_quoted_string(start_with_quote, msg); + self.validate_byte_escape(start_with_quote); + token::Byte(id) + }, + Some('"') => { + let start_with_quote = self.pos; + let msg = "unterminated double quote byte string"; + let id = self.scan_double_quoted_string(msg); + self.validate_byte_str_escape(start_with_quote); + token::ByteStr(id) + }, Some('r') => self.scan_raw_byte_string(), _ => unreachable!(), // Should have been a token::Ident above. 
}; @@ -1493,32 +1165,11 @@ impl<'a> StringReader<'a> { Ok(token::Literal(lit, suffix)) } '"' => { - let start_bpos = self.pos; - let mut valid = true; - self.bump(); - - while !self.ch_is('"') { - if self.is_eof() { - let last_bpos = self.pos; - self.fatal_span_(start_bpos, - last_bpos, - "unterminated double quote string").raise(); - } - - let ch_start = self.pos; - let ch = self.ch.unwrap(); - self.bump(); - valid &= self.scan_char_or_byte(ch_start, ch, /* ascii_only */ false, '"'); - } - // adjust for the ASCII " at the start of the literal - let id = if valid { - self.name_from(start_bpos + BytePos(1)) - } else { - Symbol::intern("??") - }; - self.bump(); + let start_with_quote = self.pos; + let msg = "unterminated double quote string"; + let id = self.scan_double_quoted_string(msg); + self.validate_str_escape(start_with_quote); let suffix = self.scan_optional_raw_name(); - Ok(token::Literal(token::Str_(id), suffix)) } 'r' => { @@ -1659,12 +1310,6 @@ impl<'a> StringReader<'a> { } } - fn consume_whitespace(&mut self) { - while is_pattern_whitespace(self.ch) && !self.is_eof() { - self.bump(); - } - } - fn read_to_eol(&mut self) -> String { let mut val = String::new(); while !self.ch_is('\n') && !self.is_eof() { @@ -1698,73 +1343,63 @@ impl<'a> StringReader<'a> { (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('[')) } - fn scan_byte(&mut self) -> token::Lit { - self.bump(); + fn scan_single_quoted_string(&mut self, + start_with_quote: BytePos, + unterminated_msg: &str) -> ast::Name { + // assumes that first `'` is consumed let start = self.pos; - - // the eof will be picked up by the final `'` check below - let c2 = self.ch.unwrap_or('\x00'); - self.bump(); - - let valid = self.scan_char_or_byte(start, - c2, - // ascii_only = - true, - '\''); - if !self.ch_is('\'') { - // Byte offsetting here is okay because the - // character before position `start` are an - // ascii single quote and ascii 'b'. 
- let pos = self.pos; - self.fatal_span_verbose(start - BytePos(2), - pos, - "unterminated byte constant".to_string()).raise(); - } - - let id = if valid { - self.name_from(start) + // lex `'''` as a single char, for recovery + if self.ch_is('\'') && self.nextch_is('\'') { + self.bump(); } else { - Symbol::intern("?") - }; - self.bump(); // advance ch past token - - token::Byte(id) - } + let mut first = true; + loop { + if self.ch_is('\'') { + break; + } + if self.ch_is('\\') && (self.nextch_is('\'') || self.nextch_is('\\')) { + self.bump(); + self.bump(); + } else { + // Only attempt to infer single line string literals. If we encounter + // a slash, bail out in order to avoid nonsensical suggestion when + // involving comments. + if self.is_eof() + || (self.ch_is('/') && !first) + || (self.ch_is('\n') && !self.nextch_is('\'')) { + + self.fatal_span_(start_with_quote, self.pos, unterminated_msg.into()) + .raise() + } + self.bump(); + } + first = false; + } + } - #[inline] - fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool { - self.scan_hex_digits(2, delim, below_0x7f_only) + let id = self.name_from(start); + self.bump(); + id } - fn scan_byte_string(&mut self) -> token::Lit { + fn scan_double_quoted_string(&mut self, unterminated_msg: &str) -> ast::Name { + debug_assert!(self.ch_is('\"')); + let start_with_quote = self.pos; self.bump(); let start = self.pos; - let mut valid = true; - while !self.ch_is('"') { if self.is_eof() { let pos = self.pos; - self.fatal_span_(start, pos, "unterminated double quote byte string").raise(); + self.fatal_span_(start_with_quote, pos, unterminated_msg).raise(); + } + if self.ch_is('\\') && (self.nextch_is('\\') || self.nextch_is('"')) { + self.bump(); } - - let ch_start = self.pos; - let ch = self.ch.unwrap(); self.bump(); - valid &= self.scan_char_or_byte(ch_start, - ch, - // ascii_only = - true, - '"'); } - - let id = if valid { - self.name_from(start) - } else { - Symbol::intern("??") - }; + let id = 
self.name_from(start); self.bump(); - - token::ByteStr(id) + id } fn scan_raw_byte_string(&mut self) -> token::Lit { @@ -1826,6 +1461,70 @@ impl<'a> StringReader<'a> { token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos), hash_count) } + + fn validate_char_escape(&self, start_with_quote: BytePos) { + self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| { + if let Err((off, err)) = unescape::unescape_char(lit) { + emit_unescape_error( + &self.sess.span_diagnostic, + lit, + self.mk_sp(start_with_quote, self.pos), + unescape::Mode::Char, + 0..off, + err, + ) + } + }); + } + + fn validate_byte_escape(&self, start_with_quote: BytePos) { + self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| { + if let Err((off, err)) = unescape::unescape_byte(lit) { + emit_unescape_error( + &self.sess.span_diagnostic, + lit, + self.mk_sp(start_with_quote, self.pos), + unescape::Mode::Byte, + 0..off, + err, + ) + } + }); + } + + fn validate_str_escape(&self, start_with_quote: BytePos) { + self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| { + unescape::unescape_str(lit, &mut |range, c| { + if let Err(err) = c { + emit_unescape_error( + &self.sess.span_diagnostic, + lit, + self.mk_sp(start_with_quote, self.pos), + unescape::Mode::Str, + range, + err, + ) + } + }) + }); + } + + fn validate_byte_str_escape(&self, start_with_quote: BytePos) { + self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| { + unescape::unescape_byte_str(lit, &mut |range, c| { + if let Err(err) = c { + emit_unescape_error( + &self.sess.span_diagnostic, + lit, + self.mk_sp(start_with_quote, self.pos), + unescape::Mode::ByteStr, + range, + err, + ) + } + }) + }); + } } // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index 1abc7832ffa0f..d6d4f8e8f0429 100644 --- 
a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -18,7 +18,6 @@ use log::debug; use rustc_data_structures::fx::FxHashSet; use std::borrow::Cow; -use std::iter; use std::path::{Path, PathBuf}; use std::str; @@ -33,6 +32,11 @@ pub mod attr; pub mod classify; +pub(crate) mod unescape; +use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte, EscapeError}; + +pub(crate) mod unescape_error_reporting; + /// Info about a parsing session. pub struct ParseSess { pub span_diagnostic: Handler, @@ -306,133 +310,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> { Parser::new(sess, stream, None, true, false) } -/// Parses a string representing a character literal into its final form. -/// Rather than just accepting/rejecting a given literal, unescapes it as -/// well. Can take any slice prefixed by a character escape. Returns the -/// character and the number of characters consumed. -fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) { - use std::char; - - // Handle non-escaped chars first. - if lit.as_bytes()[0] != b'\\' { - // If the first byte isn't '\\' it might part of a multi-byte char, so - // get the char with chars(). - let c = lit.chars().next().unwrap(); - return (c, 1); - } - - // Handle escaped chars. - match lit.as_bytes()[1] as char { - '"' => ('"', 2), - 'n' => ('\n', 2), - 'r' => ('\r', 2), - 't' => ('\t', 2), - '\\' => ('\\', 2), - '\'' => ('\'', 2), - '0' => ('\0', 2), - 'x' => { - let v = u32::from_str_radix(&lit[2..4], 16).unwrap(); - let c = char::from_u32(v).unwrap(); - (c, 4) - } - 'u' => { - assert_eq!(lit.as_bytes()[2], b'{'); - let idx = lit.find('}').unwrap(); - - // All digits and '_' are ascii, so treat each byte as a char. 
- let mut v: u32 = 0; - for c in lit[3..idx].bytes() { - let c = char::from(c); - if c != '_' { - let x = c.to_digit(16).unwrap(); - v = v.checked_mul(16).unwrap().checked_add(x).unwrap(); - } - } - let c = char::from_u32(v).unwrap_or_else(|| { - if let Some((span, diag)) = diag { - let mut diag = diag.struct_span_err(span, "invalid unicode character escape"); - if v > 0x10FFFF { - diag.help("unicode escape must be at most 10FFFF").emit(); - } else { - diag.help("unicode escape must not be a surrogate").emit(); - } - } - '\u{FFFD}' - }); - (c, (idx + 1) as isize) - } - _ => panic!("lexer should have rejected a bad character escape {}", lit) - } -} - -/// Parses a string representing a string literal into its final form. Does unescaping. -fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String { - debug!("str_lit: given {}", lit.escape_default()); - let mut res = String::with_capacity(lit.len()); - - let error = |i| format!("lexer should have rejected {} at {}", lit, i); - - /// Eat everything up to a non-whitespace. 
- fn eat<'a>(it: &mut iter::Peekable>) { - loop { - match it.peek().map(|x| x.1) { - Some(' ') | Some('\n') | Some('\r') | Some('\t') => { - it.next(); - }, - _ => { break; } - } - } - } - - let mut chars = lit.char_indices().peekable(); - while let Some((i, c)) = chars.next() { - match c { - '\\' => { - let ch = chars.peek().unwrap_or_else(|| { - panic!("{}", error(i)) - }).1; - - if ch == '\n' { - eat(&mut chars); - } else if ch == '\r' { - chars.next(); - let ch = chars.peek().unwrap_or_else(|| { - panic!("{}", error(i)) - }).1; - - if ch != '\n' { - panic!("lexer accepted bare CR"); - } - eat(&mut chars); - } else { - // otherwise, a normal escape - let (c, n) = char_lit(&lit[i..], diag); - for _ in 0..n - 1 { // we don't need to move past the first \ - chars.next(); - } - res.push(c); - } - }, - '\r' => { - let ch = chars.peek().unwrap_or_else(|| { - panic!("{}", error(i)) - }).1; - - if ch != '\n' { - panic!("lexer accepted bare CR"); - } - chars.next(); - res.push('\n'); - } - c => res.push(c), - } - } - - res.shrink_to_fit(); // probably not going to do anything, unless there was an escape. - debug!("parse_str_lit: returning {}", res); - res -} - /// Parses a string representing a raw string literal into its final form. The /// only operation this does is convert embedded CRLF into a single LF. 
fn raw_str_lit(lit: &str) -> String { @@ -475,9 +352,23 @@ crate fn lit_token(lit: token::Lit, suf: Option, diag: Option<(Span, &Ha use ast::LitKind; match lit { - token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))), - token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))), - token::Err(i) => (true, Some(LitKind::Err(i))), + token::Byte(i) => { + let lit_kind = match unescape_byte(&i.as_str()) { + Ok(c) => LitKind::Byte(c), + Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i), + Err(_) => LitKind::Byte(0), + }; + (true, Some(lit_kind)) + }, + token::Char(i) => { + let lit_kind = match unescape_char(&i.as_str()) { + Ok(c) => LitKind::Char(c), + Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i), + Err(_) => LitKind::Char('\u{FFFD}'), + }; + (true, Some(lit_kind)) + }, + token::Err(i) => (true, Some(LitKind::Err(i))), // There are some valid suffixes for integer and float literals, // so all the handling is done internally. @@ -491,7 +382,14 @@ crate fn lit_token(lit: token::Lit, suf: Option, diag: Option<(Span, &Ha // string in the Token. 
let s = &sym.as_str(); if s.as_bytes().iter().any(|&c| c == b'\\' || c == b'\r') { - sym = Symbol::intern(&str_lit(s, diag)); + let mut buf = String::with_capacity(s.len()); + unescape_str(s, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(_) => buf.push('\u{FFFD}'), + } + }); + sym = Symbol::intern(&buf) } (true, Some(LitKind::Str(sym, ast::StrStyle::Cooked))) } @@ -504,7 +402,16 @@ crate fn lit_token(lit: token::Lit, suf: Option, diag: Option<(Span, &Ha (true, Some(LitKind::Str(sym, ast::StrStyle::Raw(n)))) } token::ByteStr(i) => { - (true, Some(LitKind::ByteStr(byte_str_lit(&i.as_str())))) + let s = &i.as_str(); + let mut buf = Vec::with_capacity(s.len()); + unescape_byte_str(s, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(_) => buf.push(0), + } + }); + buf.shrink_to_fit(); + (true, Some(LitKind::ByteStr(Lrc::new(buf)))) } token::ByteStrRaw(i, _) => { (true, Some(LitKind::ByteStr(Lrc::new(i.to_string().into_bytes())))) @@ -559,95 +466,6 @@ fn float_lit(s: &str, suffix: Option, diag: Option<(Span, &Handler)>) filtered_float_lit(Symbol::intern(s), suffix, diag) } -/// Parses a string representing a byte literal into its final form. Similar to `char_lit`. 
-fn byte_lit(lit: &str) -> (u8, usize) { - let err = |i| format!("lexer accepted invalid byte literal {} step {}", lit, i); - - if lit.len() == 1 { - (lit.as_bytes()[0], 1) - } else { - assert_eq!(lit.as_bytes()[0], b'\\', "{}", err(0)); - let b = match lit.as_bytes()[1] { - b'"' => b'"', - b'n' => b'\n', - b'r' => b'\r', - b't' => b'\t', - b'\\' => b'\\', - b'\'' => b'\'', - b'0' => b'\0', - _ => { - match u64::from_str_radix(&lit[2..4], 16).ok() { - Some(c) => - if c > 0xFF { - panic!(err(2)) - } else { - return (c as u8, 4) - }, - None => panic!(err(3)) - } - } - }; - (b, 2) - } -} - -fn byte_str_lit(lit: &str) -> Lrc> { - let mut res = Vec::with_capacity(lit.len()); - - let error = |i| panic!("lexer should have rejected {} at {}", lit, i); - - /// Eat everything up to a non-whitespace. - fn eat>(it: &mut iter::Peekable) { - loop { - match it.peek().map(|x| x.1) { - Some(b' ') | Some(b'\n') | Some(b'\r') | Some(b'\t') => { - it.next(); - }, - _ => { break; } - } - } - } - - // byte string literals *must* be ASCII, but the escapes don't have to be - let mut chars = lit.bytes().enumerate().peekable(); - loop { - match chars.next() { - Some((i, b'\\')) => { - match chars.peek().unwrap_or_else(|| error(i)).1 { - b'\n' => eat(&mut chars), - b'\r' => { - chars.next(); - if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' { - panic!("lexer accepted bare CR"); - } - eat(&mut chars); - } - _ => { - // otherwise, a normal escape - let (c, n) = byte_lit(&lit[i..]); - // we don't need to move past the first \ - for _ in 0..n - 1 { - chars.next(); - } - res.push(c); - } - } - }, - Some((i, b'\r')) => { - if chars.peek().unwrap_or_else(|| error(i)).1 != b'\n' { - panic!("lexer accepted bare CR"); - } - chars.next(); - res.push(b'\n'); - } - Some((_, c)) => res.push(c), - None => break, - } - } - - Lrc::new(res) -} - fn integer_lit(s: &str, suffix: Option, diag: Option<(Span, &Handler)>) -> Option { // s can only be ascii, byte indexing is fine diff --git 
a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs new file mode 100644 index 0000000000000..90ee549db017b --- /dev/null +++ b/src/libsyntax/parse/unescape.rs @@ -0,0 +1,515 @@ +//! Utilities for validating string and char literals and turning them into +//! the values they represent. + +use std::str::Chars; +use std::ops::Range; + +#[derive(Debug, PartialEq, Eq)] +pub(crate) enum EscapeError { + ZeroChars, + MoreThanOneChar, + + LoneSlash, + InvalidEscape, + BareCarriageReturn, + EscapeOnlyChar, + + TooShortHexEscape, + InvalidCharInHexEscape, + OutOfRangeHexEscape, + + NoBraceInUnicodeEscape, + InvalidCharInUnicodeEscape, + EmptyUnicodeEscape, + UnclosedUnicodeEscape, + LeadingUnderscoreUnicodeEscape, + OverlongUnicodeEscape, + LoneSurrogateUnicodeEscape, + OutOfRangeUnicodeEscape, + + UnicodeEscapeInByte, + NonAsciiCharInByte, +} + +/// Takes the contents of a char literal (without quotes) and returns an +/// unescaped char or an error. +pub(crate) fn unescape_char(literal_text: &str) -> Result { + let mut chars = literal_text.chars(); + unescape_char_or_byte(&mut chars, Mode::Char) + .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) +} + +/// Takes the contents of a string literal (without quotes) and produces a +/// sequence of unescaped characters or errors. +pub(crate) fn unescape_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + unescape_str_or_byte_str(literal_text, Mode::Str, callback) +} + +pub(crate) fn unescape_byte(literal_text: &str) -> Result { + let mut chars = literal_text.chars(); + unescape_char_or_byte(&mut chars, Mode::Byte) + .map(byte_from_char) + .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) +} + +/// Takes the contents of a byte string literal (without quotes) and produces a +/// sequence of unescaped bytes or errors. 
+pub(crate) fn unescape_byte_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { + callback(range, char.map(byte_from_char)) + }) +} + +#[derive(Debug, Clone, Copy)] +pub(crate) enum Mode { + Char, + Str, + Byte, + ByteStr, +} + +impl Mode { + fn in_single_quotes(self) -> bool { + match self { + Mode::Char | Mode::Byte => true, + Mode::Str | Mode::ByteStr => false, + } + } + + pub(crate) fn in_double_quotes(self) -> bool { + !self.in_single_quotes() + } + + pub(crate) fn is_bytes(self) -> bool { + match self { + Mode::Byte | Mode::ByteStr => true, + Mode::Char | Mode::Str => false, + } + } +} + + +fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result { + if first_char != '\\' { + return match first_char { + '\t' | '\n' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(if chars.clone().next() == Some('\n') { + EscapeError::EscapeOnlyChar + } else { + EscapeError::BareCarriageReturn + }), + '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), + '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), + _ => { + if mode.is_bytes() && !first_char.is_ascii() { + return Err(EscapeError::NonAsciiCharInByte); + } + Ok(first_char) + } + }; + } + + let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; + + let res = match second_char { + '"' => '"', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '0' => '\0', + + 'x' => { + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let value = hi * 16 + lo; + + if !mode.is_bytes() && !is_ascii(value) { + return Err(EscapeError::OutOfRangeHexEscape); + } + let value = value as u8; + + value as char + } + + 'u' => { + if 
chars.next() != Some('{') { + return Err(EscapeError::NoBraceInUnicodeEscape); + } + + let mut n_digits = 1; + let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { + '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), + '}' => return Err(EscapeError::EmptyUnicodeEscape), + c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, + }; + + loop { + match chars.next() { + None => return Err(EscapeError::UnclosedUnicodeEscape), + Some('_') => continue, + Some('}') => { + if n_digits > 6 { + return Err(EscapeError::OverlongUnicodeEscape); + } + if mode.is_bytes() { + return Err(EscapeError::UnicodeEscapeInByte); + } + + break std::char::from_u32(value).ok_or_else(|| { + if value > 0x10FFFF { + EscapeError::OutOfRangeUnicodeEscape + } else { + EscapeError::LoneSurrogateUnicodeEscape + } + })?; + } + Some(c) => { + let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; + n_digits += 1; + if n_digits > 6 { + continue; + } + let digit = digit as u32; + value = value * 16 + digit; + } + }; + } + } + _ => return Err(EscapeError::InvalidEscape), + }; + Ok(res) +} + +fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { + let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; + let res = scan_escape(first_char, chars, mode)?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); + } + Ok(res) +} + +/// Takes the contents of a string or byte string literal (without quotes) and +/// produces a sequence of unescaped characters or errors.
+fn unescape_str_or_byte_str(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + assert!(mode.in_double_quotes()); + let initial_len = src.len(); + let mut chars = src.chars(); + while let Some(first_char) = chars.next() { + let start = initial_len - chars.as_str().len() - first_char.len_utf8(); + + let unescaped_char = match first_char { + '\\' => { + let (second_char, third_char) = { + let mut chars = chars.clone(); + (chars.next(), chars.next()) + }; + match (second_char, third_char) { + (Some('\n'), _) | (Some('\r'), Some('\n')) => { + skip_ascii_whitespace(&mut chars); + continue; + } + _ => scan_escape(first_char, &mut chars, mode), + } + } + '\r' => { + let second_char = chars.clone().next(); + if second_char == Some('\n') { + chars.next(); + Ok('\n') + } else { + scan_escape(first_char, &mut chars, mode) + } + } + '\n' => Ok('\n'), + '\t' => Ok('\t'), + _ => scan_escape(first_char, &mut chars, mode), + }; + let end = initial_len - chars.as_str().len(); + callback(start..end, unescaped_char); + } + + fn skip_ascii_whitespace(chars: &mut Chars<'_>) { + let str = chars.as_str(); + let first_non_space = str + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(str.len()); + *chars = str[first_non_space..].chars() + } +} + +fn byte_from_char(c: char) -> u8 { + let res = c as u32; + assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte"); + res as u8 +} + +fn is_ascii(x: u32) -> bool { + x <= 0x7F +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_unescape_char_bad() { + fn check(literal_text: &str, expected_error: EscapeError) { + let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err); + assert_eq!(actual_result, Err(expected_error)); + } + + check("", EscapeError::ZeroChars); + check(r"\", EscapeError::LoneSlash); + + check("\n", EscapeError::EscapeOnlyChar); + check("\r\n", EscapeError::EscapeOnlyChar); + check("\t", 
EscapeError::EscapeOnlyChar); + check("'", EscapeError::EscapeOnlyChar); + check("\r", EscapeError::BareCarriageReturn); + + check("spam", EscapeError::MoreThanOneChar); + check(r"\x0ff", EscapeError::MoreThanOneChar); + check(r#"\"a"#, EscapeError::MoreThanOneChar); + check(r"\na", EscapeError::MoreThanOneChar); + check(r"\ra", EscapeError::MoreThanOneChar); + check(r"\ta", EscapeError::MoreThanOneChar); + check(r"\\a", EscapeError::MoreThanOneChar); + check(r"\'a", EscapeError::MoreThanOneChar); + check(r"\0a", EscapeError::MoreThanOneChar); + check(r"\u{0}x", EscapeError::MoreThanOneChar); + check(r"\u{1F63b}}", EscapeError::MoreThanOneChar); + + check(r"\v", EscapeError::InvalidEscape); + check(r"\💩", EscapeError::InvalidEscape); + check(r"\●", EscapeError::InvalidEscape); + + check(r"\x", EscapeError::TooShortHexEscape); + check(r"\x0", EscapeError::TooShortHexEscape); + check(r"\xf", EscapeError::TooShortHexEscape); + check(r"\xa", EscapeError::TooShortHexEscape); + check(r"\xx", EscapeError::InvalidCharInHexEscape); + check(r"\xы", EscapeError::InvalidCharInHexEscape); + check(r"\x🦀", EscapeError::InvalidCharInHexEscape); + check(r"\xtt", EscapeError::InvalidCharInHexEscape); + check(r"\xff", EscapeError::OutOfRangeHexEscape); + check(r"\xFF", EscapeError::OutOfRangeHexEscape); + check(r"\x80", EscapeError::OutOfRangeHexEscape); + + check(r"\u", EscapeError::NoBraceInUnicodeEscape); + check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); + check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); + check(r"\u{", EscapeError::UnclosedUnicodeEscape); + check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); + check(r"\u{}", EscapeError::EmptyUnicodeEscape); + check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); + check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); + check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape); + check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); + check(r"\u{ffffff}", 
EscapeError::OutOfRangeUnicodeEscape); + + check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape); + + check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape); + } + + #[test] + fn test_unescape_char_good() { + fn check(literal_text: &str, expected_char: char) { + let actual_result = unescape_char(literal_text); + assert_eq!(actual_result, Ok(expected_char)); + } + + check("a", 'a'); + check("ы", 'ы'); + check("🦀", '🦀'); + + check(r#"\""#, '"'); + check(r"\n", '\n'); + check(r"\r", '\r'); + check(r"\t", '\t'); + check(r"\\", '\\'); + check(r"\'", '\''); + check(r"\0", '\0'); + + check(r"\x00", '\0'); + check(r"\x5a", 'Z'); + check(r"\x5A", 'Z'); + check(r"\x7f", 127 as char); + + check(r"\u{0}", '\0'); + check(r"\u{000000}", '\0'); + check(r"\u{41}", 'A'); + check(r"\u{0041}", 'A'); + check(r"\u{00_41}", 'A'); + check(r"\u{4__1__}", 'A'); + check(r"\u{1F63b}", '😻'); + } + + #[test] + fn test_unescape_str_good() { + fn check(literal_text: &str, expected: &str) { + let mut buf = Ok(String::with_capacity(literal_text.len())); + unescape_str(literal_text, &mut |range, c| { + if let Ok(b) = &mut buf { + match c { + Ok(c) => b.push(c), + Err(e) => buf = Err((range, e)), + } + } + }); + let buf = buf.as_ref().map(|it| it.as_ref()); + assert_eq!(buf, Ok(expected)) + } + + check("foo", "foo"); + check("", ""); + check(" \t\n\r\n", " \t\n\n"); + + check("hello \\\n world", "hello world"); + check("hello \\\r\n world", "hello world"); + check("thread's", "thread's") + } + + #[test] + fn test_unescape_byte_bad() { + fn check(literal_text: &str, expected_error: EscapeError) { + let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err); + assert_eq!(actual_result, Err(expected_error)); + } + + check("", 
EscapeError::ZeroChars); + check(r"\", EscapeError::LoneSlash); + + check("\n", EscapeError::EscapeOnlyChar); + check("\r\n", EscapeError::EscapeOnlyChar); + check("\t", EscapeError::EscapeOnlyChar); + check("'", EscapeError::EscapeOnlyChar); + check("\r", EscapeError::BareCarriageReturn); + + check("spam", EscapeError::MoreThanOneChar); + check(r"\x0ff", EscapeError::MoreThanOneChar); + check(r#"\"a"#, EscapeError::MoreThanOneChar); + check(r"\na", EscapeError::MoreThanOneChar); + check(r"\ra", EscapeError::MoreThanOneChar); + check(r"\ta", EscapeError::MoreThanOneChar); + check(r"\\a", EscapeError::MoreThanOneChar); + check(r"\'a", EscapeError::MoreThanOneChar); + check(r"\0a", EscapeError::MoreThanOneChar); + + check(r"\v", EscapeError::InvalidEscape); + check(r"\💩", EscapeError::InvalidEscape); + check(r"\●", EscapeError::InvalidEscape); + + check(r"\x", EscapeError::TooShortHexEscape); + check(r"\x0", EscapeError::TooShortHexEscape); + check(r"\xa", EscapeError::TooShortHexEscape); + check(r"\xf", EscapeError::TooShortHexEscape); + check(r"\xx", EscapeError::InvalidCharInHexEscape); + check(r"\xы", EscapeError::InvalidCharInHexEscape); + check(r"\x🦀", EscapeError::InvalidCharInHexEscape); + check(r"\xtt", EscapeError::InvalidCharInHexEscape); + + check(r"\u", EscapeError::NoBraceInUnicodeEscape); + check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); + check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); + check(r"\u{", EscapeError::UnclosedUnicodeEscape); + check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); + check(r"\u{}", EscapeError::EmptyUnicodeEscape); + check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); + check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); + + check("ы", EscapeError::NonAsciiCharInByte); + check("🦀", EscapeError::NonAsciiCharInByte); + + check(r"\u{0}", EscapeError::UnicodeEscapeInByte); + check(r"\u{000000}", EscapeError::UnicodeEscapeInByte); + check(r"\u{41}", EscapeError::UnicodeEscapeInByte); + 
check(r"\u{0041}", EscapeError::UnicodeEscapeInByte); + check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte); + check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte); + check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte); + check(r"\u{0}x", EscapeError::UnicodeEscapeInByte); + check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte); + check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte); + check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); + check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte); + check(r"\u{D800}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte); + } + + #[test] + fn test_unescape_byte_good() { + fn check(literal_text: &str, expected_byte: u8) { + let actual_result = unescape_byte(literal_text); + assert_eq!(actual_result, Ok(expected_byte)); + } + + check("a", b'a'); + + check(r#"\""#, b'"'); + check(r"\n", b'\n'); + check(r"\r", b'\r'); + check(r"\t", b'\t'); + check(r"\\", b'\\'); + check(r"\'", b'\''); + check(r"\0", b'\0'); + + check(r"\x00", b'\0'); + check(r"\x5a", b'Z'); + check(r"\x5A", b'Z'); + check(r"\x7f", 127); + check(r"\x80", 128); + check(r"\xff", 255); + check(r"\xFF", 255); + } + + #[test] + fn test_unescape_byte_str_good() { + fn check(literal_text: &str, expected: &[u8]) { + let mut buf = Ok(Vec::with_capacity(literal_text.len())); + unescape_byte_str(literal_text, &mut |range, c| { + if let Ok(b) = &mut buf { + match c { + Ok(c) => b.push(c), + Err(e) => buf = Err((range, e)), + } + } + }); + let buf = buf.as_ref().map(|it| it.as_ref()); + assert_eq!(buf, Ok(expected)) + } + + check("foo", b"foo"); + check("", b""); + check(" \t\n\r\n", b" \t\n\n"); + + check("hello \\\n world", b"hello world"); + check("hello \\\r\n world", b"hello world"); + 
check("thread's", b"thread's") + } +} diff --git a/src/libsyntax/parse/unescape_error_reporting.rs b/src/libsyntax/parse/unescape_error_reporting.rs new file mode 100644 index 0000000000000..22777c0884f47 --- /dev/null +++ b/src/libsyntax/parse/unescape_error_reporting.rs @@ -0,0 +1,200 @@ +//! Utilities for rendering escape sequence errors as diagnostics. + +use std::ops::Range; +use std::iter::once; + +use syntax_pos::{Span, BytePos}; + +use crate::errors::{Handler, Applicability}; + +use super::unescape::{EscapeError, Mode}; + +pub(crate) fn emit_unescape_error( + handler: &Handler, + // interior part of the literal, without quotes + lit: &str, + // full span of the literal, including quotes + span_with_quotes: Span, + mode: Mode, + // range of the error inside `lit` + range: Range, + error: EscapeError, +) { + log::debug!("emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}", + lit, span_with_quotes, mode, range, error); + let span = { + let Range { start, end } = range; + let (start, end) = (start as u32, end as u32); + let lo = span_with_quotes.lo() + BytePos(start + 1); + let hi = lo + BytePos(end - start); + span_with_quotes + .with_lo(lo) + .with_hi(hi) + }; + let last_char = || { + let c = lit[range.clone()].chars().rev().next().unwrap(); + let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32)); + (c, span) + }; + match error { + EscapeError::LoneSurrogateUnicodeEscape => { + handler.struct_span_err(span, "invalid unicode character escape") + .help("unicode escape must not be a surrogate") + .emit(); + } + EscapeError::OutOfRangeUnicodeEscape => { + handler.struct_span_err(span, "invalid unicode character escape") + .help("unicode escape must be at most 10FFFF") + .emit(); + } + EscapeError::MoreThanOneChar => { + handler + .struct_span_err( + span_with_quotes, + "character literal may only contain one codepoint", + ) + .span_suggestion( + span_with_quotes, + "if you meant to write a `str` literal, use double quotes", + format!("\"{}\"", lit), + 
Applicability::MachineApplicable, + ).emit() + } + EscapeError::EscapeOnlyChar => { + let (c, _span) = last_char(); + + let mut msg = if mode.is_bytes() { + "byte constant must be escaped: " + } else { + "character constant must be escaped: " + }.to_string(); + push_escaped_char(&mut msg, c); + + handler.span_err(span, msg.as_str()) + } + EscapeError::BareCarriageReturn => { + let msg = if mode.in_double_quotes() { + "bare CR not allowed in string, use \\r instead" + } else { + "character constant must be escaped: \\r" + }; + handler.span_err(span, msg); + } + EscapeError::InvalidEscape => { + let (c, span) = last_char(); + + let label = if mode.is_bytes() { + "unknown byte escape" + } else { + "unknown character escape" + }; + let mut msg = label.to_string(); + msg.push_str(": "); + push_escaped_char(&mut msg, c); + + let mut diag = handler.struct_span_err(span, msg.as_str()); + diag.span_label(span, label); + if c == '{' || c == '}' && !mode.is_bytes() { + diag.help("if used in a formatting string, \ + curly braces are escaped with `{{` and `}}`"); + } else if c == '\r' { + diag.help("this is an isolated carriage return; \ + consider checking your editor and version control settings"); + } + diag.emit(); + } + EscapeError::TooShortHexEscape => { + handler.span_err(span, "numeric character escape is too short") + } + EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => { + let (c, span) = last_char(); + + let mut msg = if error == EscapeError::InvalidCharInHexEscape { + "invalid character in numeric character escape: " + } else { + "invalid character in unicode escape: " + }.to_string(); + push_escaped_char(&mut msg, c); + + handler.span_err(span, msg.as_str()) + } + EscapeError::NonAsciiCharInByte => { + assert!(mode.is_bytes()); + let (_c, span) = last_char(); + handler.span_err(span, "byte constant must be ASCII. 
\ + Use a \\xHH escape for a non-ASCII byte") + } + EscapeError::OutOfRangeHexEscape => { + handler.span_err(span, "this form of character escape may only be used \ + with characters in the range [\\x00-\\x7f]") + } + EscapeError::LeadingUnderscoreUnicodeEscape => { + let (_c, span) = last_char(); + handler.span_err(span, "invalid start of unicode escape") + } + EscapeError::OverlongUnicodeEscape => { + handler.span_err(span, "overlong unicode escape (must have at most 6 hex digits)") + } + EscapeError::UnclosedUnicodeEscape => { + handler.span_err(span, "unterminated unicode escape (needed a `}`)") + } + EscapeError::NoBraceInUnicodeEscape => { + let msg = "incorrect unicode escape sequence"; + let mut diag = handler.struct_span_err(span, msg); + + let mut suggestion = "\\u{".to_owned(); + let mut suggestion_len = 0; + let (c, char_span) = last_char(); + let chars = once(c).chain(lit[range.end..].chars()); + for c in chars.take(6).take_while(|c| c.is_digit(16)) { + suggestion.push(c); + suggestion_len += c.len_utf8(); + } + + if suggestion_len > 0 { + suggestion.push('}'); + let lo = char_span.lo(); + let hi = lo + BytePos(suggestion_len as u32); + diag.span_suggestion( + span.with_lo(lo).with_hi(hi), + "format of unicode escape sequences uses braces", + suggestion, + Applicability::MaybeIncorrect, + ); + } else { + diag.span_label(span, msg); + diag.help( + "format of unicode escape sequences is `\\u{...}`", + ); + } + + diag.emit(); + } + EscapeError::UnicodeEscapeInByte => { + handler.span_err(span, "unicode escape sequences cannot be used \ + as a byte or in a byte string") + } + EscapeError::EmptyUnicodeEscape => { + handler.span_err(span, "empty unicode escape (must have at least 1 hex digit)") + } + EscapeError::ZeroChars => { + handler.span_err(span, "empty character literal") + } + EscapeError::LoneSlash => { + panic!("lexer accepted unterminated literal with trailing slash") + } + } +} + +/// Pushes a character to a message string for error reporting 
+pub(crate) fn push_escaped_char(msg: &mut String, c: char) { + match c { + '\u{20}'..='\u{7e}' => { + // Don't escape \, ' or " for user-facing messages + msg.push(c); + } + _ => { + msg.extend(c.escape_default()); + } + } +} diff --git a/src/test/ui/fmt/format-string-error-2.rs b/src/test/ui/fmt/format-string-error-2.rs index 5c25ae502ff6d..e9169d338f7ec 100644 --- a/src/test/ui/fmt/format-string-error-2.rs +++ b/src/test/ui/fmt/format-string-error-2.rs @@ -1,3 +1,4 @@ +// compile-flags: -Z continue-parse-after-error // ignore-tidy-tab fn main() { @@ -76,7 +77,7 @@ raw { \n println!("\x7B}\u8 {", 1); //~^ ERROR incorrect unicode escape sequence - //~| ERROR argument never used + //~| ERROR invalid format string: expected `'}'` but string was terminated // note: raw strings don't escape `\xFF` and `\u{FF}` sequences println!(r#"\x7B}\u{8} {"#, 1); diff --git a/src/test/ui/fmt/format-string-error-2.stderr b/src/test/ui/fmt/format-string-error-2.stderr index 66d35a1b854d3..6656cc8236973 100644 --- a/src/test/ui/fmt/format-string-error-2.stderr +++ b/src/test/ui/fmt/format-string-error-2.stderr @@ -1,13 +1,13 @@ error: incorrect unicode escape sequence - --> $DIR/format-string-error-2.rs:77:20 + --> $DIR/format-string-error-2.rs:78:20 | LL | println!("\x7B}\u8 {", 1); | ^^- - | | - | help: format of unicode escape sequences uses braces: `\u{8}` + | | + | help: format of unicode escape sequences uses braces: `\u{8}` error: invalid format string: expected `'}'`, found `'a'` - --> $DIR/format-string-error-2.rs:5:5 + --> $DIR/format-string-error-2.rs:6:5 | LL | format!("{ | - because of this opening brace @@ -17,7 +17,7 @@ LL | a"); = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'b'` - --> $DIR/format-string-error-2.rs:9:5 + --> $DIR/format-string-error-2.rs:10:5 | LL | format!("{ \ | - because of this opening brace @@ -28,7 +28,7 @@ LL | b"); = note: if you intended to print `{`, you can escape it 
using `{{` error: invalid format string: expected `'}'`, found `'\'` - --> $DIR/format-string-error-2.rs:11:18 + --> $DIR/format-string-error-2.rs:12:18 | LL | format!(r#"{ \ | - ^ expected `}` in format string @@ -38,7 +38,7 @@ LL | format!(r#"{ \ = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'\'` - --> $DIR/format-string-error-2.rs:15:18 + --> $DIR/format-string-error-2.rs:16:18 | LL | format!(r#"{ \n | - ^ expected `}` in format string @@ -48,7 +48,7 @@ LL | format!(r#"{ \n = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'e'` - --> $DIR/format-string-error-2.rs:21:5 + --> $DIR/format-string-error-2.rs:22:5 | LL | format!("{ \n | - because of this opening brace @@ -59,7 +59,7 @@ LL | e"); = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'a'` - --> $DIR/format-string-error-2.rs:25:5 + --> $DIR/format-string-error-2.rs:26:5 | LL | { | - because of this opening brace @@ -69,7 +69,7 @@ LL | a"); = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'a'` - --> $DIR/format-string-error-2.rs:29:5 + --> $DIR/format-string-error-2.rs:30:5 | LL | { | - because of this opening brace @@ -79,7 +79,7 @@ LL | a = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'b'` - --> $DIR/format-string-error-2.rs:35:5 + --> $DIR/format-string-error-2.rs:36:5 | LL | { \ | - because of this opening brace @@ -90,7 +90,7 @@ LL | b"); = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'b'` - --> $DIR/format-string-error-2.rs:40:5 + --> $DIR/format-string-error-2.rs:41:5 | LL | { \ | - because of this opening brace @@ -101,7 +101,7 @@ LL | b \ = note: if you intended to print `{`, 
you can escape it using `{{` error: invalid format string: expected `'}'`, found `'\'` - --> $DIR/format-string-error-2.rs:45:8 + --> $DIR/format-string-error-2.rs:46:8 | LL | raw { \ | - ^ expected `}` in format string @@ -111,7 +111,7 @@ LL | raw { \ = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'\'` - --> $DIR/format-string-error-2.rs:50:8 + --> $DIR/format-string-error-2.rs:51:8 | LL | raw { \n | - ^ expected `}` in format string @@ -121,7 +121,7 @@ LL | raw { \n = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'e'` - --> $DIR/format-string-error-2.rs:57:5 + --> $DIR/format-string-error-2.rs:58:5 | LL | { \n | - because of this opening brace @@ -132,7 +132,7 @@ LL | e"); = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: expected `'}'`, found `'a'` - --> $DIR/format-string-error-2.rs:67:5 + --> $DIR/format-string-error-2.rs:68:5 | LL | { | - because of this opening brace @@ -142,13 +142,13 @@ LL | asdf} = note: if you intended to print `{`, you can escape it using `{{` error: 1 positional argument in format string, but no arguments were given - --> $DIR/format-string-error-2.rs:70:17 + --> $DIR/format-string-error-2.rs:71:17 | LL | println!("\t{}"); | ^^ error: invalid format string: expected `'}'` but string was terminated - --> $DIR/format-string-error-2.rs:74:27 + --> $DIR/format-string-error-2.rs:75:27 | LL | println!("\x7B}\u{8} {", 1); | -^ expected `'}'` in format string @@ -157,16 +157,18 @@ LL | println!("\x7B}\u{8} {", 1); | = note: if you intended to print `{`, you can escape it using `{{` -error: argument never used - --> $DIR/format-string-error-2.rs:77:28 +error: invalid format string: expected `'}'` but string was terminated + --> $DIR/format-string-error-2.rs:78:27 | LL | println!("\x7B}\u8 {", 1); - | ------------ ^ argument never used - | | - | formatting 
specifier missing + | -^ expected `'}'` in format string + | | + | because of this opening brace + | + = note: if you intended to print `{`, you can escape it using `{{` error: invalid format string: unmatched `}` found - --> $DIR/format-string-error-2.rs:82:21 + --> $DIR/format-string-error-2.rs:83:21 | LL | println!(r#"\x7B}\u{8} {"#, 1); | ^ unmatched `}` in format string @@ -174,7 +176,7 @@ LL | println!(r#"\x7B}\u{8} {"#, 1); = note: if you intended to print `}`, you can escape it using `}}` error: invalid format string: unmatched `}` found - --> $DIR/format-string-error-2.rs:85:21 + --> $DIR/format-string-error-2.rs:86:21 | LL | println!(r#"\x7B}\u8 {"#, 1); | ^ unmatched `}` in format string diff --git a/src/test/ui/parser/ascii-only-character-escape.stderr b/src/test/ui/parser/ascii-only-character-escape.stderr index 8a981e8d62e2b..391677917580b 100644 --- a/src/test/ui/parser/ascii-only-character-escape.stderr +++ b/src/test/ui/parser/ascii-only-character-escape.stderr @@ -1,20 +1,20 @@ error: this form of character escape may only be used with characters in the range [\x00-\x7f] - --> $DIR/ascii-only-character-escape.rs:4:16 + --> $DIR/ascii-only-character-escape.rs:4:14 | LL | let x = "\x80"; - | ^^ + | ^^^^ error: this form of character escape may only be used with characters in the range [\x00-\x7f] - --> $DIR/ascii-only-character-escape.rs:5:16 + --> $DIR/ascii-only-character-escape.rs:5:14 | LL | let y = "\xff"; - | ^^ + | ^^^^ error: this form of character escape may only be used with characters in the range [\x00-\x7f] - --> $DIR/ascii-only-character-escape.rs:6:16 + --> $DIR/ascii-only-character-escape.rs:6:14 | LL | let z = "\xe2"; - | ^^ + | ^^^^ error: aborting due to 3 previous errors diff --git a/src/test/ui/parser/byte-literals.stderr b/src/test/ui/parser/byte-literals.stderr index 28385f34f2ab2..58a5797b90776 100644 --- a/src/test/ui/parser/byte-literals.stderr +++ b/src/test/ui/parser/byte-literals.stderr @@ -34,11 +34,11 @@ error: byte 
constant must be ASCII. Use a \xHH escape for a non-ASCII byte LL | b'é'; | ^ -error: unterminated byte constant: b'a - --> $DIR/byte-literals.rs:14:5 +error: unterminated byte constant + --> $DIR/byte-literals.rs:14:6 | LL | b'a - | ^^^ + | ^^^^ error: aborting due to 7 previous errors diff --git a/src/test/ui/parser/byte-string-literals.stderr b/src/test/ui/parser/byte-string-literals.stderr index b855484444010..eeb2fcd12320b 100644 --- a/src/test/ui/parser/byte-string-literals.stderr +++ b/src/test/ui/parser/byte-string-literals.stderr @@ -23,10 +23,10 @@ LL | b"é"; | ^ error: unterminated double quote byte string - --> $DIR/byte-string-literals.rs:9:7 + --> $DIR/byte-string-literals.rs:9:6 | LL | b"a - | _______^ + | ______^ LL | | } | |__^ diff --git a/src/test/ui/parser/issue-23620-invalid-escapes.rs b/src/test/ui/parser/issue-23620-invalid-escapes.rs index b4b8f1fc0b0ab..53629973a1b5f 100644 --- a/src/test/ui/parser/issue-23620-invalid-escapes.rs +++ b/src/test/ui/parser/issue-23620-invalid-escapes.rs @@ -9,32 +9,27 @@ fn main() { let _ = b'\u'; //~^ ERROR incorrect unicode escape sequence - //~^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string let _ = b'\x5'; //~^ ERROR numeric character escape is too short let _ = b'\xxy'; //~^ ERROR invalid character in numeric character escape: x - //~^^ ERROR invalid character in numeric character escape: y let _ = '\x5'; //~^ ERROR numeric character escape is too short let _ = '\xxy'; //~^ ERROR invalid character in numeric character escape: x - //~^^ ERROR invalid character in numeric character escape: y let _ = b"\u{a4a4} \xf \u"; //~^ ERROR unicode escape sequences cannot be used as a byte or in a byte string //~^^ ERROR invalid character in numeric character escape: //~^^^ ERROR incorrect unicode escape sequence - //~^^^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string let _ = "\xf \u"; //~^ ERROR invalid character in numeric character escape: - //~^^ ERROR 
form of character escape may only be used with characters in the range [\x00-\x7f] - //~^^^ ERROR incorrect unicode escape sequence + //~^^ ERROR incorrect unicode escape sequence let _ = "\u8f"; //~^ ERROR incorrect unicode escape sequence diff --git a/src/test/ui/parser/issue-23620-invalid-escapes.stderr b/src/test/ui/parser/issue-23620-invalid-escapes.stderr index 295ba3b73e861..5fabc1d7e4326 100644 --- a/src/test/ui/parser/issue-23620-invalid-escapes.stderr +++ b/src/test/ui/parser/issue-23620-invalid-escapes.stderr @@ -18,88 +18,58 @@ LL | let _ = b'\u'; | = help: format of unicode escape sequences is `\u{...}` -error: unicode escape sequences cannot be used as a byte or in a byte string - --> $DIR/issue-23620-invalid-escapes.rs:10:15 - | -LL | let _ = b'\u'; - | ^^ - error: numeric character escape is too short - --> $DIR/issue-23620-invalid-escapes.rs:14:17 + --> $DIR/issue-23620-invalid-escapes.rs:13:15 | LL | let _ = b'\x5'; - | ^ + | ^^^ error: invalid character in numeric character escape: x - --> $DIR/issue-23620-invalid-escapes.rs:17:17 + --> $DIR/issue-23620-invalid-escapes.rs:16:17 | LL | let _ = b'\xxy'; | ^ -error: invalid character in numeric character escape: y - --> $DIR/issue-23620-invalid-escapes.rs:17:18 - | -LL | let _ = b'\xxy'; - | ^ - error: numeric character escape is too short - --> $DIR/issue-23620-invalid-escapes.rs:21:16 + --> $DIR/issue-23620-invalid-escapes.rs:19:14 | LL | let _ = '\x5'; - | ^ + | ^^^ error: invalid character in numeric character escape: x - --> $DIR/issue-23620-invalid-escapes.rs:24:16 + --> $DIR/issue-23620-invalid-escapes.rs:22:16 | LL | let _ = '\xxy'; | ^ -error: invalid character in numeric character escape: y - --> $DIR/issue-23620-invalid-escapes.rs:24:17 - | -LL | let _ = '\xxy'; - | ^ - error: unicode escape sequences cannot be used as a byte or in a byte string - --> $DIR/issue-23620-invalid-escapes.rs:28:15 + --> $DIR/issue-23620-invalid-escapes.rs:25:15 | LL | let _ = b"\u{a4a4} \xf \u"; | ^^^^^^^^ 
error: invalid character in numeric character escape: - --> $DIR/issue-23620-invalid-escapes.rs:28:27 + --> $DIR/issue-23620-invalid-escapes.rs:25:27 | LL | let _ = b"\u{a4a4} \xf \u"; | ^ error: incorrect unicode escape sequence - --> $DIR/issue-23620-invalid-escapes.rs:28:28 + --> $DIR/issue-23620-invalid-escapes.rs:25:28 | LL | let _ = b"\u{a4a4} \xf \u"; | ^^ incorrect unicode escape sequence | = help: format of unicode escape sequences is `\u{...}` -error: unicode escape sequences cannot be used as a byte or in a byte string - --> $DIR/issue-23620-invalid-escapes.rs:28:28 - | -LL | let _ = b"\u{a4a4} \xf \u"; - | ^^ - error: invalid character in numeric character escape: - --> $DIR/issue-23620-invalid-escapes.rs:34:17 + --> $DIR/issue-23620-invalid-escapes.rs:30:17 | LL | let _ = "\xf \u"; | ^ -error: this form of character escape may only be used with characters in the range [\x00-\x7f] - --> $DIR/issue-23620-invalid-escapes.rs:34:16 - | -LL | let _ = "\xf \u"; - | ^^ - error: incorrect unicode escape sequence - --> $DIR/issue-23620-invalid-escapes.rs:34:18 + --> $DIR/issue-23620-invalid-escapes.rs:30:18 | LL | let _ = "\xf \u"; | ^^ incorrect unicode escape sequence @@ -107,12 +77,12 @@ LL | let _ = "\xf \u"; = help: format of unicode escape sequences is `\u{...}` error: incorrect unicode escape sequence - --> $DIR/issue-23620-invalid-escapes.rs:39:14 + --> $DIR/issue-23620-invalid-escapes.rs:34:14 | LL | let _ = "\u8f"; | ^^-- - | | - | help: format of unicode escape sequences uses braces: `\u{8f}` + | | + | help: format of unicode escape sequences uses braces: `\u{8f}` -error: aborting due to 18 previous errors +error: aborting due to 13 previous errors diff --git a/src/test/ui/parser/lex-bad-char-literals-1.stderr b/src/test/ui/parser/lex-bad-char-literals-1.stderr index 414ad81512ae5..000d155c26833 100644 --- a/src/test/ui/parser/lex-bad-char-literals-1.stderr +++ b/src/test/ui/parser/lex-bad-char-literals-1.stderr @@ -1,14 +1,14 @@ error: numeric 
character escape is too short - --> $DIR/lex-bad-char-literals-1.rs:3:8 + --> $DIR/lex-bad-char-literals-1.rs:3:6 | LL | '\x1' - | ^ + | ^^^ error: numeric character escape is too short - --> $DIR/lex-bad-char-literals-1.rs:7:8 + --> $DIR/lex-bad-char-literals-1.rs:7:6 | LL | "\x1" - | ^ + | ^^^ error: unknown character escape: \u{25cf} --> $DIR/lex-bad-char-literals-1.rs:11:7 diff --git a/src/test/ui/parser/lex-bad-char-literals-2.stderr b/src/test/ui/parser/lex-bad-char-literals-2.stderr index 4c1c5c29f472e..b0a4ed02434b4 100644 --- a/src/test/ui/parser/lex-bad-char-literals-2.stderr +++ b/src/test/ui/parser/lex-bad-char-literals-2.stderr @@ -3,6 +3,10 @@ error: character literal may only contain one codepoint | LL | 'nope' | ^^^^^^ +help: if you meant to write a `str` literal, use double quotes + | +LL | "nope" + | ^^^^^^ error[E0601]: `main` function not found in crate `lex_bad_char_literals_2` | diff --git a/src/test/ui/parser/lex-bad-char-literals-4.rs b/src/test/ui/parser/lex-bad-char-literals-4.rs index e13f11f36df48..de0a19df99360 100644 --- a/src/test/ui/parser/lex-bad-char-literals-4.rs +++ b/src/test/ui/parser/lex-bad-char-literals-4.rs @@ -1,5 +1,5 @@ // // This test needs to the last one appearing in this file as it kills the parser static c: char = - '● //~ ERROR: character literal may only contain one codepoint + '● //~ ERROR: unterminated character literal ; diff --git a/src/test/ui/parser/lex-bad-char-literals-4.stderr b/src/test/ui/parser/lex-bad-char-literals-4.stderr index 7bcca3761fc60..8f8f806f6cf61 100644 --- a/src/test/ui/parser/lex-bad-char-literals-4.stderr +++ b/src/test/ui/parser/lex-bad-char-literals-4.stderr @@ -1,8 +1,8 @@ -error: character literal may only contain one codepoint: '● +error: unterminated character literal --> $DIR/lex-bad-char-literals-4.rs:4:5 | LL | '● - | ^^ + | ^^^^ error: aborting due to previous error diff --git a/src/test/ui/parser/lex-bad-char-literals-6.stderr 
b/src/test/ui/parser/lex-bad-char-literals-6.stderr index 74959c9a4ed4a..a7bbe05e94b7b 100644 --- a/src/test/ui/parser/lex-bad-char-literals-6.stderr +++ b/src/test/ui/parser/lex-bad-char-literals-6.stderr @@ -3,18 +3,30 @@ error: character literal may only contain one codepoint | LL | let x: &str = 'ab'; | ^^^^ +help: if you meant to write a `str` literal, use double quotes + | +LL | let x: &str = "ab"; + | ^^^^ error: character literal may only contain one codepoint --> $DIR/lex-bad-char-literals-6.rs:4:19 | LL | let y: char = 'cd'; | ^^^^ +help: if you meant to write a `str` literal, use double quotes + | +LL | let y: char = "cd"; + | ^^^^ error: character literal may only contain one codepoint --> $DIR/lex-bad-char-literals-6.rs:6:13 | LL | let z = 'ef'; | ^^^^ +help: if you meant to write a `str` literal, use double quotes + | +LL | let z = "ef"; + | ^^^^ error[E0277]: can't compare `&str` with `char` --> $DIR/lex-bad-char-literals-6.rs:9:10 diff --git a/src/test/ui/parser/lex-bad-char-literals-7.rs b/src/test/ui/parser/lex-bad-char-literals-7.rs new file mode 100644 index 0000000000000..70eafcb91dacb --- /dev/null +++ b/src/test/ui/parser/lex-bad-char-literals-7.rs @@ -0,0 +1,14 @@ +// compile-flags: -Z continue-parse-after-error +fn main() { + let _: char = ''; + //~^ ERROR: empty character literal + let _: char = '\u{}'; + //~^ ERROR: empty unicode escape (must have at least 1 hex digit) + + // Next two are OK, but may befool error recovery + let _ = '/'; + let _ = b'/'; + + let _ = ' hello // here's a comment + //~^ ERROR: unterminated character literal +} diff --git a/src/test/ui/parser/lex-bad-char-literals-7.stderr b/src/test/ui/parser/lex-bad-char-literals-7.stderr new file mode 100644 index 0000000000000..e1ba3c3ee0f17 --- /dev/null +++ b/src/test/ui/parser/lex-bad-char-literals-7.stderr @@ -0,0 +1,20 @@ +error: empty character literal + --> $DIR/lex-bad-char-literals-7.rs:3:20 + | +LL | let _: char = ''; + | ^ + +error: empty unicode escape (must 
have at least 1 hex digit) + --> $DIR/lex-bad-char-literals-7.rs:5:20 + | +LL | let _: char = '\u{}'; + | ^^^^ + +error: unterminated character literal + --> $DIR/lex-bad-char-literals-7.rs:12:13 + | +LL | let _ = ' hello // here's a comment + | ^^^^^^^^ + +error: aborting due to 3 previous errors + diff --git a/src/test/ui/parser/macro/literals-are-validated-before-expansion.rs b/src/test/ui/parser/macro/literals-are-validated-before-expansion.rs new file mode 100644 index 0000000000000..c3fc754b5567f --- /dev/null +++ b/src/test/ui/parser/macro/literals-are-validated-before-expansion.rs @@ -0,0 +1,10 @@ +macro_rules! black_hole { + ($($tt:tt)*) => {} +} + +fn main() { + black_hole! { '\u{FFFFFF}' } + //~^ ERROR: invalid unicode character escape + black_hole! { "this is surrogate: \u{DAAA}" } + //~^ ERROR: invalid unicode character escape +} diff --git a/src/test/ui/parser/macro/literals-are-validated-before-expansion.stderr b/src/test/ui/parser/macro/literals-are-validated-before-expansion.stderr new file mode 100644 index 0000000000000..d20eb0fb30a49 --- /dev/null +++ b/src/test/ui/parser/macro/literals-are-validated-before-expansion.stderr @@ -0,0 +1,18 @@ +error: invalid unicode character escape + --> $DIR/literals-are-validated-before-expansion.rs:6:20 + | +LL | black_hole! { '\u{FFFFFF}' } + | ^^^^^^^^^^ + | + = help: unicode escape must be at most 10FFFF + +error: invalid unicode character escape + --> $DIR/literals-are-validated-before-expansion.rs:8:39 + | +LL | black_hole! 
{ "this is surrogate: \u{DAAA}" } + | ^^^^^^^^ + | + = help: unicode escape must not be a surrogate + +error: aborting due to 2 previous errors + diff --git a/src/test/ui/parser/new-unicode-escapes-1.stderr b/src/test/ui/parser/new-unicode-escapes-1.stderr index a8da50951ddf3..22d6a0981ffd6 100644 --- a/src/test/ui/parser/new-unicode-escapes-1.stderr +++ b/src/test/ui/parser/new-unicode-escapes-1.stderr @@ -1,8 +1,8 @@ error: unterminated unicode escape (needed a `}`) - --> $DIR/new-unicode-escapes-1.rs:2:21 + --> $DIR/new-unicode-escapes-1.rs:2:14 | LL | let s = "\u{2603"; - | ^ + | ^^^^^^^ error: aborting due to previous error diff --git a/src/test/ui/parser/new-unicode-escapes-2.stderr b/src/test/ui/parser/new-unicode-escapes-2.stderr index ede49cdf7e1d1..b5148279c7450 100644 --- a/src/test/ui/parser/new-unicode-escapes-2.stderr +++ b/src/test/ui/parser/new-unicode-escapes-2.stderr @@ -1,8 +1,8 @@ error: overlong unicode escape (must have at most 6 hex digits) - --> $DIR/new-unicode-escapes-2.rs:2:17 + --> $DIR/new-unicode-escapes-2.rs:2:14 | LL | let s = "\u{260311111111}"; - | ^^^^^^^^^^^^ + | ^^^^^^^^^^^^^^^^ error: aborting due to previous error diff --git a/src/test/ui/parser/new-unicode-escapes-3.stderr b/src/test/ui/parser/new-unicode-escapes-3.stderr index 59cfb988f2897..361698467f97d 100644 --- a/src/test/ui/parser/new-unicode-escapes-3.stderr +++ b/src/test/ui/parser/new-unicode-escapes-3.stderr @@ -1,16 +1,16 @@ error: invalid unicode character escape - --> $DIR/new-unicode-escapes-3.rs:2:14 + --> $DIR/new-unicode-escapes-3.rs:2:15 | LL | let s1 = "\u{d805}"; - | ^^^^^^^^^^ + | ^^^^^^^^ | = help: unicode escape must not be a surrogate error: invalid unicode character escape - --> $DIR/new-unicode-escapes-3.rs:3:14 + --> $DIR/new-unicode-escapes-3.rs:3:15 | LL | let s2 = "\u{ffffff}"; - | ^^^^^^^^^^^^ + | ^^^^^^^^^^ | = help: unicode escape must be at most 10FFFF From 1835cbeb6574997ec5188cb22b9538c61976d2b4 Mon Sep 17 00:00:00 2001 From: Aleksey 
Kladov Date: Thu, 2 May 2019 20:56:07 +0300 Subject: [PATCH 2/2] don't amplify errors in format! with bad literals --- src/libsyntax/ext/base.rs | 1 + src/libsyntax/parse/mod.rs | 21 +++++++++++++------- src/test/ui/fmt/format-string-error-2.rs | 1 - src/test/ui/fmt/format-string-error-2.stderr | 16 +++------------ src/test/ui/str/str-as-char.fixed | 3 +-- src/test/ui/str/str-as-char.rs | 1 - src/test/ui/str/str-as-char.stderr | 12 +---------- 7 files changed, 20 insertions(+), 35 deletions(-) diff --git a/src/libsyntax/ext/base.rs b/src/libsyntax/ext/base.rs index 452cc2f2c65cc..cc19acb61adc2 100644 --- a/src/libsyntax/ext/base.rs +++ b/src/libsyntax/ext/base.rs @@ -998,6 +998,7 @@ pub fn expr_to_spanned_string<'a>( Err(match expr.node { ast::ExprKind::Lit(ref l) => match l.node { ast::LitKind::Str(s, style) => return Ok(respan(expr.span, (s, style))), + ast::LitKind::Err(_) => None, _ => Some(cx.struct_span_err(l.span, err_msg)) }, ast::ExprKind::Err => None, diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index d6d4f8e8f0429..34a86bab2294f 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -33,7 +33,7 @@ pub mod attr; pub mod classify; pub(crate) mod unescape; -use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte, EscapeError}; +use unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte}; pub(crate) mod unescape_error_reporting; @@ -355,16 +355,14 @@ crate fn lit_token(lit: token::Lit, suf: Option, diag: Option<(Span, &Ha token::Byte(i) => { let lit_kind = match unescape_byte(&i.as_str()) { Ok(c) => LitKind::Byte(c), - Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i), - Err(_) => LitKind::Byte(0), + Err(_) => LitKind::Err(i), }; (true, Some(lit_kind)) }, token::Char(i) => { let lit_kind = match unescape_char(&i.as_str()) { Ok(c) => LitKind::Char(c), - Err((_, EscapeError::MoreThanOneChar)) => LitKind::Err(i), - Err(_) => LitKind::Char('\u{FFFD}'), + Err(_) => 
LitKind::Err(i), }; (true, Some(lit_kind)) }, @@ -380,17 +378,22 @@ crate fn lit_token(lit: token::Lit, suf: Option, diag: Option<(Span, &Ha // reuse the symbol from the Token. Otherwise, we must generate a // new symbol because the string in the LitKind is different to the // string in the Token. + let mut has_error = false; let s = &sym.as_str(); if s.as_bytes().iter().any(|&c| c == b'\\' || c == b'\r') { let mut buf = String::with_capacity(s.len()); unescape_str(s, &mut |_, unescaped_char| { match unescaped_char { Ok(c) => buf.push(c), - Err(_) => buf.push('\u{FFFD}'), + Err(_) => has_error = true, } }); + if has_error { + return (true, Some(LitKind::Err(sym))); + } sym = Symbol::intern(&buf) } + (true, Some(LitKind::Str(sym, ast::StrStyle::Cooked))) } token::StrRaw(mut sym, n) => { @@ -404,12 +407,16 @@ crate fn lit_token(lit: token::Lit, suf: Option, diag: Option<(Span, &Ha token::ByteStr(i) => { let s = &i.as_str(); let mut buf = Vec::with_capacity(s.len()); + let mut has_error = false; unescape_byte_str(s, &mut |_, unescaped_byte| { match unescaped_byte { Ok(c) => buf.push(c), - Err(_) => buf.push(0), + Err(_) => has_error = true, } }); + if has_error { + return (true, Some(LitKind::Err(i))); + } buf.shrink_to_fit(); (true, Some(LitKind::ByteStr(Lrc::new(buf)))) } diff --git a/src/test/ui/fmt/format-string-error-2.rs b/src/test/ui/fmt/format-string-error-2.rs index e9169d338f7ec..8ca98fc266a01 100644 --- a/src/test/ui/fmt/format-string-error-2.rs +++ b/src/test/ui/fmt/format-string-error-2.rs @@ -77,7 +77,6 @@ raw { \n println!("\x7B}\u8 {", 1); //~^ ERROR incorrect unicode escape sequence - //~| ERROR invalid format string: expected `'}'` but string was terminated // note: raw strings don't escape `\xFF` and `\u{FF}` sequences println!(r#"\x7B}\u{8} {"#, 1); diff --git a/src/test/ui/fmt/format-string-error-2.stderr b/src/test/ui/fmt/format-string-error-2.stderr index 6656cc8236973..227ec27efc87f 100644 --- a/src/test/ui/fmt/format-string-error-2.stderr +++ 
b/src/test/ui/fmt/format-string-error-2.stderr @@ -157,18 +157,8 @@ LL | println!("\x7B}\u{8} {", 1); | = note: if you intended to print `{`, you can escape it using `{{` -error: invalid format string: expected `'}'` but string was terminated - --> $DIR/format-string-error-2.rs:78:27 - | -LL | println!("\x7B}\u8 {", 1); - | -^ expected `'}'` in format string - | | - | because of this opening brace - | - = note: if you intended to print `{`, you can escape it using `{{` - error: invalid format string: unmatched `}` found - --> $DIR/format-string-error-2.rs:83:21 + --> $DIR/format-string-error-2.rs:82:21 | LL | println!(r#"\x7B}\u{8} {"#, 1); | ^ unmatched `}` in format string @@ -176,12 +166,12 @@ LL | println!(r#"\x7B}\u{8} {"#, 1); = note: if you intended to print `}`, you can escape it using `}}` error: invalid format string: unmatched `}` found - --> $DIR/format-string-error-2.rs:86:21 + --> $DIR/format-string-error-2.rs:85:21 | LL | println!(r#"\x7B}\u8 {"#, 1); | ^ unmatched `}` in format string | = note: if you intended to print `}`, you can escape it using `}}` -error: aborting due to 19 previous errors +error: aborting due to 18 previous errors diff --git a/src/test/ui/str/str-as-char.fixed b/src/test/ui/str/str-as-char.fixed index accead5c850cc..42bbef8391785 100644 --- a/src/test/ui/str/str-as-char.fixed +++ b/src/test/ui/str/str-as-char.fixed @@ -1,6 +1,5 @@ // run-rustfix fn main() { - println!("{}", "●●"); //~ ERROR character literal may only contain one codepoint - //~^ ERROR format argument must be a string literal + println!("●●"); //~ ERROR character literal may only contain one codepoint } diff --git a/src/test/ui/str/str-as-char.rs b/src/test/ui/str/str-as-char.rs index fb179ec7245d2..09b9dfc590db3 100644 --- a/src/test/ui/str/str-as-char.rs +++ b/src/test/ui/str/str-as-char.rs @@ -2,5 +2,4 @@ fn main() { println!('●●'); //~ ERROR character literal may only contain one codepoint - //~^ ERROR format argument must be a string literal } diff --git 
a/src/test/ui/str/str-as-char.stderr b/src/test/ui/str/str-as-char.stderr index 162f0888a2947..540a1b55376ff 100644 --- a/src/test/ui/str/str-as-char.stderr +++ b/src/test/ui/str/str-as-char.stderr @@ -8,15 +8,5 @@ help: if you meant to write a `str` literal, use double quotes LL | println!("●●"); | ^^^^ -error: format argument must be a string literal - --> $DIR/str-as-char.rs:4:14 - | -LL | println!('●●'); - | ^^^^ -help: you might be missing a string literal to format with - | -LL | println!("{}", '●●'); - | ^^^^^ - -error: aborting due to 2 previous errors +error: aborting due to previous error