diff --git a/compiler/rustc_ast/src/token.rs b/compiler/rustc_ast/src/token.rs index 97dfb7837674f..fa6162c51847a 100644 --- a/compiler/rustc_ast/src/token.rs +++ b/compiler/rustc_ast/src/token.rs @@ -13,7 +13,7 @@ use rustc_span::symbol::{kw, sym}; use rustc_span::symbol::{Ident, Symbol}; use rustc_span::{self, edition::Edition, Span, DUMMY_SP}; use std::borrow::Cow; -use std::{fmt, mem}; +use std::fmt; #[derive(Clone, Copy, PartialEq, Encodable, Decodable, Debug, HashStable_Generic)] pub enum CommentKind { @@ -335,11 +335,6 @@ impl Token { Token::new(Ident(ident.name, ident.is_raw_guess()), ident.span) } - /// Return this token by value and leave a dummy token in its place. - pub fn take(&mut self) -> Self { - mem::replace(self, Token::dummy()) - } - /// For interpolated tokens, returns a span of the fragment to which the interpolated /// token refers. For all other tokens this is just a regular span. /// It is particularly important to use this for identifiers and lifetimes diff --git a/compiler/rustc_errors/src/lib.rs b/compiler/rustc_errors/src/lib.rs index b44cf352233e3..babab1fa112fc 100644 --- a/compiler/rustc_errors/src/lib.rs +++ b/compiler/rustc_errors/src/lib.rs @@ -63,7 +63,8 @@ pub mod translation; pub use diagnostic_builder::IntoDiagnostic; pub use snippet::Style; -pub type PResult<'a, T> = Result>; +pub type PErr<'a> = DiagnosticBuilder<'a, ErrorGuaranteed>; +pub type PResult<'a, T> = Result>; // `PResult` is used a lot. Make sure it doesn't unintentionally get bigger. // (See also the comment on `DiagnosticBuilder`'s `diagnostic` field.) diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index 21557a9c85401..eceef59802eb9 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -4,8 +4,8 @@ use std::str::Chars; /// /// Next characters can be peeked via `first` method, /// and position can be shifted forward via `bump` method. -pub(crate) struct Cursor<'a> { - initial_len: usize, +pub struct Cursor<'a> { + len_remaining: usize, /// Iterator over chars. Slightly faster than a &str. chars: Chars<'a>, #[cfg(debug_assertions)] @@ -15,9 +15,9 @@ pub(crate) struct Cursor<'a> { pub(crate) const EOF_CHAR: char = '\0'; impl<'a> Cursor<'a> { - pub(crate) fn new(input: &'a str) -> Cursor<'a> { + pub fn new(input: &'a str) -> Cursor<'a> { Cursor { - initial_len: input.len(), + len_remaining: input.len(), chars: input.chars(), #[cfg(debug_assertions)] prev: EOF_CHAR, @@ -61,13 +61,13 @@ impl<'a> Cursor<'a> { } /// Returns amount of already consumed symbols. - pub(crate) fn len_consumed(&self) -> u32 { - (self.initial_len - self.chars.as_str().len()) as u32 + pub(crate) fn pos_within_token(&self) -> u32 { + (self.len_remaining - self.chars.as_str().len()) as u32 } /// Resets the number of bytes consumed to 0. - pub(crate) fn reset_len_consumed(&mut self) { - self.initial_len = self.chars.as_str().len(); + pub(crate) fn reset_pos_within_token(&mut self) { + self.len_remaining = self.chars.as_str().len(); } /// Moves to the next character. diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index a79c982649afc..c71e6ffe34d9b 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -29,9 +29,11 @@ pub mod unescape; #[cfg(test)] mod tests; +pub use crate::cursor::Cursor; + use self::LiteralKind::*; use self::TokenKind::*; -use crate::cursor::{Cursor, EOF_CHAR}; +use crate::cursor::EOF_CHAR; use std::convert::TryFrom; /// Parsed token. @@ -139,6 +141,9 @@ pub enum TokenKind { /// Unknown token, not expected by the lexer, e.g. "№" Unknown, + + /// End of input. + Eof, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] @@ -219,13 +224,6 @@ pub fn strip_shebang(input: &str) -> Option { None } -/// Parses the first token from the provided input string. -#[inline] -pub fn first_token(input: &str) -> Token { - debug_assert!(!input.is_empty()); - Cursor::new(input).advance_token() -} - /// Validates a raw string literal. Used for getting more information about a /// problem with a `RawStr`/`RawByteStr` with a `None` field. #[inline] @@ -243,12 +241,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError> pub fn tokenize(input: &str) -> impl Iterator + '_ { let mut cursor = Cursor::new(input); std::iter::from_fn(move || { - if cursor.is_eof() { - None - } else { - cursor.reset_len_consumed(); - Some(cursor.advance_token()) - } + let token = cursor.advance_token(); + if token.kind != TokenKind::Eof { Some(token) } else { None } }) } @@ -311,8 +305,11 @@ pub fn is_ident(string: &str) -> bool { impl Cursor<'_> { /// Parses a token from the input string. - fn advance_token(&mut self) -> Token { - let first_char = self.bump().unwrap(); + pub fn advance_token(&mut self) -> Token { + let first_char = match self.bump() { + Some(c) => c, + None => return Token::new(TokenKind::Eof, 0), + }; let token_kind = match first_char { // Slash, comment or block comment. '/' => match self.first() { @@ -329,7 +326,7 @@ impl Cursor<'_> { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { let res = self.raw_double_quoted_string(1); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if res.is_ok() { self.eat_literal_suffix(); } @@ -344,7 +341,7 @@ impl Cursor<'_> { ('\'', _) => { self.bump(); let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -354,7 +351,7 @@ impl Cursor<'_> { ('"', _) => { self.bump(); let terminated = self.double_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -364,7 +361,7 @@ impl Cursor<'_> { ('r', '"') | ('r', '#') => { self.bump(); let res = self.raw_double_quoted_string(2); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if res.is_ok() { self.eat_literal_suffix(); } @@ -381,7 +378,7 @@ impl Cursor<'_> { // Numeric literal. c @ '0'..='9' => { let literal_kind = self.number(c); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); self.eat_literal_suffix(); TokenKind::Literal { kind: literal_kind, suffix_start } } @@ -420,7 +417,7 @@ impl Cursor<'_> { // String literal. '"' => { let terminated = self.double_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -433,7 +430,9 @@ impl Cursor<'_> { } _ => Unknown, }; - Token::new(token_kind, self.len_consumed()) + let res = Token::new(token_kind, self.pos_within_token()); + self.reset_pos_within_token(); + res } fn line_comment(&mut self) -> TokenKind { @@ -618,7 +617,7 @@ impl Cursor<'_> { if !can_be_a_lifetime { let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -643,7 +642,7 @@ impl Cursor<'_> { if self.first() == '\'' { self.bump(); let kind = Char { terminated: true }; - Literal { kind, suffix_start: self.len_consumed() } + Literal { kind, suffix_start: self.pos_within_token() } } else { Lifetime { starts_with_number } } @@ -724,7 +723,7 @@ impl Cursor<'_> { fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result { debug_assert!(self.prev() == 'r'); - let start_pos = self.len_consumed(); + let start_pos = self.pos_within_token(); let mut possible_terminator_offset = None; let mut max_hashes = 0; @@ -778,7 +777,7 @@ impl Cursor<'_> { // Keep track of possible terminators to give a hint about // where there might be a missing terminator possible_terminator_offset = - Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); + Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len); max_hashes = n_end_hashes; } } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 63819a2f98df5..bcd078a89677e 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,10 +1,11 @@ use crate::lexer::unicode_chars::UNICODE_ARRAY; use rustc_ast::ast::{self, AttrStyle}; use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; -use rustc_ast::tokenstream::{Spacing, TokenStream}; +use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult}; use rustc_lexer::unescape::{self, Mode}; +use rustc_lexer::Cursor; use rustc_lexer::{Base, DocStyle, RawStrError}; use rustc_session::lint::builtin::{ RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT, @@ -38,11 +39,20 @@ pub struct UnmatchedBrace { pub(crate) fn parse_token_trees<'a>( sess: &'a ParseSess, - src: &'a str, - start_pos: BytePos, + mut src: &'a str, + mut start_pos: BytePos, override_span: Option, ) -> (PResult<'a, TokenStream>, Vec) { - StringReader { sess, start_pos, pos: start_pos, src, override_span }.into_token_trees() + // Skip `#!`, if present. + if let Some(shebang_len) = rustc_lexer::strip_shebang(src) { + src = &src[shebang_len..]; + start_pos = start_pos + BytePos::from_usize(shebang_len); + } + + let cursor = Cursor::new(src); + let string_reader = + StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span }; + tokentrees::TokenTreesReader::parse_token_trees(string_reader) } struct StringReader<'a> { @@ -53,6 +63,8 @@ struct StringReader<'a> { pos: BytePos, /// Source text to tokenize. src: &'a str, + /// Cursor for getting lexer tokens. + cursor: Cursor<'a>, override_span: Option, } @@ -61,42 +73,195 @@ impl<'a> StringReader<'a> { self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi)) } - /// Returns the next token, and info about preceding whitespace, if any. - fn next_token(&mut self) -> (Spacing, Token) { - let mut spacing = Spacing::Joint; - - // Skip `#!` at the start of the file - if self.pos == self.start_pos - && let Some(shebang_len) = rustc_lexer::strip_shebang(self.src) - { - self.pos = self.pos + BytePos::from_usize(shebang_len); - spacing = Spacing::Alone; - } + /// Returns the next token, paired with a bool indicating if the token was + /// preceded by whitespace. + fn next_token(&mut self) -> (Token, bool) { + let mut preceded_by_whitespace = false; // Skip trivial (whitespace & comments) tokens loop { - let start_src_index = self.src_index(self.pos); - let text: &str = &self.src[start_src_index..]; - - if text.is_empty() { - let span = self.mk_sp(self.pos, self.pos); - return (spacing, Token::new(token::Eof, span)); - } - - let token = rustc_lexer::first_token(text); - + let token = self.cursor.advance_token(); let start = self.pos; self.pos = self.pos + BytePos(token.len); debug!("next_token: {:?}({:?})", token.kind, self.str_from(start)); - match self.cook_lexer_token(token.kind, start) { - Some(kind) => { + // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a + // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs + // additional validation. + let kind = match token.kind { + rustc_lexer::TokenKind::LineComment { doc_style } => { + // Skip non-doc comments + let Some(doc_style) = doc_style else { + self.lint_unicode_text_flow(start); + preceded_by_whitespace = true; + continue; + }; + + // Opening delimiter of the length 3 is not included into the symbol. + let content_start = start + BytePos(3); + let content = self.str_from(content_start); + self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style) + } + rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => { + if !terminated { + self.report_unterminated_block_comment(start, doc_style); + } + + // Skip non-doc comments + let Some(doc_style) = doc_style else { + self.lint_unicode_text_flow(start); + preceded_by_whitespace = true; + continue; + }; + + // Opening delimiter of the length 3 and closing delimiter of the length 2 + // are not included into the symbol. + let content_start = start + BytePos(3); + let content_end = self.pos - BytePos(if terminated { 2 } else { 0 }); + let content = self.str_from_to(content_start, content_end); + self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style) + } + rustc_lexer::TokenKind::Whitespace => { + preceded_by_whitespace = true; + continue; + } + rustc_lexer::TokenKind::Ident => { + let sym = nfc_normalize(self.str_from(start)); let span = self.mk_sp(start, self.pos); - return (spacing, Token::new(kind, span)); + self.sess.symbol_gallery.insert(sym, span); + token::Ident(sym, false) } - None => spacing = Spacing::Alone, - } + rustc_lexer::TokenKind::RawIdent => { + let sym = nfc_normalize(self.str_from(start + BytePos(2))); + let span = self.mk_sp(start, self.pos); + self.sess.symbol_gallery.insert(sym, span); + if !sym.can_be_raw() { + self.err_span(span, &format!("`{}` cannot be a raw identifier", sym)); + } + self.sess.raw_identifier_spans.borrow_mut().push(span); + token::Ident(sym, true) + } + rustc_lexer::TokenKind::UnknownPrefix => { + self.report_unknown_prefix(start); + let sym = nfc_normalize(self.str_from(start)); + let span = self.mk_sp(start, self.pos); + self.sess.symbol_gallery.insert(sym, span); + token::Ident(sym, false) + } + rustc_lexer::TokenKind::InvalidIdent + // Do not recover an identifier with emoji if the codepoint is a confusable + // with a recoverable substitution token, like `➖`. + if !UNICODE_ARRAY + .iter() + .any(|&(c, _, _)| { + let sym = self.str_from(start); + sym.chars().count() == 1 && c == sym.chars().next().unwrap() + }) => + { + let sym = nfc_normalize(self.str_from(start)); + let span = self.mk_sp(start, self.pos); + self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default() + .push(span); + token::Ident(sym, false) + } + rustc_lexer::TokenKind::Literal { kind, suffix_start } => { + let suffix_start = start + BytePos(suffix_start); + let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); + let suffix = if suffix_start < self.pos { + let string = self.str_from(suffix_start); + if string == "_" { + self.sess + .span_diagnostic + .struct_span_warn( + self.mk_sp(suffix_start, self.pos), + "underscore literal suffix is not allowed", + ) + .warn( + "this was previously accepted by the compiler but is \ + being phased out; it will become a hard error in \ + a future release!", + ) + .note( + "see issue #42326 \ + \ + for more information", + ) + .emit(); + None + } else { + Some(Symbol::intern(string)) + } + } else { + None + }; + token::Literal(token::Lit { kind, symbol, suffix }) + } + rustc_lexer::TokenKind::Lifetime { starts_with_number } => { + // Include the leading `'` in the real identifier, for macro + // expansion purposes. See #12512 for the gory details of why + // this is necessary. + let lifetime_name = self.str_from(start); + if starts_with_number { + self.err_span_(start, self.pos, "lifetimes cannot start with a number"); + } + let ident = Symbol::intern(lifetime_name); + token::Lifetime(ident) + } + rustc_lexer::TokenKind::Semi => token::Semi, + rustc_lexer::TokenKind::Comma => token::Comma, + rustc_lexer::TokenKind::Dot => token::Dot, + rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis), + rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis), + rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace), + rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace), + rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket), + rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket), + rustc_lexer::TokenKind::At => token::At, + rustc_lexer::TokenKind::Pound => token::Pound, + rustc_lexer::TokenKind::Tilde => token::Tilde, + rustc_lexer::TokenKind::Question => token::Question, + rustc_lexer::TokenKind::Colon => token::Colon, + rustc_lexer::TokenKind::Dollar => token::Dollar, + rustc_lexer::TokenKind::Eq => token::Eq, + rustc_lexer::TokenKind::Bang => token::Not, + rustc_lexer::TokenKind::Lt => token::Lt, + rustc_lexer::TokenKind::Gt => token::Gt, + rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus), + rustc_lexer::TokenKind::And => token::BinOp(token::And), + rustc_lexer::TokenKind::Or => token::BinOp(token::Or), + rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus), + rustc_lexer::TokenKind::Star => token::BinOp(token::Star), + rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash), + rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), + rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), + + rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { + let c = self.str_from(start).chars().next().unwrap(); + let mut err = + self.struct_err_span_char(start, self.pos, "unknown start of token", c); + // FIXME: the lexer could be used to turn the ASCII version of unicode + // homoglyphs, instead of keeping a table in `check_for_substitution`into the + // token. Ideally, this should be inside `rustc_lexer`. However, we should + // first remove compound tokens like `<<` from `rustc_lexer`, and then add + // fancier error recovery to it, as there will be less overall work to do this + // way. + let token = unicode_chars::check_for_substitution(self, start, c, &mut err); + if c == '\x00' { + err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used"); + } + err.emit(); + if let Some(token) = token { + token + } else { + preceded_by_whitespace = true; + continue; + } + } + rustc_lexer::TokenKind::Eof => token::Eof, + }; + let span = self.mk_sp(start, self.pos); + return (Token::new(kind, span), preceded_by_whitespace); } } @@ -162,171 +327,6 @@ impl<'a> StringReader<'a> { } } - /// Turns simple `rustc_lexer::TokenKind` enum into a rich - /// `rustc_ast::TokenKind`. This turns strings into interned - /// symbols and runs additional validation. - fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option { - Some(match token { - rustc_lexer::TokenKind::LineComment { doc_style } => { - // Skip non-doc comments - let Some(doc_style) = doc_style else { - self.lint_unicode_text_flow(start); - return None; - }; - - // Opening delimiter of the length 3 is not included into the symbol. - let content_start = start + BytePos(3); - let content = self.str_from(content_start); - self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style) - } - rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => { - if !terminated { - self.report_unterminated_block_comment(start, doc_style); - } - - // Skip non-doc comments - let Some(doc_style) = doc_style else { - self.lint_unicode_text_flow(start); - return None; - }; - - // Opening delimiter of the length 3 and closing delimiter of the length 2 - // are not included into the symbol. - let content_start = start + BytePos(3); - let content_end = self.pos - BytePos(if terminated { 2 } else { 0 }); - let content = self.str_from_to(content_start, content_end); - self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style) - } - rustc_lexer::TokenKind::Whitespace => return None, - rustc_lexer::TokenKind::Ident - | rustc_lexer::TokenKind::RawIdent - | rustc_lexer::TokenKind::UnknownPrefix => { - let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent; - let is_unknown_prefix = token == rustc_lexer::TokenKind::UnknownPrefix; - let mut ident_start = start; - if is_raw_ident { - ident_start = ident_start + BytePos(2); - } - if is_unknown_prefix { - self.report_unknown_prefix(start); - } - let sym = nfc_normalize(self.str_from(ident_start)); - let span = self.mk_sp(start, self.pos); - self.sess.symbol_gallery.insert(sym, span); - if is_raw_ident { - if !sym.can_be_raw() { - self.err_span(span, &format!("`{}` cannot be a raw identifier", sym)); - } - self.sess.raw_identifier_spans.borrow_mut().push(span); - } - token::Ident(sym, is_raw_ident) - } - rustc_lexer::TokenKind::InvalidIdent - // Do not recover an identifier with emoji if the codepoint is a confusable - // with a recoverable substitution token, like `➖`. - if !UNICODE_ARRAY - .iter() - .any(|&(c, _, _)| { - let sym = self.str_from(start); - sym.chars().count() == 1 && c == sym.chars().next().unwrap() - }) - => - { - let sym = nfc_normalize(self.str_from(start)); - let span = self.mk_sp(start, self.pos); - self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span); - token::Ident(sym, false) - } - rustc_lexer::TokenKind::Literal { kind, suffix_start } => { - let suffix_start = start + BytePos(suffix_start); - let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); - let suffix = if suffix_start < self.pos { - let string = self.str_from(suffix_start); - if string == "_" { - self.sess - .span_diagnostic - .struct_span_warn( - self.mk_sp(suffix_start, self.pos), - "underscore literal suffix is not allowed", - ) - .warn( - "this was previously accepted by the compiler but is \ - being phased out; it will become a hard error in \ - a future release!", - ) - .note( - "see issue #42326 \ - \ - for more information", - ) - .emit(); - None - } else { - Some(Symbol::intern(string)) - } - } else { - None - }; - token::Literal(token::Lit { kind, symbol, suffix }) - } - rustc_lexer::TokenKind::Lifetime { starts_with_number } => { - // Include the leading `'` in the real identifier, for macro - // expansion purposes. See #12512 for the gory details of why - // this is necessary. - let lifetime_name = self.str_from(start); - if starts_with_number { - self.err_span_(start, self.pos, "lifetimes cannot start with a number"); - } - let ident = Symbol::intern(lifetime_name); - token::Lifetime(ident) - } - rustc_lexer::TokenKind::Semi => token::Semi, - rustc_lexer::TokenKind::Comma => token::Comma, - rustc_lexer::TokenKind::Dot => token::Dot, - rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis), - rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis), - rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace), - rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace), - rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket), - rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket), - rustc_lexer::TokenKind::At => token::At, - rustc_lexer::TokenKind::Pound => token::Pound, - rustc_lexer::TokenKind::Tilde => token::Tilde, - rustc_lexer::TokenKind::Question => token::Question, - rustc_lexer::TokenKind::Colon => token::Colon, - rustc_lexer::TokenKind::Dollar => token::Dollar, - rustc_lexer::TokenKind::Eq => token::Eq, - rustc_lexer::TokenKind::Bang => token::Not, - rustc_lexer::TokenKind::Lt => token::Lt, - rustc_lexer::TokenKind::Gt => token::Gt, - rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus), - rustc_lexer::TokenKind::And => token::BinOp(token::And), - rustc_lexer::TokenKind::Or => token::BinOp(token::Or), - rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus), - rustc_lexer::TokenKind::Star => token::BinOp(token::Star), - rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash), - rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), - rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), - - rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { - let c = self.str_from(start).chars().next().unwrap(); - let mut err = - self.struct_err_span_char(start, self.pos, "unknown start of token", c); - // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs, - // instead of keeping a table in `check_for_substitution`into the token. Ideally, - // this should be inside `rustc_lexer`. However, we should first remove compound - // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it, - // as there will be less overall work to do this way. - let token = unicode_chars::check_for_substitution(self, start, c, &mut err); - if c == '\x00' { - err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used"); - } - err.emit(); - token? - } - }) - } - fn cook_doc_comment( &self, content_start: BytePos, diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index aa70912dcde4c..364753154db05 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -1,31 +1,15 @@ use super::{StringReader, UnmatchedBrace}; - use rustc_ast::token::{self, Delimiter, Token}; use rustc_ast::tokenstream::{DelimSpan, Spacing, TokenStream, TokenTree}; use rustc_ast_pretty::pprust::token_to_string; use rustc_data_structures::fx::FxHashMap; -use rustc_errors::PResult; +use rustc_errors::{PErr, PResult}; use rustc_span::Span; -impl<'a> StringReader<'a> { - pub(super) fn into_token_trees(self) -> (PResult<'a, TokenStream>, Vec) { - let mut tt_reader = TokenTreesReader { - string_reader: self, - token: Token::dummy(), - open_braces: Vec::new(), - unmatched_braces: Vec::new(), - matching_delim_spans: Vec::new(), - last_unclosed_found_span: None, - last_delim_empty_block_spans: FxHashMap::default(), - matching_block_spans: Vec::new(), - }; - let res = tt_reader.parse_all_token_trees(); - (res, tt_reader.unmatched_braces) - } -} - -struct TokenTreesReader<'a> { +pub(super) struct TokenTreesReader<'a> { string_reader: StringReader<'a>, + /// The "next" token, which has been obtained from the `StringReader` but + /// not yet handled by the `TokenTreesReader`. token: Token, /// Stack of open delimiters and their spans. Used for error message. open_braces: Vec<(Delimiter, Span)>, @@ -43,231 +27,235 @@ struct TokenTreesReader<'a> { } impl<'a> TokenTreesReader<'a> { + pub(super) fn parse_token_trees( + string_reader: StringReader<'a>, + ) -> (PResult<'a, TokenStream>, Vec) { + let mut tt_reader = TokenTreesReader { + string_reader, + token: Token::dummy(), + open_braces: Vec::new(), + unmatched_braces: Vec::new(), + matching_delim_spans: Vec::new(), + last_unclosed_found_span: None, + last_delim_empty_block_spans: FxHashMap::default(), + matching_block_spans: Vec::new(), + }; + let res = tt_reader.parse_all_token_trees(); + (res, tt_reader.unmatched_braces) + } + // Parse a stream of tokens into a list of `TokenTree`s, up to an `Eof`. fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> { + self.token = self.string_reader.next_token().0; let mut buf = TokenStreamBuilder::default(); - - self.bump(); - while self.token != token::Eof { - buf.push(self.parse_token_tree()?); + loop { + match self.token.kind { + token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), + token::CloseDelim(delim) => return Err(self.close_delim_err(delim)), + token::Eof => return Ok(buf.into_token_stream()), + _ => buf.push(self.parse_token_tree_non_delim_non_eof()), + } } - - Ok(buf.into_token_stream()) } // Parse a stream of tokens into a list of `TokenTree`s, up to a `CloseDelim`. fn parse_token_trees_until_close_delim(&mut self) -> TokenStream { let mut buf = TokenStreamBuilder::default(); loop { - if let token::CloseDelim(..) = self.token.kind { - return buf.into_token_stream(); - } - - match self.parse_token_tree() { - Ok(tree) => buf.push(tree), - Err(mut e) => { - e.emit(); + match self.token.kind { + token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), + token::CloseDelim(..) => return buf.into_token_stream(), + token::Eof => { + self.eof_err().emit(); return buf.into_token_stream(); } + _ => buf.push(self.parse_token_tree_non_delim_non_eof()), } } } - fn parse_token_tree(&mut self) -> PResult<'a, TokenTree> { - let sm = self.string_reader.sess.source_map(); - - match self.token.kind { - token::Eof => { - let msg = "this file contains an unclosed delimiter"; - let mut err = - self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, msg); - for &(_, sp) in &self.open_braces { - err.span_label(sp, "unclosed delimiter"); - self.unmatched_braces.push(UnmatchedBrace { - expected_delim: Delimiter::Brace, - found_delim: None, - found_span: self.token.span, - unclosed_span: Some(sp), - candidate_span: None, - }); - } + fn eof_err(&mut self) -> PErr<'a> { + let msg = "this file contains an unclosed delimiter"; + let mut err = self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, msg); + for &(_, sp) in &self.open_braces { + err.span_label(sp, "unclosed delimiter"); + self.unmatched_braces.push(UnmatchedBrace { + expected_delim: Delimiter::Brace, + found_delim: None, + found_span: self.token.span, + unclosed_span: Some(sp), + candidate_span: None, + }); + } - if let Some((delim, _)) = self.open_braces.last() { - if let Some((_, open_sp, close_sp)) = - self.matching_delim_spans.iter().find(|(d, open_sp, close_sp)| { - if let Some(close_padding) = sm.span_to_margin(*close_sp) { - if let Some(open_padding) = sm.span_to_margin(*open_sp) { - return delim == d && close_padding != open_padding; - } - } - false - }) - // these are in reverse order as they get inserted on close, but - { - // we want the last open/first close - err.span_label(*open_sp, "this delimiter might not be properly closed..."); - err.span_label( - *close_sp, - "...as it matches this but it has different indentation", - ); + if let Some((delim, _)) = self.open_braces.last() { + if let Some((_, open_sp, close_sp)) = + self.matching_delim_spans.iter().find(|(d, open_sp, close_sp)| { + let sm = self.string_reader.sess.source_map(); + if let Some(close_padding) = sm.span_to_margin(*close_sp) { + if let Some(open_padding) = sm.span_to_margin(*open_sp) { + return delim == d && close_padding != open_padding; + } } - } - Err(err) + false + }) + // these are in reverse order as they get inserted on close, but + { + // we want the last open/first close + err.span_label(*open_sp, "this delimiter might not be properly closed..."); + err.span_label(*close_sp, "...as it matches this but it has different indentation"); } - token::OpenDelim(delim) => { - // The span for beginning of the delimited section - let pre_span = self.token.span; - - // Parse the open delimiter. - self.open_braces.push((delim, self.token.span)); - self.bump(); + } + err + } - // Parse the token trees within the delimiters. - // We stop at any delimiter so we can try to recover if the user - // uses an incorrect delimiter. - let tts = self.parse_token_trees_until_close_delim(); + fn parse_token_tree_open_delim(&mut self, open_delim: Delimiter) -> TokenTree { + // The span for beginning of the delimited section + let pre_span = self.token.span; - // Expand to cover the entire delimited token tree - let delim_span = DelimSpan::from_pair(pre_span, self.token.span); + // Move past the open delimiter. + self.open_braces.push((open_delim, self.token.span)); + self.token = self.string_reader.next_token().0; - match self.token.kind { - // Correct delimiter. - token::CloseDelim(d) if d == delim => { - let (open_brace, open_brace_span) = self.open_braces.pop().unwrap(); - let close_brace_span = self.token.span; + // Parse the token trees within the delimiters. + // We stop at any delimiter so we can try to recover if the user + // uses an incorrect delimiter. + let tts = self.parse_token_trees_until_close_delim(); - if tts.is_empty() { - let empty_block_span = open_brace_span.to(close_brace_span); - if !sm.is_multiline(empty_block_span) { - // Only track if the block is in the form of `{}`, otherwise it is - // likely that it was written on purpose. - self.last_delim_empty_block_spans.insert(delim, empty_block_span); - } - } + // Expand to cover the entire delimited token tree + let delim_span = DelimSpan::from_pair(pre_span, self.token.span); - //only add braces - if let (Delimiter::Brace, Delimiter::Brace) = (open_brace, delim) { - self.matching_block_spans.push((open_brace_span, close_brace_span)); - } + match self.token.kind { + // Correct delimiter. + token::CloseDelim(close_delim) if close_delim == open_delim => { + let (open_brace, open_brace_span) = self.open_braces.pop().unwrap(); + let close_brace_span = self.token.span; - if self.open_braces.is_empty() { - // Clear up these spans to avoid suggesting them as we've found - // properly matched delimiters so far for an entire block. - self.matching_delim_spans.clear(); - } else { - self.matching_delim_spans.push(( - open_brace, - open_brace_span, - close_brace_span, - )); - } - // Parse the closing delimiter. - self.bump(); + if tts.is_empty() { + let empty_block_span = open_brace_span.to(close_brace_span); + let sm = self.string_reader.sess.source_map(); + if !sm.is_multiline(empty_block_span) { + // Only track if the block is in the form of `{}`, otherwise it is + // likely that it was written on purpose. + self.last_delim_empty_block_spans.insert(open_delim, empty_block_span); } - // Incorrect delimiter. - token::CloseDelim(other) => { - let mut unclosed_delimiter = None; - let mut candidate = None; + } + + //only add braces + if let (Delimiter::Brace, Delimiter::Brace) = (open_brace, open_delim) { + self.matching_block_spans.push((open_brace_span, close_brace_span)); + } + + if self.open_braces.is_empty() { + // Clear up these spans to avoid suggesting them as we've found + // properly matched delimiters so far for an entire block. + self.matching_delim_spans.clear(); + } else { + self.matching_delim_spans.push((open_brace, open_brace_span, close_brace_span)); + } + // Move past the closing delimiter. + self.token = self.string_reader.next_token().0; + } + // Incorrect delimiter. + token::CloseDelim(close_delim) => { + let mut unclosed_delimiter = None; + let mut candidate = None; - if self.last_unclosed_found_span != Some(self.token.span) { - // do not complain about the same unclosed delimiter multiple times - self.last_unclosed_found_span = Some(self.token.span); - // This is a conservative error: only report the last unclosed - // delimiter. The previous unclosed delimiters could actually be - // closed! The parser just hasn't gotten to them yet. - if let Some(&(_, sp)) = self.open_braces.last() { - unclosed_delimiter = Some(sp); - }; - if let Some(current_padding) = sm.span_to_margin(self.token.span) { - for (brace, brace_span) in &self.open_braces { - if let Some(padding) = sm.span_to_margin(*brace_span) { - // high likelihood of these two corresponding - if current_padding == padding && brace == &other { - candidate = Some(*brace_span); - } - } + if self.last_unclosed_found_span != Some(self.token.span) { + // do not complain about the same unclosed delimiter multiple times + self.last_unclosed_found_span = Some(self.token.span); + // This is a conservative error: only report the last unclosed + // delimiter. The previous unclosed delimiters could actually be + // closed! The parser just hasn't gotten to them yet. + if let Some(&(_, sp)) = self.open_braces.last() { + unclosed_delimiter = Some(sp); + }; + let sm = self.string_reader.sess.source_map(); + if let Some(current_padding) = sm.span_to_margin(self.token.span) { + for (brace, brace_span) in &self.open_braces { + if let Some(padding) = sm.span_to_margin(*brace_span) { + // high likelihood of these two corresponding + if current_padding == padding && brace == &close_delim { + candidate = Some(*brace_span); } } - let (tok, _) = self.open_braces.pop().unwrap(); - self.unmatched_braces.push(UnmatchedBrace { - expected_delim: tok, - found_delim: Some(other), - found_span: self.token.span, - unclosed_span: unclosed_delimiter, - candidate_span: candidate, - }); - } else { - self.open_braces.pop(); } - - // If the incorrect delimiter matches an earlier opening - // delimiter, then don't consume it (it can be used to - // close the earlier one). Otherwise, consume it. - // E.g., we try to recover from: - // fn foo() { - // bar(baz( - // } // Incorrect delimiter but matches the earlier `{` - if !self.open_braces.iter().any(|&(b, _)| b == other) { - self.bump(); - } - } - token::Eof => { - // Silently recover, the EOF token will be seen again - // and an error emitted then. Thus we don't pop from - // self.open_braces here. } - _ => {} + let (tok, _) = self.open_braces.pop().unwrap(); + self.unmatched_braces.push(UnmatchedBrace { + expected_delim: tok, + found_delim: Some(close_delim), + found_span: self.token.span, + unclosed_span: unclosed_delimiter, + candidate_span: candidate, + }); + } else { + self.open_braces.pop(); } - Ok(TokenTree::Delimited(delim_span, delim, tts)) + // If the incorrect delimiter matches an earlier opening + // delimiter, then don't consume it (it can be used to + // close the earlier one). Otherwise, consume it. + // E.g., we try to recover from: + // fn foo() { + // bar(baz( + // } // Incorrect delimiter but matches the earlier `{` + if !self.open_braces.iter().any(|&(b, _)| b == close_delim) { + self.token = self.string_reader.next_token().0; + } } - token::CloseDelim(delim) => { - // An unexpected closing delimiter (i.e., there is no - // matching opening delimiter). - let token_str = token_to_string(&self.token); - let msg = format!("unexpected closing delimiter: `{}`", token_str); - let mut err = - self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, &msg); - - // Braces are added at the end, so the last element is the biggest block - if let Some(parent) = self.matching_block_spans.last() { - if let Some(span) = self.last_delim_empty_block_spans.remove(&delim) { - // Check if the (empty block) is in the last properly closed block - if (parent.0.to(parent.1)).contains(span) { - err.span_label( - span, - "block is empty, you might have not meant to close it", - ); - } else { - err.span_label(parent.0, "this opening brace..."); + token::Eof => { + // Silently recover, the EOF token will be seen again + // and an error emitted then. Thus we don't pop from + // self.open_braces here. + } + _ => unreachable!(), + } - err.span_label(parent.1, "...matches this closing brace"); - } - } else { - err.span_label(parent.0, "this opening brace..."); + TokenTree::Delimited(delim_span, open_delim, tts) + } - err.span_label(parent.1, "...matches this closing brace"); - } - } + fn close_delim_err(&mut self, delim: Delimiter) -> PErr<'a> { + // An unexpected closing delimiter (i.e., there is no + // matching opening delimiter). + let token_str = token_to_string(&self.token); + let msg = format!("unexpected closing delimiter: `{}`", token_str); + let mut err = + self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, &msg); - err.span_label(self.token.span, "unexpected closing delimiter"); - Err(err) - } - _ => { - let tok = self.token.take(); - let mut spacing = self.bump(); - if !self.token.is_op() { - spacing = Spacing::Alone; + // Braces are added at the end, so the last element is the biggest block + if let Some(parent) = self.matching_block_spans.last() { + if let Some(span) = self.last_delim_empty_block_spans.remove(&delim) { + // Check if the (empty block) is in the last properly closed block + if (parent.0.to(parent.1)).contains(span) { + err.span_label(span, "block is empty, you might have not meant to close it"); + } else { + err.span_label(parent.0, "this opening brace..."); + err.span_label(parent.1, "...matches this closing brace"); } - Ok(TokenTree::Token(tok, spacing)) + } else { + err.span_label(parent.0, "this opening brace..."); + err.span_label(parent.1, "...matches this closing brace"); } } + + err.span_label(self.token.span, "unexpected closing delimiter"); + err } - fn bump(&mut self) -> Spacing { - let (spacing, token) = self.string_reader.next_token(); - self.token = token; - spacing + #[inline] + fn parse_token_tree_non_delim_non_eof(&mut self) -> TokenTree { + // `this_spacing` for the returned token refers to whether the token is + // immediately followed by another op token. It is determined by the + // next token: its kind and its `preceded_by_whitespace` status. + let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token(); + let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() { + Spacing::Alone + } else { + Spacing::Joint + }; + let this_tok = std::mem::replace(&mut self.token, next_tok); + TokenTree::Token(this_tok, this_spacing) } } diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 8922bf377858e..78b98431b190d 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -13,6 +13,7 @@ use std::collections::VecDeque; use std::fmt::{Display, Write}; use rustc_data_structures::fx::FxHashMap; +use rustc_lexer::Cursor; use rustc_lexer::{LiteralKind, TokenKind}; use rustc_span::edition::Edition; use rustc_span::symbol::Symbol; @@ -408,15 +409,16 @@ enum Highlight<'a> { struct TokenIter<'a> { src: &'a str, + cursor: Cursor<'a>, } impl<'a> Iterator for TokenIter<'a> { type Item = (TokenKind, &'a str); fn next(&mut self) -> Option<(TokenKind, &'a str)> { - if self.src.is_empty() { + let token = self.cursor.advance_token(); + if token.kind == TokenKind::Eof { return None; } - let token = rustc_lexer::first_token(self.src); let (text, rest) = self.src.split_at(token.len as usize); self.src = rest; Some((token.kind, text)) @@ -525,7 +527,7 @@ impl<'a> Classifier<'a> { /// Takes as argument the source code to HTML-ify, the rust edition to use and the source code /// file span which will be used later on by the `span_correspondance_map`. fn new(src: &str, file_span: Span, decoration_info: Option) -> Classifier<'_> { - let tokens = PeekIter::new(TokenIter { src }); + let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) }); let decorations = decoration_info.map(Decorations::new); Classifier { tokens, @@ -850,6 +852,7 @@ impl<'a> Classifier<'a> { Class::Ident(self.new_span(before, text)) } TokenKind::Lifetime { .. } => Class::Lifetime, + TokenKind::Eof => panic!("Eof in advance"), }; // Anything that didn't return above is the simple case where we the // class just spans a single token, so we can use the `string` method.