Skip to content

Commit

Permalink
Auto merge of #102302 - nnethercote:more-lexer-improvements, r=matklad
Browse files Browse the repository at this point in the history
More lexer improvements

A follow-up to #99884.

r? `@matklad`
  • Loading branch information
bors committed Sep 28, 2022
2 parents 837bf37 + d0a26ac commit 6201eab
Show file tree
Hide file tree
Showing 7 changed files with 429 additions and 443 deletions.
7 changes: 1 addition & 6 deletions compiler/rustc_ast/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use rustc_span::symbol::{kw, sym};
use rustc_span::symbol::{Ident, Symbol};
use rustc_span::{self, edition::Edition, Span, DUMMY_SP};
use std::borrow::Cow;
use std::{fmt, mem};
use std::fmt;

#[derive(Clone, Copy, PartialEq, Encodable, Decodable, Debug, HashStable_Generic)]
pub enum CommentKind {
Expand Down Expand Up @@ -335,11 +335,6 @@ impl Token {
Token::new(Ident(ident.name, ident.is_raw_guess()), ident.span)
}

/// Returns this token by value, leaving `Token::dummy()` in its place.
///
/// Implemented with `mem::replace`, so `self` holds the dummy token afterwards.
pub fn take(&mut self) -> Self {
mem::replace(self, Token::dummy())
}

/// For interpolated tokens, returns a span of the fragment to which the interpolated
/// token refers. For all other tokens this is just a regular span.
/// It is particularly important to use this for identifiers and lifetimes
Expand Down
3 changes: 2 additions & 1 deletion compiler/rustc_errors/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ pub mod translation;
pub use diagnostic_builder::IntoDiagnostic;
pub use snippet::Style;

pub type PResult<'a, T> = Result<T, DiagnosticBuilder<'a, ErrorGuaranteed>>;
pub type PErr<'a> = DiagnosticBuilder<'a, ErrorGuaranteed>;
pub type PResult<'a, T> = Result<T, PErr<'a>>;

// `PResult` is used a lot. Make sure it doesn't unintentionally get bigger.
// (See also the comment on `DiagnosticBuilder`'s `diagnostic` field.)
Expand Down
16 changes: 8 additions & 8 deletions compiler/rustc_lexer/src/cursor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use std::str::Chars;
///
/// Next characters can be peeked via `first` method,
/// and position can be shifted forward via `bump` method.
pub(crate) struct Cursor<'a> {
initial_len: usize,
pub struct Cursor<'a> {
len_remaining: usize,
/// Iterator over chars. Slightly faster than a &str.
chars: Chars<'a>,
#[cfg(debug_assertions)]
Expand All @@ -15,9 +15,9 @@ pub(crate) struct Cursor<'a> {
pub(crate) const EOF_CHAR: char = '\0';

impl<'a> Cursor<'a> {
pub(crate) fn new(input: &'a str) -> Cursor<'a> {
pub fn new(input: &'a str) -> Cursor<'a> {
Cursor {
initial_len: input.len(),
len_remaining: input.len(),
chars: input.chars(),
#[cfg(debug_assertions)]
prev: EOF_CHAR,
Expand Down Expand Up @@ -61,13 +61,13 @@ impl<'a> Cursor<'a> {
}

/// Returns the number of bytes consumed since the cursor was created or last reset.
pub(crate) fn len_consumed(&self) -> u32 {
(self.initial_len - self.chars.as_str().len()) as u32
pub(crate) fn pos_within_token(&self) -> u32 {
(self.len_remaining - self.chars.as_str().len()) as u32
}

/// Resets the number of bytes consumed to 0.
pub(crate) fn reset_len_consumed(&mut self) {
self.initial_len = self.chars.as_str().len();
pub(crate) fn reset_pos_within_token(&mut self) {
self.len_remaining = self.chars.as_str().len();
}

/// Moves to the next character.
Expand Down
53 changes: 26 additions & 27 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ pub mod unescape;
#[cfg(test)]
mod tests;

pub use crate::cursor::Cursor;

use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::{Cursor, EOF_CHAR};
use crate::cursor::EOF_CHAR;
use std::convert::TryFrom;

/// Parsed token.
Expand Down Expand Up @@ -139,6 +141,9 @@ pub enum TokenKind {

/// Unknown token, not expected by the lexer, e.g. "№"
Unknown,

/// End of input.
Eof,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
Expand Down Expand Up @@ -219,13 +224,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
None
}

/// Parses the first token from the provided input string.
///
/// `input` is expected to be non-empty; this is checked with `debug_assert!`,
/// so the check is only performed when debug assertions are enabled.
/// A fresh `Cursor` is created over `input` for each call.
#[inline]
pub fn first_token(input: &str) -> Token {
debug_assert!(!input.is_empty());
Cursor::new(input).advance_token()
}

/// Validates a raw string literal. Used for getting more information about a
/// problem with a `RawStr`/`RawByteStr` with a `None` field.
#[inline]
Expand All @@ -243,12 +241,8 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
let mut cursor = Cursor::new(input);
std::iter::from_fn(move || {
if cursor.is_eof() {
None
} else {
cursor.reset_len_consumed();
Some(cursor.advance_token())
}
let token = cursor.advance_token();
if token.kind != TokenKind::Eof { Some(token) } else { None }
})
}

Expand Down Expand Up @@ -311,8 +305,11 @@ pub fn is_ident(string: &str) -> bool {

impl Cursor<'_> {
/// Parses a token from the input string.
fn advance_token(&mut self) -> Token {
let first_char = self.bump().unwrap();
pub fn advance_token(&mut self) -> Token {
let first_char = match self.bump() {
Some(c) => c,
None => return Token::new(TokenKind::Eof, 0),
};
let token_kind = match first_char {
// Slash, comment or block comment.
'/' => match self.first() {
Expand All @@ -329,7 +326,7 @@ impl Cursor<'_> {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
let res = self.raw_double_quoted_string(1);
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if res.is_ok() {
self.eat_literal_suffix();
}
Expand All @@ -344,7 +341,7 @@ impl Cursor<'_> {
('\'', _) => {
self.bump();
let terminated = self.single_quoted_string();
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
Expand All @@ -354,7 +351,7 @@ impl Cursor<'_> {
('"', _) => {
self.bump();
let terminated = self.double_quoted_string();
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
Expand All @@ -364,7 +361,7 @@ impl Cursor<'_> {
('r', '"') | ('r', '#') => {
self.bump();
let res = self.raw_double_quoted_string(2);
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if res.is_ok() {
self.eat_literal_suffix();
}
Expand All @@ -381,7 +378,7 @@ impl Cursor<'_> {
// Numeric literal.
c @ '0'..='9' => {
let literal_kind = self.number(c);
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
self.eat_literal_suffix();
TokenKind::Literal { kind: literal_kind, suffix_start }
}
Expand Down Expand Up @@ -420,7 +417,7 @@ impl Cursor<'_> {
// String literal.
'"' => {
let terminated = self.double_quoted_string();
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
Expand All @@ -433,7 +430,9 @@ impl Cursor<'_> {
}
_ => Unknown,
};
Token::new(token_kind, self.len_consumed())
let res = Token::new(token_kind, self.pos_within_token());
self.reset_pos_within_token();
res
}

fn line_comment(&mut self) -> TokenKind {
Expand Down Expand Up @@ -618,7 +617,7 @@ impl Cursor<'_> {

if !can_be_a_lifetime {
let terminated = self.single_quoted_string();
let suffix_start = self.len_consumed();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
Expand All @@ -643,7 +642,7 @@ impl Cursor<'_> {
if self.first() == '\'' {
self.bump();
let kind = Char { terminated: true };
Literal { kind, suffix_start: self.len_consumed() }
Literal { kind, suffix_start: self.pos_within_token() }
} else {
Lifetime { starts_with_number }
}
Expand Down Expand Up @@ -724,7 +723,7 @@ impl Cursor<'_> {

fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
debug_assert!(self.prev() == 'r');
let start_pos = self.len_consumed();
let start_pos = self.pos_within_token();
let mut possible_terminator_offset = None;
let mut max_hashes = 0;

Expand Down Expand Up @@ -778,7 +777,7 @@ impl Cursor<'_> {
// Keep track of possible terminators to give a hint about
// where there might be a missing terminator
possible_terminator_offset =
Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
max_hashes = n_end_hashes;
}
}
Expand Down
Loading

0 comments on commit 6201eab

Please sign in to comment.