From 868fc878850b620f97ca45411d6e90323866e6c4 Mon Sep 17 00:00:00 2001 From: lucab <98086+lucab@users.noreply.github.com> Date: Sat, 27 Jul 2024 01:17:25 +0000 Subject: [PATCH] perf(parser): optimize conditional advance on ASCII values (#4298) Part of https://github.com/oxc-project/oxc/issues/3291. --- crates/oxc_parser/src/lexer/byte_handlers.rs | 44 ++++++++++---------- crates/oxc_parser/src/lexer/mod.rs | 18 ++++---- crates/oxc_parser/src/lexer/numeric.rs | 4 +- crates/oxc_parser/src/lexer/punctuation.rs | 25 ++++++----- crates/oxc_parser/src/lexer/source.rs | 18 ++++++++ crates/oxc_parser/src/lexer/unicode.rs | 6 +-- 6 files changed, 69 insertions(+), 46 deletions(-) diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index dad9f010d844c..4e2439131ed5a 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -209,8 +209,8 @@ ascii_byte_handler!(LIN(lexer) { // ! ascii_byte_handler!(EXL(lexer) { lexer.consume_char(); - if lexer.next_eq('=') { - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'=') { + if lexer.next_ascii_char_eq(b'=') { Kind::Neq2 } else { Kind::Neq @@ -237,7 +237,7 @@ ascii_byte_handler!(HAS(lexer) { lexer.consume_char(); // HashbangComment :: // `#!` SingleLineCommentChars? 
- if lexer.token.start == 0 && lexer.next_eq('!') { + if lexer.token.start == 0 && lexer.next_ascii_char_eq(b'!') { lexer.read_hashbang_comment() } else { lexer.private_identifier() @@ -252,7 +252,7 @@ ascii_identifier_handler!(IDT(_id_without_first_char) { // % ascii_byte_handler!(PRC(lexer) { lexer.consume_char(); - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'=') { Kind::PercentEq } else { Kind::Percent @@ -262,13 +262,13 @@ ascii_byte_handler!(PRC(lexer) { // & ascii_byte_handler!(AMP(lexer) { lexer.consume_char(); - if lexer.next_eq('&') { - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'&') { + if lexer.next_ascii_char_eq(b'=') { Kind::Amp2Eq } else { Kind::Amp2 } - } else if lexer.next_eq('=') { + } else if lexer.next_ascii_char_eq(b'=') { Kind::AmpEq } else { Kind::Amp @@ -290,13 +290,13 @@ ascii_byte_handler!(PNC(lexer) { // * ascii_byte_handler!(ATR(lexer) { lexer.consume_char(); - if lexer.next_eq('*') { - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'*') { + if lexer.next_ascii_char_eq(b'=') { Kind::Star2Eq } else { Kind::Star2 } - } else if lexer.next_eq('=') { + } else if lexer.next_ascii_char_eq(b'=') { Kind::StarEq } else { Kind::Star @@ -306,9 +306,9 @@ ascii_byte_handler!(ATR(lexer) { // + ascii_byte_handler!(PLS(lexer) { lexer.consume_char(); - if lexer.next_eq('+') { + if lexer.next_ascii_char_eq(b'+') { Kind::Plus2 - } else if lexer.next_eq('=') { + } else if lexer.next_ascii_char_eq(b'=') { Kind::PlusEq } else { Kind::Plus @@ -347,7 +347,7 @@ ascii_byte_handler!(SLH(lexer) { } _ => { // regex is handled separately, see `next_regex` - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'=') { Kind::SlashEq } else { Kind::Slash @@ -389,13 +389,13 @@ ascii_byte_handler!(LSS(lexer) { // = ascii_byte_handler!(EQL(lexer) { lexer.consume_char(); - if lexer.next_eq('=') { - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'=') { + if lexer.next_ascii_char_eq(b'=') { Kind::Eq3 } else { Kind::Eq2 } - } else if 
lexer.next_eq('>') { + } else if lexer.next_ascii_char_eq(b'>') { Kind::Arrow } else { Kind::Eq @@ -412,8 +412,8 @@ ascii_byte_handler!(GTR(lexer) { // ? ascii_byte_handler!(QST(lexer) { lexer.consume_char(); - if lexer.next_eq('?') { - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'?') { + if lexer.next_ascii_char_eq(b'=') { Kind::Question2Eq } else { Kind::Question2 @@ -457,7 +457,7 @@ ascii_byte_handler!(BTC(lexer) { // ^ ascii_byte_handler!(CRT(lexer) { lexer.consume_char(); - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'=') { Kind::CaretEq } else { Kind::Caret @@ -479,13 +479,13 @@ ascii_byte_handler!(BEO(lexer) { // | ascii_byte_handler!(PIP(lexer) { lexer.consume_char(); - if lexer.next_eq('|') { - if lexer.next_eq('=') { + if lexer.next_ascii_char_eq(b'|') { + if lexer.next_ascii_char_eq(b'=') { Kind::Pipe2Eq } else { Kind::Pipe2 } - } else if lexer.next_eq('=') { + } else if lexer.next_ascii_char_eq(b'=') { Kind::PipeEq } else { Kind::Pipe diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index c81fb03e269cf..ca4365b62bdf5 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -263,14 +263,16 @@ impl<'a> Lexer<'a> { self.source.peek_char2() } - /// Peek the next character, and advance the current position if it matches - #[inline] - fn next_eq(&mut self, c: char) -> bool { - let matched = self.peek() == Some(c); - if matched { - self.source.next_char().unwrap(); - } - matched + /// Peek the next byte, and advance the current position if it matches + /// the given ASCII char. + #[allow(clippy::inline_always)] + #[inline(always)] + fn next_ascii_char_eq(&mut self, b: u8) -> bool { + // TODO: can be replaced by `std::ascii::Char` once stabilized. + // https://github.com/rust-lang/rust/issues/110998 + assert!(b.is_ascii()); + // SAFETY: `b` is a valid ASCII char. 
+ unsafe { self.source.advance_if_ascii_eq(b) } } fn current_offset(&self) -> Span { diff --git a/crates/oxc_parser/src/lexer/numeric.rs b/crates/oxc_parser/src/lexer/numeric.rs index cbc359b0740b1..7e7ab75fe690d 100644 --- a/crates/oxc_parser/src/lexer/numeric.rs +++ b/crates/oxc_parser/src/lexer/numeric.rs @@ -29,9 +29,9 @@ impl<'a> Lexer<'a> { pub(super) fn decimal_literal_after_first_digit(&mut self) -> Kind { self.read_decimal_digits_after_first_digit(); - if self.next_eq('.') { + if self.next_ascii_char_eq(b'.') { return self.decimal_literal_after_decimal_point_after_digits(); - } else if self.next_eq('n') { + } else if self.next_ascii_char_eq(b'n') { return self.check_after_numeric_literal(Kind::Decimal); } diff --git a/crates/oxc_parser/src/lexer/punctuation.rs b/crates/oxc_parser/src/lexer/punctuation.rs index 9d6d71dbaccf4..57f96bf1af0e7 100644 --- a/crates/oxc_parser/src/lexer/punctuation.rs +++ b/crates/oxc_parser/src/lexer/punctuation.rs @@ -17,13 +17,13 @@ impl<'a> Lexer<'a> { /// returns None for `SingleLineHTMLCloseComment` `-->` in script mode pub(super) fn read_minus(&mut self) -> Option<Kind> { - if self.next_eq('-') { + if self.next_ascii_char_eq(b'-') { // SingleLineHTMLCloseComment `-->` in script mode - if self.token.is_on_new_line && self.source_type.is_script() && self.next_eq('>') { + if self.token.is_on_new_line + && self.source_type.is_script() + && self.next_ascii_char_eq(b'>') + { None } else { Some(Kind::Minus2) } - } else if self.next_eq('=') { + } else if self.next_ascii_char_eq(b'=') { Some(Kind::MinusEq) } else { Some(Kind::Minus) @@ -59,19 +62,19 @@ impl<'a> Lexer<'a> { } fn read_right_angle(&mut self) -> Kind { - if self.next_eq('>') { - if self.next_eq('>') { - if self.next_eq('=') { + if self.next_ascii_char_eq(b'>') { + if self.next_ascii_char_eq(b'>') { + if self.next_ascii_char_eq(b'=') { Kind::ShiftRight3Eq } else { Kind::ShiftRight3 } - } else if self.next_eq('=') { + } else if self.next_ascii_char_eq(b'=') { Kind::ShiftRightEq } 
else { Kind::ShiftRight } - } else if self.next_eq('=') { + } else if self.next_ascii_char_eq(b'=') { Kind::GtEq } else { Kind::RAngle diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index db38fb40285dc..bbeee075dfaa0 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -197,6 +197,24 @@ impl<'a> Source<'a> { self.ptr = self.end; } + /// Advance `Source`'s cursor by one byte if it is equal to the given ASCII value. + /// + /// # SAFETY + /// + /// Caller must ensure that `ascii_byte` is a valid ASCII character. + #[allow(clippy::inline_always)] + #[inline(always)] + pub(super) unsafe fn advance_if_ascii_eq(&mut self, ascii_byte: u8) -> bool { + debug_assert!(ascii_byte.is_ascii()); + let matched = self.peek_byte() == Some(ascii_byte); + if matched { + // SAFETY: next byte exists and is a valid ASCII char (and thus UTF-8 + // char boundary). + self.ptr = unsafe { self.ptr.add(1) }; + } + matched + } + /// Get string slice from a `SourcePosition` up to the current position of `Source`. pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition) -> &'a str { assert!(pos.ptr <= self.ptr); diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs index 555803144b120..ad9de193ac23e 100644 --- a/crates/oxc_parser/src/lexer/unicode.rs +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -141,11 +141,11 @@ impl<'a> Lexer<'a> { } fn unicode_code_point(&mut self) -> Option<SurrogatePair> { - if !self.next_eq('{') { + if !self.next_ascii_char_eq(b'{') { return None; } let value = self.code_point()?; - if !self.next_eq('}') { + if !self.next_ascii_char_eq(b'}') { return None; } Some(SurrogatePair::CodePoint(value)) @@ -232,7 +232,7 @@ impl<'a> Lexer<'a> { // LF | LS | PS => {} CR => { - self.next_eq(LF); + self.next_ascii_char_eq(b'\n'); } // SingleEscapeCharacter :: one of // ' " \ b f n r t v