From 868fc878850b620f97ca45411d6e90323866e6c4 Mon Sep 17 00:00:00 2001
From: lucab <98086+lucab@users.noreply.github.com>
Date: Sat, 27 Jul 2024 01:17:25 +0000
Subject: [PATCH] perf(parser): optimize conditional advance on ASCII values
 (#4298)

Part of https://github.com/oxc-project/oxc/issues/3291.
---
 crates/oxc_parser/src/lexer/byte_handlers.rs | 44 ++++++++++----------
 crates/oxc_parser/src/lexer/mod.rs           | 18 ++++----
 crates/oxc_parser/src/lexer/numeric.rs       |  4 +-
 crates/oxc_parser/src/lexer/punctuation.rs   | 25 ++++++-----
 crates/oxc_parser/src/lexer/source.rs        | 18 ++++++++
 crates/oxc_parser/src/lexer/unicode.rs       |  6 +--
 6 files changed, 69 insertions(+), 46 deletions(-)

diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs
index dad9f010d844c..4e2439131ed5a 100644
--- a/crates/oxc_parser/src/lexer/byte_handlers.rs
+++ b/crates/oxc_parser/src/lexer/byte_handlers.rs
@@ -209,8 +209,8 @@ ascii_byte_handler!(LIN(lexer) {
 // !
 ascii_byte_handler!(EXL(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('=') {
-        if lexer.next_eq('=') {
+    if lexer.next_ascii_char_eq(b'=') {
+        if lexer.next_ascii_char_eq(b'=') {
             Kind::Neq2
         } else {
             Kind::Neq
@@ -237,7 +237,7 @@ ascii_byte_handler!(HAS(lexer) {
     lexer.consume_char();
     // HashbangComment ::
     //     `#!` SingleLineCommentChars?
-    if lexer.token.start == 0 && lexer.next_eq('!') {
+    if lexer.token.start == 0 && lexer.next_ascii_char_eq(b'!') {
         lexer.read_hashbang_comment()
     } else {
         lexer.private_identifier()
@@ -252,7 +252,7 @@ ascii_identifier_handler!(IDT(_id_without_first_char) {
 // %
 ascii_byte_handler!(PRC(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('=') {
+    if lexer.next_ascii_char_eq(b'=') {
         Kind::PercentEq
     } else {
         Kind::Percent
@@ -262,13 +262,13 @@ ascii_byte_handler!(PRC(lexer) {
 // &
 ascii_byte_handler!(AMP(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('&') {
-        if lexer.next_eq('=') {
+    if lexer.next_ascii_char_eq(b'&') {
+        if lexer.next_ascii_char_eq(b'=') {
             Kind::Amp2Eq
         } else {
             Kind::Amp2
         }
-    } else if lexer.next_eq('=') {
+    } else if lexer.next_ascii_char_eq(b'=') {
         Kind::AmpEq
     } else {
         Kind::Amp
@@ -290,13 +290,13 @@ ascii_byte_handler!(PNC(lexer) {
 // *
 ascii_byte_handler!(ATR(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('*') {
-        if lexer.next_eq('=') {
+    if lexer.next_ascii_char_eq(b'*') {
+        if lexer.next_ascii_char_eq(b'=') {
             Kind::Star2Eq
         } else {
             Kind::Star2
         }
-    } else if lexer.next_eq('=') {
+    } else if lexer.next_ascii_char_eq(b'=') {
         Kind::StarEq
     } else {
         Kind::Star
@@ -306,9 +306,9 @@ ascii_byte_handler!(ATR(lexer) {
 // +
 ascii_byte_handler!(PLS(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('+') {
+    if lexer.next_ascii_char_eq(b'+') {
         Kind::Plus2
-    } else if lexer.next_eq('=') {
+    } else if lexer.next_ascii_char_eq(b'=') {
         Kind::PlusEq
     } else {
         Kind::Plus
@@ -347,7 +347,7 @@ ascii_byte_handler!(SLH(lexer) {
         }
         _ => {
             // regex is handled separately, see `next_regex`
-            if lexer.next_eq('=') {
+            if lexer.next_ascii_char_eq(b'=') {
                 Kind::SlashEq
             } else {
                 Kind::Slash
@@ -389,13 +389,13 @@ ascii_byte_handler!(LSS(lexer) {
 // =
 ascii_byte_handler!(EQL(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('=') {
-        if lexer.next_eq('=') {
+    if lexer.next_ascii_char_eq(b'=') {
+        if lexer.next_ascii_char_eq(b'=') {
             Kind::Eq3
         } else {
             Kind::Eq2
         }
-    } else if lexer.next_eq('>') {
+    } else if lexer.next_ascii_char_eq(b'>') {
         Kind::Arrow
     } else {
         Kind::Eq
@@ -412,8 +412,8 @@ ascii_byte_handler!(GTR(lexer) {
 // ?
 ascii_byte_handler!(QST(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('?') {
-        if lexer.next_eq('=') {
+    if lexer.next_ascii_char_eq(b'?') {
+        if lexer.next_ascii_char_eq(b'=') {
             Kind::Question2Eq
         } else {
             Kind::Question2
@@ -457,7 +457,7 @@ ascii_byte_handler!(BTC(lexer) {
 // ^
 ascii_byte_handler!(CRT(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('=') {
+    if lexer.next_ascii_char_eq(b'=') {
         Kind::CaretEq
     } else {
         Kind::Caret
@@ -479,13 +479,13 @@ ascii_byte_handler!(BEO(lexer) {
 // |
 ascii_byte_handler!(PIP(lexer) {
     lexer.consume_char();
-    if lexer.next_eq('|') {
-        if lexer.next_eq('=') {
+    if lexer.next_ascii_char_eq(b'|') {
+        if lexer.next_ascii_char_eq(b'=') {
             Kind::Pipe2Eq
         } else {
             Kind::Pipe2
         }
-    } else if lexer.next_eq('=') {
+    } else if lexer.next_ascii_char_eq(b'=') {
         Kind::PipeEq
     } else {
         Kind::Pipe
diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
index c81fb03e269cf..ca4365b62bdf5 100644
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -263,14 +263,16 @@ impl<'a> Lexer<'a> {
         self.source.peek_char2()
     }
 
-    /// Peek the next character, and advance the current position if it matches
-    #[inline]
-    fn next_eq(&mut self, c: char) -> bool {
-        let matched = self.peek() == Some(c);
-        if matched {
-            self.source.next_char().unwrap();
-        }
-        matched
+    /// Peek the next byte, and advance the current position if it matches
+    /// the given ASCII char.
+    #[allow(clippy::inline_always)]
+    #[inline(always)]
+    fn next_ascii_char_eq(&mut self, b: u8) -> bool {
+        // TODO: can be replaced by `std::ascii::Char` once stabilized.
+        // https://github.com/rust-lang/rust/issues/110998
+        assert!(b.is_ascii());
+        // SAFETY: `b` is a valid ASCII char.
+        unsafe { self.source.advance_if_ascii_eq(b) }
     }
 
     fn current_offset(&self) -> Span {
diff --git a/crates/oxc_parser/src/lexer/numeric.rs b/crates/oxc_parser/src/lexer/numeric.rs
index cbc359b0740b1..7e7ab75fe690d 100644
--- a/crates/oxc_parser/src/lexer/numeric.rs
+++ b/crates/oxc_parser/src/lexer/numeric.rs
@@ -29,9 +29,9 @@ impl<'a> Lexer<'a> {
 
     pub(super) fn decimal_literal_after_first_digit(&mut self) -> Kind {
         self.read_decimal_digits_after_first_digit();
-        if self.next_eq('.') {
+        if self.next_ascii_char_eq(b'.') {
             return self.decimal_literal_after_decimal_point_after_digits();
-        } else if self.next_eq('n') {
+        } else if self.next_ascii_char_eq(b'n') {
             return self.check_after_numeric_literal(Kind::Decimal);
         }
 
diff --git a/crates/oxc_parser/src/lexer/punctuation.rs b/crates/oxc_parser/src/lexer/punctuation.rs
index 9d6d71dbaccf4..57f96bf1af0e7 100644
--- a/crates/oxc_parser/src/lexer/punctuation.rs
+++ b/crates/oxc_parser/src/lexer/punctuation.rs
@@ -17,13 +17,13 @@ impl<'a> Lexer<'a> {
 
     /// returns None for `SingleLineHTMLOpenComment` `<!--` in script mode
     pub(super) fn read_left_angle(&mut self) -> Option<Kind> {
-        if self.next_eq('<') {
-            if self.next_eq('=') {
+        if self.next_ascii_char_eq(b'<') {
+            if self.next_ascii_char_eq(b'=') {
                 Some(Kind::ShiftLeftEq)
             } else {
                 Some(Kind::ShiftLeft)
             }
-        } else if self.next_eq('=') {
+        } else if self.next_ascii_char_eq(b'=') {
             Some(Kind::LtEq)
         } else if self.peek() == Some('!')
             // SingleLineHTMLOpenComment `<!--` in script mode
@@ -38,14 +38,17 @@ impl<'a> Lexer<'a> {
 
     /// returns None for `SingleLineHTMLCloseComment` `-->` in script mode
     pub(super) fn read_minus(&mut self) -> Option<Kind> {
-        if self.next_eq('-') {
+        if self.next_ascii_char_eq(b'-') {
             // SingleLineHTMLCloseComment `-->` in script mode
-            if self.token.is_on_new_line && self.source_type.is_script() && self.next_eq('>') {
+            if self.token.is_on_new_line
+                && self.source_type.is_script()
+                && self.next_ascii_char_eq(b'>')
+            {
                 None
             } else {
                 Some(Kind::Minus2)
             }
-        } else if self.next_eq('=') {
+        } else if self.next_ascii_char_eq(b'=') {
             Some(Kind::MinusEq)
         } else {
             Some(Kind::Minus)
@@ -59,19 +62,19 @@ impl<'a> Lexer<'a> {
     }
 
     fn read_right_angle(&mut self) -> Kind {
-        if self.next_eq('>') {
-            if self.next_eq('>') {
-                if self.next_eq('=') {
+        if self.next_ascii_char_eq(b'>') {
+            if self.next_ascii_char_eq(b'>') {
+                if self.next_ascii_char_eq(b'=') {
                     Kind::ShiftRight3Eq
                 } else {
                     Kind::ShiftRight3
                 }
-            } else if self.next_eq('=') {
+            } else if self.next_ascii_char_eq(b'=') {
                 Kind::ShiftRightEq
             } else {
                 Kind::ShiftRight
             }
-        } else if self.next_eq('=') {
+        } else if self.next_ascii_char_eq(b'=') {
             Kind::GtEq
         } else {
             Kind::RAngle
diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs
index db38fb40285dc..bbeee075dfaa0 100644
--- a/crates/oxc_parser/src/lexer/source.rs
+++ b/crates/oxc_parser/src/lexer/source.rs
@@ -197,6 +197,24 @@ impl<'a> Source<'a> {
         self.ptr = self.end;
     }
 
+    /// Advance `Source`'s cursor by one byte if it is equal to the given ASCII value.
+    ///
+    /// # SAFETY
+    ///
+    /// Caller must ensure that `ascii_byte` is a valid ASCII character.
+    #[allow(clippy::inline_always)]
+    #[inline(always)]
+    pub(super) unsafe fn advance_if_ascii_eq(&mut self, ascii_byte: u8) -> bool {
+        debug_assert!(ascii_byte.is_ascii());
+        let matched = self.peek_byte() == Some(ascii_byte);
+        if matched {
+            // SAFETY: next byte exists and is a valid ASCII char (and thus UTF-8
+            // char boundary).
+            self.ptr = unsafe { self.ptr.add(1) };
+        }
+        matched
+    }
+
     /// Get string slice from a `SourcePosition` up to the current position of `Source`.
     pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition) -> &'a str {
         assert!(pos.ptr <= self.ptr);
diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs
index 555803144b120..ad9de193ac23e 100644
--- a/crates/oxc_parser/src/lexer/unicode.rs
+++ b/crates/oxc_parser/src/lexer/unicode.rs
@@ -141,11 +141,11 @@ impl<'a> Lexer<'a> {
     }
 
     fn unicode_code_point(&mut self) -> Option<SurrogatePair> {
-        if !self.next_eq('{') {
+        if !self.next_ascii_char_eq(b'{') {
             return None;
         }
         let value = self.code_point()?;
-        if !self.next_eq('}') {
+        if !self.next_ascii_char_eq(b'}') {
             return None;
         }
         Some(SurrogatePair::CodePoint(value))
@@ -232,7 +232,7 @@ impl<'a> Lexer<'a> {
                 // <CR> <LF>
                 LF | LS | PS => {}
                 CR => {
-                    self.next_eq(LF);
+                    self.next_ascii_char_eq(b'\n');
                 }
                 // SingleEscapeCharacter :: one of
                 //   ' " \ b f n r t v