refactor(regular_expression): Misc refactoring for body_parser (#5062)

- Add examples to list all `RegExp`s in source code - Refactor `MayContainStrings` related part
oxc-project · Aug 22, 2024 · 96f5798 · 96f5798
1 parent 3b35332
commit 96f5798
Show file tree

Hide file tree

Showing 6 changed files with 107 additions and 25 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/oxc_regular_expression/Cargo.toml b/crates/oxc_regular_expression/Cargo.toml
@@ -27,3 +27,10 @@ oxc_span        = { workspace = true }
 phf              = { workspace = true, features = ["macros"] }
 rustc-hash       = { workspace = true }
 unicode-id-start = { workspace = true }
+
+[dev-dependencies]
+oxc_allocator = { workspace = true }
+oxc_ast       = { workspace = true }
+oxc_parser    = { workspace = true }
+oxc_semantic  = { workspace = true }
+oxc_span      = { workspace = true }
diff --git a/crates/oxc_regular_expression/examples/parse_file.rs b/crates/oxc_regular_expression/examples/parse_file.rs
@@ -0,0 +1,68 @@
+#![allow(clippy::print_stdout)]
+use std::{env, fs, path::Path, sync::Arc};
+
+use oxc_allocator::Allocator;
+use oxc_ast::AstKind;
+use oxc_parser::Parser;
+use oxc_semantic::SemanticBuilder;
+use oxc_span::SourceType;
+
+/// Example driver: parses a JavaScript/TypeScript file and prints every
+/// regular expression literal found in it, together with the result of
+/// running `oxc_regular_expression::Parser` on that literal.
+///
+/// The file path is taken from the first command-line argument and falls
+/// back to `"test.js"` when no argument is given.
+///
+/// Both `unwrap` calls below panic on failure (reading the file, and
+/// deriving the `SourceType` from the path).
+fn main() {
+    let name = env::args().nth(1).unwrap_or_else(|| "test.js".to_string());
+    let path = Path::new(&name);
+
+    // `Arc` so the same text can be attached to each diagnostic via `Arc::clone`.
+    let source_text = Arc::new(fs::read_to_string(path).unwrap());
+    let source_type = SourceType::from_path(path).unwrap();
+
+    let allocator = Allocator::default();
+
+    // Parse the whole file first; any parse error aborts the example.
+    let parser_ret = Parser::new(&allocator, &source_text, source_type).parse();
+    if !parser_ret.errors.is_empty() {
+        println!("Parsing failed:");
+        for error in parser_ret.errors {
+            let error = error.with_source_code(Arc::clone(&source_text));
+            println!("{error:?}");
+        }
+        return;
+    }
+
+    // Build semantic information so that all AST nodes can be iterated flatly.
+    let program = allocator.alloc(parser_ret.program);
+    let semantic_ret = SemanticBuilder::new(&source_text, source_type).build(program);
+    let semantic = semantic_ret.semantic;
+
+    for node in semantic.nodes().iter() {
+        match node.kind() {
+            // `/pattern/flags` literals: re-parse the literal's source slice.
+            AstKind::RegExpLiteral(re) => {
+                let literal = re.span.source_text(&source_text);
+                // NOTE(review): the span offset is presumably passed so that reported
+                // spans line up with the full `source_text` used for diagnostics — confirm.
+                let parsed = oxc_regular_expression::Parser::new(
+                    &allocator,
+                    literal,
+                    oxc_regular_expression::ParserOptions::default()
+                        .with_span_offset(re.span.start),
+                )
+                .parse();
+
+                println!("🍀 {literal}");
+                // NOTE: the first regex parse error ends the whole run (`return`),
+                // so later literals in the file are not reported.
+                if let Err(error) = parsed {
+                    let error = error.with_source_code(Arc::clone(&source_text));
+                    println!("{error:?}");
+                    return;
+                }
+                // `parsed` is still the `Result`, so the debug output is wrapped in `Ok(..)`.
+                println!("{parsed:#?}");
+                println!();
+            }
+            // `new RegExp(...)` calls are only detected (callee is an identifier
+            // named `RegExp`); their arguments are not parsed yet.
+            AstKind::NewExpression(new_expr) => {
+                if new_expr
+                    .callee
+                    .get_identifier_reference()
+                    .filter(|ident| ident.name == "RegExp")
+                    .is_some()
+                {
+                    println!("👻 TODO: new RegExp(...)");
+                    println!();
+                }
+            }
+            // Every other node kind is ignored.
+            _ => {}
+        }
+    }
+}
diff --git a/crates/oxc_regular_expression/examples/test.js b/crates/oxc_regular_expression/examples/test.js
@@ -0,0 +1,3 @@
+const re1 = /abc{1}/gsv;
+const re2 = new RegExp("ooo", "u");
+const re3 = /[\w--[v]]/gsv;
diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs
@@ -207,7 +207,7 @@ pub struct CharacterClassRange {
 #[derive(Debug)]
 pub struct ClassStringDisjunction<'a> {
     pub span: Span,
-    /// `true` if body is empty or contain 2 more characters.
+    /// `true` if body is empty or contains a [`ClassString`] whose `strings` is `true`.
     pub strings: bool,
     pub body: Vec<'a, ClassString<'a>>,
 }
@@ -216,6 +216,8 @@ pub struct ClassStringDisjunction<'a> {
 #[derive(Debug)]
 pub struct ClassString<'a> {
     pub span: Span,
+    /// `true` if body is empty or contains 2 or more characters.
+    pub strings: bool,
     pub body: Vec<'a, Character>,
 }
 

diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs
@@ -590,7 +590,7 @@ impl<'a> PatternParser<'a> {
         };
 
         if self.reader.eat('{') {
-            if let Some((name, value, is_strings_related)) =
+            if let Some((name, value, strings)) =
                 self.consume_unicode_property_value_expression()?
             {
                 if self.reader.eat('}') {
@@ -599,7 +599,7 @@ impl<'a> PatternParser<'a> {
                     // MayContainStrings is true
                     // - if the UnicodePropertyValueExpression is LoneUnicodePropertyNameOrValue
                     //   - and it is binary property of strings(can be true only with `UnicodeSetsMode`)
-                    if negative && is_strings_related {
+                    if negative && strings {
                         return Err(OxcDiagnostic::error(
                             "Invalid property name(negative + property of strings)",
                         )
@@ -609,7 +609,7 @@ impl<'a> PatternParser<'a> {
                     return Ok(Some(ast::UnicodePropertyEscape {
                         span: self.span_factory.create(span_start, self.reader.offset()),
                         negative,
-                        strings: is_strings_related,
+                        strings,
                         name,
                         value,
                     }));
@@ -1299,21 +1299,21 @@ impl<'a> PatternParser<'a> {
                         //   - && ClassUnion has ClassOperands
                         //     - && at least 1 ClassOperand has MayContainStrings: true
                         ast::CharacterClassContentsKind::Union => {
-                            body.iter().any(|item| may_contain_strings(item))
+                            body.iter().any(may_contain_strings)
                         }
                         // MayContainStrings is true
                         // - if ClassContents is ClassIntersection
                         //   - && ClassIntersection has ClassOperands
                         //     - && all ClassOperands have MayContainStrings: true
                         ast::CharacterClassContentsKind::Intersection => {
-                            body.iter().all(|item| may_contain_strings(item))
+                            body.iter().all(may_contain_strings)
                         }
                         // MayContainStrings is true
                         // - if ClassContents is ClassSubtraction
                         //   - && ClassSubtraction has ClassOperands
                         //     - && the first ClassOperand has MayContainStrings: true
                         ast::CharacterClassContentsKind::Subtraction => {
-                            body.iter().next().map_or(false, |item| may_contain_strings(item))
+                            body.iter().next().map_or(false, may_contain_strings)
                         }
                     } {
                         return Err(OxcDiagnostic::error(
@@ -1377,11 +1377,13 @@ impl<'a> PatternParser<'a> {
         let mut strings = false;
 
         loop {
-            let (class_string, contain_strings) = self.parse_class_string()?;
-            body.push(class_string);
-            if contain_strings {
+            let class_string = self.parse_class_string()?;
+
+            // Propagate strings flag
+            if class_string.strings {
                 strings = true;
             }
+            body.push(class_string);
 
             if !self.reader.eat('|') {
                 break;
@@ -1404,24 +1406,22 @@ impl<'a> PatternParser<'a> {
     //   ClassSetCharacter NonEmptyClassString[opt]
     // ```
     // Returns `ClassString`; its `strings` field is `true` if the body is empty or has 2 or more characters
-    fn parse_class_string(&mut self) -> Result<(ast::ClassString<'a>, bool)> {
+    fn parse_class_string(&mut self) -> Result<ast::ClassString<'a>> {
         let span_start = self.reader.offset();
 
         let mut body = Vec::new_in(self.allocator);
         while let Some(class_set_character) = self.parse_class_set_character()? {
             body.push(class_set_character);
         }
 
-        // True if empty or contains 2 or more characters
-        let contain_strings = body.len() != 1;
+        // `true` if empty or contains 2 or more characters
+        let strings = body.len() != 1;
 
-        Ok((
-            ast::ClassString {
-                span: self.span_factory.create(span_start, self.reader.offset()),
-                body,
-            },
-            contain_strings,
-        ))
+        Ok(ast::ClassString {
+            span: self.span_factory.create(span_start, self.reader.offset()),
+            strings,
+            body,
+        })
     }
 
     // ```
@@ -1864,11 +1864,9 @@ impl<'a> PatternParser<'a> {
     //   [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
     // ```
     fn consume_reg_exp_idenfigier_part(&mut self) -> Result<Option<u32>> {
-        if let Some(cp) = self.reader.peek() {
-            if unicode::is_identifier_part_char(cp) {
-                self.reader.advance();
-                return Ok(Some(cp));
-            }
+        if let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_identifier_part_char(cp)) {
+            self.reader.advance();
+            return Ok(Some(cp));
         }
 
         let span_start = self.reader.offset();
@@ -2004,6 +2002,7 @@ impl<'a> PatternParser<'a> {
                 )
                 .with_label(self.span_factory.create(span_start, self.reader.offset())));
             }
+
             self.reader.rewind(checkpoint);
         }