Skip to content

Commit

Permalink
refactor(regular_expression): Misc refactoring for body_parser (#5062)
Browse files Browse the repository at this point in the history
- Add examples to list all `RegExp`s in source code
- Refactor `MayContainStrings` related part
  • Loading branch information
leaysgur committed Aug 22, 2024
1 parent 3b35332 commit 96f5798
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 25 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions crates/oxc_regular_expression/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,10 @@ oxc_span = { workspace = true }
phf = { workspace = true, features = ["macros"] }
rustc-hash = { workspace = true }
unicode-id-start = { workspace = true }

[dev-dependencies]
oxc_allocator = { workspace = true }
oxc_ast = { workspace = true }
oxc_parser = { workspace = true }
oxc_semantic = { workspace = true }
oxc_span = { workspace = true }
68 changes: 68 additions & 0 deletions crates/oxc_regular_expression/examples/parse_file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#![allow(clippy::print_stdout)]
use std::{env, fs, path::Path, sync::Arc};

use oxc_allocator::Allocator;
use oxc_ast::AstKind;
use oxc_parser::Parser;
use oxc_semantic::SemanticBuilder;
use oxc_span::SourceType;

/// Prints every regular expression found in a source file.
///
/// The file path is read from the first command-line argument and falls back
/// to `test.js`. Each regular expression literal is re-parsed with
/// `oxc_regular_expression` and the result is dumped to stdout, while
/// `new RegExp(...)` expressions are only reported as a TODO.
///
/// # Panics
///
/// Panics if the file cannot be read, or if no source type can be derived
/// from its path.
fn main() {
    let file_name = env::args().nth(1).unwrap_or_else(|| "test.js".to_string());
    let path = Path::new(&file_name);

    // Kept behind an `Arc` so each diagnostic can be handed its own handle to the source.
    let source_text = Arc::new(fs::read_to_string(path).unwrap());
    let source_type = SourceType::from_path(path).unwrap();

    let allocator = Allocator::default();

    let parser_ret = Parser::new(&allocator, &source_text, source_type).parse();
    if !parser_ret.errors.is_empty() {
        println!("Parsing failed:");
        parser_ret
            .errors
            .into_iter()
            .map(|diagnostic| diagnostic.with_source_code(Arc::clone(&source_text)))
            .for_each(|report| println!("{report:?}"));
        return;
    }

    let program = allocator.alloc(parser_ret.program);
    let semantic = SemanticBuilder::new(&source_text, source_type).build(program).semantic;

    for node in semantic.nodes().iter() {
        match node.kind() {
            AstKind::RegExpLiteral(regex) => {
                let literal = regex.span.source_text(&source_text);
                // Offset the spans so they point into the whole file, not the literal.
                let options = oxc_regular_expression::ParserOptions::default()
                    .with_span_offset(regex.span.start);
                let outcome =
                    oxc_regular_expression::Parser::new(&allocator, literal, options).parse();

                println!("🍀 {literal}");
                // Stop at the first literal that fails to parse.
                if let Err(diagnostic) = outcome {
                    let report = diagnostic.with_source_code(Arc::clone(&source_text));
                    println!("{report:?}");
                    return;
                }
                println!("{outcome:#?}");
                println!();
            }
            AstKind::NewExpression(new_expr)
                if new_expr
                    .callee
                    .get_identifier_reference()
                    .is_some_and(|ident| ident.name == "RegExp") =>
            {
                println!("👻 TODO: new RegExp(...)");
                println!();
            }
            _ => {}
        }
    }
}
3 changes: 3 additions & 0 deletions crates/oxc_regular_expression/examples/test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
const re1 = /abc{1}/gsv;
const re2 = new RegExp("ooo", "u");
const re3 = /[\w--[v]]/gsv;
4 changes: 3 additions & 1 deletion crates/oxc_regular_expression/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ pub struct CharacterClassRange {
#[derive(Debug)]
pub struct ClassStringDisjunction<'a> {
pub span: Span,
/// `true` if body is empty or contain 2 more characters.
/// `true` if body is empty or contains a [`ClassString`] whose `strings` is `true`.
pub strings: bool,
pub body: Vec<'a, ClassString<'a>>,
}
Expand All @@ -216,6 +216,8 @@ pub struct ClassStringDisjunction<'a> {
#[derive(Debug)]
pub struct ClassString<'a> {
pub span: Span,
/// `true` if body is empty or contains 2 or more characters.
pub strings: bool,
pub body: Vec<'a, Character>,
}

Expand Down
47 changes: 23 additions & 24 deletions crates/oxc_regular_expression/src/body_parser/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ impl<'a> PatternParser<'a> {
};

if self.reader.eat('{') {
if let Some((name, value, is_strings_related)) =
if let Some((name, value, strings)) =
self.consume_unicode_property_value_expression()?
{
if self.reader.eat('}') {
Expand All @@ -599,7 +599,7 @@ impl<'a> PatternParser<'a> {
// MayContainStrings is true
// - if the UnicodePropertyValueExpression is LoneUnicodePropertyNameOrValue
// - and it is binary property of strings(can be true only with `UnicodeSetsMode`)
if negative && is_strings_related {
if negative && strings {
return Err(OxcDiagnostic::error(
"Invalid property name(negative + property of strings)",
)
Expand All @@ -609,7 +609,7 @@ impl<'a> PatternParser<'a> {
return Ok(Some(ast::UnicodePropertyEscape {
span: self.span_factory.create(span_start, self.reader.offset()),
negative,
strings: is_strings_related,
strings,
name,
value,
}));
Expand Down Expand Up @@ -1299,21 +1299,21 @@ impl<'a> PatternParser<'a> {
// - && ClassUnion has ClassOperands
// - && at least 1 ClassOperand has MayContainStrings: true
ast::CharacterClassContentsKind::Union => {
body.iter().any(|item| may_contain_strings(item))
body.iter().any(may_contain_strings)
}
// MayContainStrings is true
// - if ClassContents is ClassIntersection
// - && ClassIntersection has ClassOperands
// - && all ClassOperands have MayContainStrings: true
ast::CharacterClassContentsKind::Intersection => {
body.iter().all(|item| may_contain_strings(item))
body.iter().all(may_contain_strings)
}
// MayContainStrings is true
// - if ClassContents is ClassSubtraction
// - && ClassSubtraction has ClassOperands
// - && the first ClassOperand has MayContainStrings: true
ast::CharacterClassContentsKind::Subtraction => {
body.iter().next().map_or(false, |item| may_contain_strings(item))
body.iter().next().map_or(false, may_contain_strings)
}
} {
return Err(OxcDiagnostic::error(
Expand Down Expand Up @@ -1377,11 +1377,13 @@ impl<'a> PatternParser<'a> {
let mut strings = false;

loop {
let (class_string, contain_strings) = self.parse_class_string()?;
body.push(class_string);
if contain_strings {
let class_string = self.parse_class_string()?;

// Propagate strings flag
if class_string.strings {
strings = true;
}
body.push(class_string);

if !self.reader.eat('|') {
break;
Expand All @@ -1404,24 +1406,22 @@ impl<'a> PatternParser<'a> {
// ClassSetCharacter NonEmptyClassString[opt]
// ```
// Returns a `ClassString` whose `strings` flag is `true` if it is empty or contains 2 or more characters
fn parse_class_string(&mut self) -> Result<(ast::ClassString<'a>, bool)> {
fn parse_class_string(&mut self) -> Result<ast::ClassString<'a>> {
let span_start = self.reader.offset();

let mut body = Vec::new_in(self.allocator);
while let Some(class_set_character) = self.parse_class_set_character()? {
body.push(class_set_character);
}

// True if empty or contains 2 or more characters
let contain_strings = body.len() != 1;
// `true` if empty or contains 2 or more characters
let strings = body.len() != 1;

Ok((
ast::ClassString {
span: self.span_factory.create(span_start, self.reader.offset()),
body,
},
contain_strings,
))
Ok(ast::ClassString {
span: self.span_factory.create(span_start, self.reader.offset()),
strings,
body,
})
}

// ```
Expand Down Expand Up @@ -1864,11 +1864,9 @@ impl<'a> PatternParser<'a> {
// [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate
// ```
fn consume_reg_exp_idenfigier_part(&mut self) -> Result<Option<u32>> {
if let Some(cp) = self.reader.peek() {
if unicode::is_identifier_part_char(cp) {
self.reader.advance();
return Ok(Some(cp));
}
if let Some(cp) = self.reader.peek().filter(|&cp| unicode::is_identifier_part_char(cp)) {
self.reader.advance();
return Ok(Some(cp));
}

let span_start = self.reader.offset();
Expand Down Expand Up @@ -2004,6 +2002,7 @@ impl<'a> PatternParser<'a> {
)
.with_label(self.span_factory.create(span_start, self.reader.offset())));
}

self.reader.rewind(checkpoint);
}

Expand Down

0 comments on commit 96f5798

Please sign in to comment.