From 77647931e4244aa87d70dcf072c1073cef2e5e78 Mon Sep 17 00:00:00 2001
From: camchenry <1514176+camchenry@users.noreply.github.com>
Date: Thu, 26 Sep 2024 05:04:46 +0000
Subject: [PATCH] feat(regular_expression): implement visitor pattern trait for regex AST (#6055)

- resolves https://github.com/oxc-project/oxc/issues/5977
- supersedes https://github.com/oxc-project/oxc/pull/5951

To facilitate easier traversal of the Regex AST, this PR defines a `Visit` trait with default implementations that will walk the entirety of the Regex AST. Methods in the `Visit` trait can be overridden with custom implementations to do things like analyzing only certain nodes in a regular expression, which will be useful for regex-related `oxc_linter` rules.

In the future, we should consider automatically generating this code as it is very repetitive, but for now a handwritten visitor is sufficient.
---
 .../examples/visitor.rs                    |  28 ++
 crates/oxc_regular_expression/src/lib.rs   |   1 +
 crates/oxc_regular_expression/src/visit.rs | 400 ++++++++++++++++++
 3 files changed, 429 insertions(+)
 create mode 100644 crates/oxc_regular_expression/examples/visitor.rs
 create mode 100644 crates/oxc_regular_expression/src/visit.rs

diff --git a/crates/oxc_regular_expression/examples/visitor.rs b/crates/oxc_regular_expression/examples/visitor.rs
new file mode 100644
index 0000000000000..4c087ef1f9dc7
--- /dev/null
+++ b/crates/oxc_regular_expression/examples/visitor.rs
@@ -0,0 +1,28 @@
+#![allow(clippy::print_stdout)]
+
+use oxc_allocator::Allocator;
+use oxc_regular_expression::{
+    visit::{RegExpAstKind, Visit},
+    Parser, ParserOptions,
+};
+
+struct TestVisitor;
+
+impl Visit<'_> for TestVisitor {
+    fn enter_node(&mut self, kind: RegExpAstKind) {
+        println!("enter_node: {kind:?}");
+    }
+
+    fn leave_node(&mut self, kind: RegExpAstKind) {
+        println!("leave_node: {kind:?}");
+    }
+}
+
+fn main() {
+    let source_text = r"/(https?:\/\/github\.com\/(([^\s]+)\/([^\s]+))\/([^\s]+\/)?(issues|pull)\/([0-9]+))|(([^\s]+)\/([^\s]+))?#([1-9][0-9]*)($|[\s\:\;\-\(\=])/";
+    let allocator = Allocator::default();
+    let parser = Parser::new(&allocator, source_text, ParserOptions::default());
+    let pattern = parser.parse().unwrap().pattern;
+    let mut visitor = TestVisitor;
+    visitor.visit_pattern(&pattern);
+}
diff --git a/crates/oxc_regular_expression/src/lib.rs b/crates/oxc_regular_expression/src/lib.rs
index d1b19e075df96..ee697a9f02b76 100644
--- a/crates/oxc_regular_expression/src/lib.rs
+++ b/crates/oxc_regular_expression/src/lib.rs
@@ -9,6 +9,7 @@ mod literal_parser;
 mod options;
 mod span;
 mod surrogate_pair;
+pub mod visit;
 
 mod generated {
     mod derive_clone_in;
diff --git a/crates/oxc_regular_expression/src/visit.rs b/crates/oxc_regular_expression/src/visit.rs
new file mode 100644
index 0000000000000..2845f69c3fbcc
--- /dev/null
+++ b/crates/oxc_regular_expression/src/visit.rs
@@ -0,0 +1,400 @@
+#![allow(unused_variables, clippy::wildcard_imports)]
+use walk::walk_pattern;
+
+use crate::ast::{
+    Alternative, BoundaryAssertion, CapturingGroup, Character, CharacterClass,
+    CharacterClassContents, CharacterClassEscape, CharacterClassRange, ClassString,
+    ClassStringDisjunction, Disjunction, Dot, IgnoreGroup, IndexedReference, LookAroundAssertion,
+    NamedReference, Pattern, Quantifier, Term, UnicodePropertyEscape,
+};
+use walk::*;
+
+/// A borrowed reference to one node of the regular expression AST, tagged with its node type.
+///
+/// Values of this type are what [`Visit::enter_node`] and [`Visit::leave_node`] receive.
+#[derive(Copy, Clone, Debug)]
+pub enum RegExpAstKind<'a> {
+    Pattern(&'a Pattern<'a>),
+    Disjunction(&'a Disjunction<'a>),
+    Alternative(&'a Alternative<'a>),
+    Term(&'a Term<'a>),
+    LookAroundAssertion(&'a LookAroundAssertion<'a>),
+    Quantifier(&'a Quantifier<'a>),
+    CapturingGroup(&'a CapturingGroup<'a>),
+    IgnoreGroup(&'a IgnoreGroup<'a>),
+    BoundaryAssertion(&'a BoundaryAssertion),
+    Character(&'a Character),
+    Dot(&'a Dot),
+    CharacterClassEscape(&'a CharacterClassEscape),
+    UnicodePropertyEscape(&'a UnicodePropertyEscape<'a>),
+    CharacterClass(&'a CharacterClass<'a>),
+    CharacterClassContents(&'a CharacterClassContents<'a>),
+    CharacterClassRange(&'a CharacterClassRange),
+    CharacterClassStringDisjunction(&'a ClassStringDisjunction<'a>),
+    CharacterClassString(&'a ClassString<'a>),
+    IndexedReference(&'a IndexedReference),
+    NamedReference(&'a NamedReference<'a>),
+}
+
+/// RegEx syntax tree traversal
+///
+/// Each `visit_*` method defaults to the matching `walk_*` function in [`walk`], so an
+/// implementor only needs to override the hooks it cares about.
+pub trait Visit<'a>: Sized {
+    /// Called by every `walk_*` function before it visits the node's children.
+    #[inline]
+    fn enter_node(&mut self, kind: RegExpAstKind<'a>) {}
+    /// Called by every `walk_*` function after it has visited the node's children.
+    #[inline]
+    fn leave_node(&mut self, kind: RegExpAstKind<'a>) {}
+
+    /// Extends `t` to lifetime `'a` so that it can be stored in a [`RegExpAstKind`].
+    #[inline]
+    fn alloc<T>(&self, t: &T) -> &'a T {
+        // SAFETY:
+        // Sound only if `t` points into the allocator and stays valid for `'a`.
+        // NOTE(review): nothing here enforces that, so every caller has to guarantee it.
+        unsafe { std::mem::transmute(t) }
+    }
+
+    #[inline]
+    fn visit_pattern(&mut self, it: &Pattern<'a>) {
+        walk_pattern(self, it);
+    }
+
+    #[inline]
+    fn visit_disjunction(&mut self, it: &Disjunction<'a>) {
+        walk_disjunction(self, it);
+    }
+
+    #[inline]
+    fn visit_alternative(&mut self, it: &Alternative<'a>) {
+        walk_alternative(self, it);
+    }
+
+    #[inline]
+    fn visit_term(&mut self, it: &Term<'a>) {
+        walk_term(self, it);
+    }
+
+    #[inline]
+    fn visit_lookaround_assertion(&mut self, it: &LookAroundAssertion<'a>) {
+        walk_lookaround_assertion(self, it);
+    }
+
+    #[inline]
+    fn visit_quantifier(&mut self, it: &Quantifier<'a>) {
+        walk_quantifier(self, it);
+    }
+
+    #[inline]
+    fn visit_capturing_group(&mut self, it: &CapturingGroup<'a>) {
+        walk_capturing_group(self, it);
+    }
+
+    #[inline]
+    fn visit_ignore_group(&mut self, it: &IgnoreGroup<'a>) {
+        walk_ignore_group(self, it);
+    }
+
+    #[inline]
+    fn visit_boundary_assertion(&mut self, it: &BoundaryAssertion) {
+        walk_boundary_assertion(self, it);
+    }
+
+    #[inline]
+    fn visit_character(&mut self, it: &Character) {
+        walk_character(self, it);
+    }
+
+    #[inline]
+    fn visit_dot(&mut self, it: &Dot) {
+        walk_dot(self, it);
+    }
+
+    #[inline]
+    fn visit_character_class_escape(&mut self, it: &CharacterClassEscape) {
+        walk_character_class_escape(self, it);
+    }
+
+    #[inline]
+    fn visit_unicode_property_escape(&mut self, it: &UnicodePropertyEscape<'a>) {
+        walk_unicode_property_escape(self, it);
+    }
+
+    #[inline]
+    fn visit_character_class(&mut self, it: &CharacterClass<'a>) {
+        walk_character_class(self, it);
+    }
+
+    #[inline]
+    fn visit_character_class_contents(&mut self, it: &CharacterClassContents<'a>) {
+        walk_character_class_contents(self, it);
+    }
+
+    #[inline]
+    fn visit_character_class_range(&mut self, it: &CharacterClassRange) {
+        walk_character_class_range(self, it);
+    }
+
+    #[inline]
+    fn visit_character_class_string_disjunction(&mut self, it: &ClassStringDisjunction<'a>) {
+        walk_character_class_string_disjunction(self, it);
+    }
+
+    #[inline]
+    fn visit_character_class_string(&mut self, it: &ClassString<'a>) {
+        walk_character_class_string(self, it);
+    }
+
+    #[inline]
+    fn visit_indexed_reference(&mut self, it: &IndexedReference) {
+        walk_indexed_reference(self, it);
+    }
+
+    #[inline]
+    fn visit_named_reference(&mut self, it: &NamedReference<'a>) {
+        walk_named_reference(self, it);
+    }
+}
+
+/// Default traversal functions: each one reports the node through `enter_node`, visits the
+/// node's children in order, and finally reports the node again through `leave_node`.
+pub mod walk {
+    use super::*;
+
+    #[inline]
+    pub fn walk_pattern<'a, V: Visit<'a>>(visitor: &mut V, it: &Pattern<'a>) {
+        let kind = RegExpAstKind::Pattern(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.visit_disjunction(&it.body);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_disjunction<'a, V: Visit<'a>>(visitor: &mut V, it: &Disjunction<'a>) {
+        let kind = RegExpAstKind::Disjunction(visitor.alloc(it));
+        visitor.enter_node(kind);
+        for alt in &it.body {
+            visitor.visit_alternative(alt);
+        }
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_alternative<'a, V: Visit<'a>>(visitor: &mut V, it: &Alternative<'a>) {
+        let kind = RegExpAstKind::Alternative(visitor.alloc(it));
+        visitor.enter_node(kind);
+        for term in &it.body {
+            visitor.visit_term(term);
+        }
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_term<'a, V: Visit<'a>>(visitor: &mut V, it: &Term<'a>) {
+        let kind = RegExpAstKind::Term(visitor.alloc(it));
+        visitor.enter_node(kind);
+        match it {
+            Term::LookAroundAssertion(lookaround) => {
+                visitor.visit_lookaround_assertion(lookaround);
+            }
+            Term::Quantifier(quant) => {
+                visitor.visit_quantifier(quant);
+            }
+            Term::CapturingGroup(group) => {
+                visitor.visit_capturing_group(group);
+            }
+            Term::IgnoreGroup(group) => {
+                visitor.visit_ignore_group(group);
+            }
+            Term::BoundaryAssertion(boundary_assertion) => {
+                visitor.visit_boundary_assertion(boundary_assertion);
+            }
+            Term::Character(character) => {
+                visitor.visit_character(character);
+            }
+            Term::Dot(dot) => {
+                visitor.visit_dot(dot);
+            }
+            Term::CharacterClassEscape(character_class_escape) => {
+                visitor.visit_character_class_escape(character_class_escape);
+            }
+            Term::UnicodePropertyEscape(unicode_property_escape) => {
+                visitor.visit_unicode_property_escape(unicode_property_escape);
+            }
+            Term::CharacterClass(character_class) => {
+                visitor.visit_character_class(character_class);
+            }
+            Term::IndexedReference(indexed_reference) => {
+                visitor.visit_indexed_reference(indexed_reference);
+            }
+            Term::NamedReference(named_reference) => {
+                visitor.visit_named_reference(named_reference);
+            }
+        }
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_lookaround_assertion<'a, V: Visit<'a>>(
+        visitor: &mut V,
+        it: &LookAroundAssertion<'a>,
+    ) {
+        let kind = RegExpAstKind::LookAroundAssertion(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.visit_disjunction(&it.body);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_quantifier<'a, V: Visit<'a>>(visitor: &mut V, it: &Quantifier<'a>) {
+        let kind = RegExpAstKind::Quantifier(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.visit_term(&it.body);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_capturing_group<'a, V: Visit<'a>>(visitor: &mut V, it: &CapturingGroup<'a>) {
+        let kind = RegExpAstKind::CapturingGroup(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.visit_disjunction(&it.body);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_ignore_group<'a, V: Visit<'a>>(visitor: &mut V, it: &IgnoreGroup<'a>) {
+        let kind = RegExpAstKind::IgnoreGroup(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.visit_disjunction(&it.body);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_boundary_assertion<'a, V: Visit<'a>>(visitor: &mut V, it: &BoundaryAssertion) {
+        let kind = RegExpAstKind::BoundaryAssertion(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_character<'a, V: Visit<'a>>(visitor: &mut V, it: &Character) {
+        let kind = RegExpAstKind::Character(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_dot<'a, V: Visit<'a>>(visitor: &mut V, it: &Dot) {
+        let kind = RegExpAstKind::Dot(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_character_class_escape<'a, V: Visit<'a>>(
+        visitor: &mut V,
+        it: &CharacterClassEscape,
+    ) {
+        let kind = RegExpAstKind::CharacterClassEscape(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_unicode_property_escape<'a, V: Visit<'a>>(
+        visitor: &mut V,
+        it: &UnicodePropertyEscape<'a>,
+    ) {
+        let kind = RegExpAstKind::UnicodePropertyEscape(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_character_class<'a, V: Visit<'a>>(visitor: &mut V, it: &CharacterClass<'a>) {
+        let kind = RegExpAstKind::CharacterClass(visitor.alloc(it));
+        visitor.enter_node(kind);
+        for content in &it.body {
+            visitor.visit_character_class_contents(content);
+        }
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_character_class_contents<'a, V: Visit<'a>>(
+        visitor: &mut V,
+        it: &CharacterClassContents<'a>,
+    ) {
+        let kind = RegExpAstKind::CharacterClassContents(visitor.alloc(it));
+        visitor.enter_node(kind);
+        match it {
+            CharacterClassContents::CharacterClassRange(character_class_range) => {
+                visitor.visit_character_class_range(character_class_range);
+            }
+            CharacterClassContents::CharacterClassEscape(character_class_escape) => {
+                visitor.visit_character_class_escape(character_class_escape);
+            }
+            CharacterClassContents::UnicodePropertyEscape(unicode_property_escape) => {
+                visitor.visit_unicode_property_escape(unicode_property_escape);
+            }
+            CharacterClassContents::Character(character) => {
+                visitor.visit_character(character);
+            }
+            CharacterClassContents::NestedCharacterClass(character_class) => {
+                visitor.visit_character_class(character_class);
+            }
+            CharacterClassContents::ClassStringDisjunction(class_string_disjunction) => {
+                visitor.visit_character_class_string_disjunction(class_string_disjunction);
+            }
+        }
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_character_class_range<'a, V: Visit<'a>>(visitor: &mut V, it: &CharacterClassRange) {
+        let kind = RegExpAstKind::CharacterClassRange(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.visit_character(&it.min);
+        visitor.visit_character(&it.max);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_character_class_string_disjunction<'a, V: Visit<'a>>(
+        visitor: &mut V,
+        it: &ClassStringDisjunction<'a>,
+    ) {
+        let kind = RegExpAstKind::CharacterClassStringDisjunction(visitor.alloc(it));
+        visitor.enter_node(kind);
+        for string in &it.body {
+            visitor.visit_character_class_string(string);
+        }
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_character_class_string<'a, V: Visit<'a>>(visitor: &mut V, it: &ClassString<'a>) {
+        let kind = RegExpAstKind::CharacterClassString(visitor.alloc(it));
+        visitor.enter_node(kind);
+        for character in &it.body {
+            visitor.visit_character(character);
+        }
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_indexed_reference<'a, V: Visit<'a>>(visitor: &mut V, it: &IndexedReference) {
+        let kind = RegExpAstKind::IndexedReference(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.leave_node(kind);
+    }
+
+    #[inline]
+    pub fn walk_named_reference<'a, V: Visit<'a>>(visitor: &mut V, it: &NamedReference<'a>) {
+        let kind = RegExpAstKind::NamedReference(visitor.alloc(it));
+        visitor.enter_node(kind);
+        visitor.leave_node(kind);
+    }
+}