diff --git a/crates/oxc_regular_expression/src/display.rs b/crates/oxc_regular_expression/src/display.rs
index 014877837853df..89f4752dbe4b60 100644
--- a/crates/oxc_regular_expression/src/display.rs
+++ b/crates/oxc_regular_expression/src/display.rs
@@ -1,8 +1,11 @@
-use std::fmt::{self, Display};
+use std::{
+    fmt::{self, Display},
+    iter::Peekable,
+};
 
 #[allow(clippy::wildcard_imports)]
 use crate::ast::*;
-use crate::body_parser::unicode::{is_lead_surrogate, is_syntax_character, is_trail_surrogate};
+use crate::body_parser::unicode::{combine_surrogate_pair, is_lead_surrogate, is_trail_surrogate};
 
 impl<'a> Display for RegularExpression<'a> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -50,7 +53,23 @@ impl<'a> Display for Disjunction<'a> {
 
 impl<'a> Display for Alternative<'a> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write_join(f, "", &self.body)
+        fn as_character<'a>(term: &'a Term) -> Option<&'a Character> {
+            if let Term::Character(ch) = term {
+                Some(ch)
+            } else {
+                None
+            }
+        }
+        write_join_with(f, "", &self.body, |iter| {
+            let next = iter.next()?;
+            let Some(next) = as_character(next) else { return Some(next.to_string()) };
+            let peek = iter.peek().and_then(|it| as_character(*it));
+            let (result, eat) = character_to_string(next, peek);
+            if eat {
+                _ = iter.next();
+            }
+            Some(result)
+        })
     }
 }
 
@@ -132,19 +151,8 @@ impl<'a> Display for Quantifier<'a> {
 
 impl Display for Character {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let cp = self.value;
-        // escape syntax characters
-        if is_syntax_character(cp) {
-            write!(f, r"\")?;
-        }
-
-        if is_lead_surrogate(cp) || is_trail_surrogate(cp) {
-            write!(f, r"\u{cp:X}")
-        } else {
-            // TODO: use `self.kind` to print the correct representation.
-            let Some(ch) = char::from_u32(cp) else { return Err(fmt::Error) };
-            write!(f, "{ch}")
-        }
+        let (string, _) = character_to_string(self, None);
+        write!(f, "{string}")
     }
 }
 
@@ -192,6 +200,13 @@ impl<'a> Display for UnicodePropertyEscape<'a> {
 
 impl<'a> Display for CharacterClass<'a> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fn as_character<'a>(content: &'a CharacterClassContents) -> Option<&'a Character> {
+            if let CharacterClassContents::Character(ch) = content {
+                Some(ch)
+            } else {
+                None
+            }
+        }
         write!(f, "[")?;
 
         if !self.body.is_empty() {
@@ -203,7 +218,16 @@ impl<'a> Display for CharacterClass<'a> {
                 CharacterClassContentsKind::Subtraction => "--",
                 CharacterClassContentsKind::Intersection => "&&",
             };
-            write_join(f, sep, &self.body)?;
+            write_join_with(f, sep, &self.body, |iter| {
+                let next = iter.next()?;
+                let Some(next) = as_character(next) else { return Some(next.to_string()) };
+                let peek = iter.peek().and_then(|it| as_character(*it));
+                let (result, eat) = character_to_string(next, peek);
+                if eat {
+                    _ = iter.next();
+                }
+                Some(result)
+            })?;
         }
 
         write!(f, "]")
@@ -231,7 +255,7 @@ impl Display for CharacterClassRange {
 
 impl<'a> Display for ClassStringDisjunction<'a> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "\\q{{")?;
+        write!(f, r"\q{{")?;
         write_join(f, "|", &self.body)?;
         write!(f, "}}")
     }
@@ -286,7 +310,7 @@ impl<'a> Display for IgnoreGroup<'a> {
 
 impl Display for IndexedReference {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "\\{}", self.index)
+        write!(f, r"\{}", self.index)
     }
 }
 
@@ -301,21 +325,112 @@ where
     S: AsRef<str>,
     E: Display,
     I: IntoIterator<Item = E>,
+{
+    write_join_with(f, sep, items, |iter| iter.next().map(|it| it.to_string()))
+}
+
+/// Writes `items` to `f` separated by `sep`, asking `next` to render one entry at a time.
+///
+/// `next` receives the peekable iterator so that it may look at, and consume, the item after
+/// the current one (this is how surrogate pairs end up as a single character). Returning
+/// `None` ends the output.
+fn write_join_with<S, I, E, F>(f: &mut fmt::Formatter<'_>, sep: S, items: I, next: F) -> fmt::Result
+where
+    S: AsRef<str>,
+    E: Display,
+    I: IntoIterator<Item = E>,
+    F: Fn(&mut Peekable<I::IntoIter>) -> Option<String>,
 {
     let sep = sep.as_ref();
-    let mut iter = items.into_iter();
+    let iter = &mut items.into_iter().peekable();
 
-    if let Some(first) = iter.next() {
+    if let Some(first) = next(iter) {
         write!(f, "{first}")?;
     }
 
-    for it in iter {
+    while let Some(it) = next(iter) {
         write!(f, "{sep}{it}")?;
     }
 
     Ok(())
 }
 
+/// Renders `this` the way it was written, following the escape style stored in `this.kind`.
+///
+/// `peek` is the character following `this`, if any. A literal lead surrogate followed by a
+/// literal trail surrogate is merged into a single code point; the returned flag is then `true`
+/// and the caller has to skip `peek`. Any other surrogate is printed as `\uXXXX`.
+///
+/// # Panics
+///
+/// Panics if the code point is not a valid `char`, or if a `CharacterKind::ControlLetter` value
+/// is outside of `1..=31`.
+fn character_to_string(
+    this: &Character,
+    peek: Option<&Character>,
+) -> (/* result */ String, /* true if peek is consumed */ bool) {
+    const USED_PEEK: bool = true;
+    const UNUSED_PEEK: bool = false;
+
+    if is_trail_surrogate(this.value) {
+        return (format!(r"\u{:X}", this.value), UNUSED_PEEK);
+    }
+
+    let (cp, peeked) = if is_lead_surrogate(this.value) {
+        match peek {
+            // Only literal halves are merged: escaped halves such as `\uD83E\uDD79` must stay
+            // as they are, `\u{1F979}` is not an escape outside of unicode mode.
+            Some(peek)
+                if is_trail_surrogate(peek.value)
+                    && matches!(this.kind, CharacterKind::Symbol)
+                    && matches!(peek.kind, CharacterKind::Symbol) =>
+            {
+                (combine_surrogate_pair(this.value, peek.value), USED_PEEK)
+            }
+            _ => return (format!(r"\u{:X}", this.value), UNUSED_PEEK),
+        }
+    } else {
+        (this.value, UNUSED_PEEK)
+    };
+
+    // NOTE: even if `ch` is unused this conversion makes sure that we are looking at a correct value.
+    let Some(ch) = char::from_u32(cp) else { panic!("Invalid `Character` `{cp}`!") };
+    let result = match this.kind {
+        // `\cX` denotes the code point of `X` modulo 32, adding `0x40` gets the letter back.
+        CharacterKind::ControlLetter => match u8::try_from(cp) {
+            Ok(value @ 1..=31) => format!(r"\c{}", char::from(0x40 + value)),
+            _ => panic!("Unknown control letter `{ch}`"),
+        },
+        CharacterKind::Identifier => format!(r"\{ch}"),
+        CharacterKind::Symbol => ch.to_string(),
+        CharacterKind::Null => String::from(r"\0"),
+        CharacterKind::UnicodeEscape => {
+            // `\u` takes exactly 4 hex digits, anything longer needs the braced form.
+            let hex = format!("{cp:04X}");
+            if hex.len() <= 4 {
+                format!(r"\u{hex}")
+            } else {
+                format!(r"\u{{{hex}}}")
+            }
+        }
+        // `\x` takes exactly 2 hex digits.
+        CharacterKind::HexadecimalEscape => format!(r"\x{cp:02X}"),
+        CharacterKind::Octal => format!(r"\{cp:o}"),
+        CharacterKind::SingleEscape => match ch {
+            '\n' => String::from(r"\n"),
+            '\r' => String::from(r"\r"),
+            '\t' => String::from(r"\t"),
+            '\u{b}' => String::from(r"\v"),
+            '\u{c}' => String::from(r"\f"),
+            '\u{8}' => String::from(r"\b"),
+            '\u{2D}' => String::from(r"\-"),
+            _ => format!(r"\{ch}"),
+        },
+    };
+
+    (result, peeked)
+}
+
 #[cfg(test)]
 mod test {
     use oxc_allocator::Allocator;
@@ -338,30 +453,35 @@ mod test {
         (r"/[a[b[c[d[e[f[g[h[i[j[k[l]]]]]]]]]]]]/v", None),
         (r"/[\q{abc|d|e|}]/v", None),
         (r"/\p{Basic_Emoji}/v", None),
+        (r"/[|\]]/", None),
+        (r"/c\]/", None),
+        ("/a{0}|b{1,2}|c{3,}/i", None),
+        (r"/Em🥹j/", None),
+        (r"/\n\cM\0\x41\./", None),
+        (r"/\n\cM\0\x41\u1234\./u", None),
+        (r"/[\bb]/", None),
+        // zero padded escapes, control letters and `\f`/`\v` have to survive a round trip.
+        (r"/\cA\f\v\x0A\u0041/", None),
         // we lose the flags ordering
         ("/abcd/igv", Some("/abcd/giv")),
         (r"/\d/ug", Some(r"/\d/gu")),
-        // we always display "syntax characters" as their escaped form.
-        (r"/c]/", Some(r"/c\]/")),
-        (r"/[|\]]/", Some(r"/[\|\]]/")),
-        ("/a{0}|b{1,2}|c{3,}/i", None),
-        // NOTE: surrogated characters can't be displayed without access to the next character so we
-        // can't print them correctly via `Display` trait. Instead we print their unicode code point
-        (r"/Em🥹j/", Some(r"/Em\uD83E\uDD79j/")),
-        // TODO: currently we do **NOT** respect the `Character::kind` field so everything is
-        // lowered to their final form.
-        (r"/\n\cM\0\x41\./", Some("/\n\r\0A\\./")),
-        (r"/\n\cM\0\x41\u1234\./u", Some("/\n\r\0Aሴ\\./u")),
-        (r"/\n\cM\0\x41\u{1f600}\./u", Some("/\n\r\0A😀\\./u")),
-        // TODO: `\b` is parsed as `\u{8}` which is wrong.
-        // (r"/[\bb]/", Some(r"/[\bb]/")),
+        // NOTE: we capitalize all hex values.
+        (r"/\n\cM\0\x41\u{1f600}\./u", Some(r"/\n\cM\0\x41\u{1F600}\./u")),
+        // TODO: fails with unicode mode
+        // (r"/c]/", None),
     ];
 
     fn test_display(allocator: &Allocator, source: &str, expect: Option<&str>) {
         use crate::{Parser, ParserOptions};
         let expect = expect.unwrap_or(source);
-        let parsed = Parser::new(allocator, source, ParserOptions::default()).parse().unwrap();
-        assert_eq!(expect, parsed.to_string());
+        let parsed_utf16 =
+            Parser::new(allocator, source, ParserOptions::default()).parse().unwrap();
+        let parsed_unicode =
+            Parser::new(allocator, source, ParserOptions::default().with_unicode_mode())
+                .parse()
+                .unwrap();
+        assert_eq!(expect, parsed_utf16.to_string());
+        assert_eq!(expect, parsed_unicode.to_string());
     }
 
     #[test]