Skip to content

Commit

Permalink
fix: bugs.
Browse files Browse the repository at this point in the history
  • Loading branch information
rzvxa committed Aug 29, 2024
1 parent 02710dc commit f3723fb
Showing 1 changed file with 145 additions and 38 deletions.
183 changes: 145 additions & 38 deletions crates/oxc_regular_expression/src/display.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use std::fmt::{self, Display};
use std::{
fmt::{self, Display},
iter::Peekable,
};

#[allow(clippy::wildcard_imports)]
use crate::ast::*;
use crate::body_parser::unicode::{is_lead_surrogate, is_syntax_character, is_trail_surrogate};
use crate::body_parser::unicode::{combine_surrogate_pair, is_lead_surrogate, is_trail_surrogate};

impl<'a> Display for RegularExpression<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
Expand Down Expand Up @@ -50,7 +53,23 @@ impl<'a> Display for Disjunction<'a> {

impl<'a> Display for Alternative<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write_join(f, "", &self.body)
fn as_character<'a>(term: &'a Term) -> Option<&'a Character> {
if let Term::Character(ch) = term {
Some(ch)
} else {
None
}
}
write_join_with(f, "", &self.body, |iter| {
let next = iter.next()?;
let Some(next) = as_character(next) else { return Some(next.to_string()) };
let peek = iter.peek().and_then(|it| as_character(*it));
let (result, eat) = character_to_string(next, peek);
if eat {
_ = iter.next();
}
Some(result)
})
}
}

Expand Down Expand Up @@ -132,19 +151,8 @@ impl<'a> Display for Quantifier<'a> {

impl Display for Character {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let cp = self.value;
// escape syntax characters
if is_syntax_character(cp) {
write!(f, r"\")?;
}

if is_lead_surrogate(cp) || is_trail_surrogate(cp) {
write!(f, r"\u{cp:X}")
} else {
// TODO: use `self.kind` to print the correct representation.
let Some(ch) = char::from_u32(cp) else { return Err(fmt::Error) };
write!(f, "{ch}")
}
let (string, _) = character_to_string(self, None);
write!(f, "{string}")
}
}

Expand Down Expand Up @@ -192,6 +200,13 @@ impl<'a> Display for UnicodePropertyEscape<'a> {

impl<'a> Display for CharacterClass<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fn as_character<'a>(content: &'a CharacterClassContents) -> Option<&'a Character> {
if let CharacterClassContents::Character(ch) = content {
Some(ch)
} else {
None
}
}
write!(f, "[")?;

if !self.body.is_empty() {
Expand All @@ -203,7 +218,16 @@ impl<'a> Display for CharacterClass<'a> {
CharacterClassContentsKind::Subtraction => "--",
CharacterClassContentsKind::Intersection => "&&",
};
write_join(f, sep, &self.body)?;
write_join_with(f, sep, &self.body, |iter| {
let next = iter.next()?;
let Some(next) = as_character(next) else { return Some(next.to_string()) };
let peek = iter.peek().and_then(|it| as_character(*it));
let (result, eat) = character_to_string(next, peek);
if eat {
_ = iter.next();
}
Some(result)
})?;
}

write!(f, "]")
Expand Down Expand Up @@ -231,7 +255,7 @@ impl Display for CharacterClassRange {

impl<'a> Display for ClassStringDisjunction<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\\q{{")?;
write!(f, r"\q{{")?;
write_join(f, "|", &self.body)?;
write!(f, "}}")
}
Expand Down Expand Up @@ -286,7 +310,7 @@ impl<'a> Display for IgnoreGroup<'a> {

impl Display for IndexedReference {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "\\{}", self.index)
write!(f, r"\{}", self.index)
}
}

Expand All @@ -301,21 +325,101 @@ where
S: AsRef<str>,
E: Display,
I: IntoIterator<Item = E>,
{
write_join_with(f, sep, items, |iter| iter.next().map(|it| it.to_string()))
}

fn write_join_with<S, I, E, F>(f: &mut fmt::Formatter<'_>, sep: S, items: I, next: F) -> fmt::Result
where
S: AsRef<str>,
E: Display,
I: IntoIterator<Item = E>,
F: Fn(&mut Peekable<I::IntoIter>) -> Option<String>,
{
let sep = sep.as_ref();
let mut iter = items.into_iter();
let iter = &mut items.into_iter().peekable();

if let Some(first) = iter.next() {
if let Some(first) = next(iter) {
write!(f, "{first}")?;
}

for it in iter {
while let Some(it) = next(iter) {
write!(f, "{sep}{it}")?;
}

Ok(())
}

/// Renders one [`Character`] as regular-expression source text.
///
/// `peek` is the character that follows `this` in the pattern, if any. It is
/// only inspected when `this` is a lead surrogate: if `peek` is a trail
/// surrogate the two are combined into a single code point and printed
/// together.
///
/// Returns the rendered text and a flag that is `true` when `peek` was
/// consumed as part of the output, in which case the caller must skip it.
///
/// # Panics
///
/// Panics if the resulting code point is not a valid `char`, or if a
/// `CharacterKind::ControlLetter` character holds a value outside `1..=26`.
fn character_to_string(
    this: &Character,
    peek: Option<&Character>,
) -> (/* result */ String, /* true if peek is consumed */ bool) {
    const USED_PEEK: bool = true;
    const UNUSED_PEEK: bool = false;

    // A trail surrogate seen here was not consumed together with a preceding
    // lead surrogate, so it is printed as an escape on its own.
    if is_trail_surrogate(this.value) {
        return (format!(r"\u{:X}", this.value), UNUSED_PEEK);
    }

    let (cp, peeked) = if is_lead_surrogate(this.value) {
        match peek {
            Some(trail) if is_trail_surrogate(trail.value) => {
                (combine_surrogate_pair(this.value, trail.value), USED_PEEK)
            }
            // Lone lead surrogate: it has no `char` representation, escape it.
            _ => return (format!(r"\u{:X}", this.value), UNUSED_PEEK),
        }
    } else {
        (this.value, UNUSED_PEEK)
    };

    // NOTE: even if `ch` is unused this conversion makes sure that we are looking at a correct value.
    let Some(ch) = char::from_u32(cp) else { panic!("Invalid `Character` `{cp}`!") };
    let result = match this.kind {
        // `\cA`..=`\cZ` denote the code points 1..=26, so the letter is
        // recovered by adding 0x40 (e.g. U+000C -> 'L', U+000B -> 'K').
        CharacterKind::ControlLetter => match char::from_u32(cp + 0x40) {
            Some(letter @ 'A'..='Z') => format!(r"\c{letter}"),
            _ => panic!("Unknown control letter `{ch}`"),
        },
        CharacterKind::Identifier => format!(r"\{ch}"),
        CharacterKind::Symbol => ch.to_string(),
        CharacterKind::Null => String::from(r"\0"),
        CharacterKind::UnicodeEscape => {
            // `\uXXXX` requires exactly four hex digits, so short values are
            // zero-padded; anything above the BMP needs the braced form.
            if cp <= 0xFFFF {
                format!(r"\u{cp:04X}")
            } else {
                format!(r"\u{{{cp:X}}}")
            }
        }
        // `\xXX` requires exactly two hex digits, hence the zero padding.
        CharacterKind::HexadecimalEscape => format!(r"\x{cp:02X}"),
        CharacterKind::Octal => format!(r"\{cp:o}"),
        CharacterKind::SingleEscape => match ch {
            '\n' => String::from(r"\n"),
            '\r' => String::from(r"\r"),
            '\t' => String::from(r"\t"),
            '\u{b}' => String::from(r"\v"),
            '\u{c}' => String::from(r"\f"),
            '\u{8}' => String::from(r"\b"),
            '\u{2D}' => String::from(r"\-"),
            _ => format!(r"\{ch}"),
        },
    };

    (result, peeked)
}

#[cfg(test)]
mod test {
use oxc_allocator::Allocator;
Expand All @@ -338,30 +442,33 @@ mod test {
(r"/[a[b[c[d[e[f[g[h[i[j[k[l]]]]]]]]]]]]/v", None),
(r"/[\q{abc|d|e|}]/v", None),
(r"/\p{Basic_Emoji}/v", None),
(r"/[|\]]/", None),
(r"/c\]/", None),
("/a{0}|b{1,2}|c{3,}/i", None),
(r"/Em🥹j/", None),
(r"/\n\cM\0\x41\./", None),
(r"/\n\cM\0\x41\u1234\./u", None),
(r"/[\bb]/", None),
// we lose the flags ordering
("/abcd/igv", Some("/abcd/giv")),
(r"/\d/ug", Some(r"/\d/gu")),
// we always display "syntax characters" as their escaped form.
(r"/c]/", Some(r"/c\]/")),
(r"/[|\]]/", Some(r"/[\|\]]/")),
("/a{0}|b{1,2}|c{3,}/i", None),
// NOTE: surrogated characters can't be displayed without access to the next character so we
// can't print them correctly via `Display` trait. Instead we print their unicode code point
(r"/Em🥹j/", Some(r"/Em\uD83E\uDD79j/")),
// TODO: currently we do **NOT** respect the `Character::kind` field so everything is
// lowered to their final form.
(r"/\n\cM\0\x41\./", Some("/\n\r\0A\\./")),
(r"/\n\cM\0\x41\u1234\./u", Some("/\n\r\0Aሴ\\./u")),
(r"/\n\cM\0\x41\u{1f600}\./u", Some("/\n\r\0A😀\\./u")),
// TODO: `\b` is parsed as `\u{8}` which is wrong.
// (r"/[\bb]/", Some(r"/[\bb]/")),
// NOTE: we capitalize all hex values.
(r"/\n\cM\0\x41\u{1f600}\./u", Some(r"/\n\cM\0\x41\u{1F600}\./u")),
// TODO: fails with unicode mode
// (r"/c]/", None),
];

fn test_display(allocator: &Allocator, source: &str, expect: Option<&str>) {
use crate::{Parser, ParserOptions};
let expect = expect.unwrap_or(source);
let parsed = Parser::new(allocator, source, ParserOptions::default()).parse().unwrap();
assert_eq!(expect, parsed.to_string());
let parsed_utf16 =
Parser::new(allocator, source, ParserOptions::default()).parse().unwrap();
let parsed_unicode =
Parser::new(allocator, source, ParserOptions::default().with_unicode_mode())
.parse()
.unwrap();
assert_eq!(expect, parsed_utf16.to_string());
assert_eq!(expect, parsed_unicode.to_string());
}

#[test]
Expand Down

0 comments on commit f3723fb

Please sign in to comment.