Skip to content

Commit

Permalink
Revert of [regexp] Support unicode capture names in non-unicode patte…
Browse files Browse the repository at this point in the history
…rns (patchset #3 id:40001 of https://codereview.chromium.org/2791163003/ )

Reason for revert:
The decision for the specification was to not have this syntax, and instead the syntax before this patch.

Original issue's description:
> [regexp] Support unicode capture names in non-unicode patterns
>
> This ensures that capture names containing surrogate pairs are parsed
> correctly even in non-unicode RegExp patterns by introducing a new
> scanning mode which unconditionally combines surrogate pairs.
>
> BUG=v8:5437,v8:6192
>
> Review-Url: https://codereview.chromium.org/2791163003
> Cr-Commit-Position: refs/heads/master@{#44466}
> Committed: https://chromium.googlesource.com/v8/v8/+/a8651c5671dd6e41595ffb438e7ea013082f2d38

R=yangguo@chromium.org,jgruber@chromium.org
# Not skipping CQ checks because original CL landed more than 1 days ago.
BUG=v8:5437,v8:6192

Review-Url: https://codereview.chromium.org/2859933003
Cr-Commit-Position: refs/heads/master@{#45088}
  • Loading branch information
littledan authored and Commit bot committed May 4, 2017
1 parent 6b4e8c2 commit f918404
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 29 deletions.
30 changes: 13 additions & 17 deletions src/regexp/regexp-parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,13 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
Advance();
}

inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
template <bool update_position>
inline uc32 RegExpParser::ReadNext() {
int position = next_pos_;
uc32 c0 = in()->Get(position);
position++;
const bool try_combine_surrogate_pairs =
(unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
if (try_combine_surrogate_pairs && position < in()->length() &&
// Read the whole surrogate pair in case of unicode flag, if possible.
if (unicode() && position < in()->length() &&
unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
uc16 c1 = in()->Get(position);
if (unibrow::Utf16::IsTrailSurrogate(c1)) {
Expand All @@ -67,13 +67,13 @@ inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {

uc32 RegExpParser::Next() {
if (has_next()) {
return ReadNext(false, ScanMode::DEFAULT);
return ReadNext<false>();
} else {
return kEndMarker;
}
}

void RegExpParser::Advance(ScanMode mode) {
void RegExpParser::Advance() {
if (has_next()) {
StackLimitCheck check(isolate());
if (check.HasOverflowed()) {
Expand All @@ -83,7 +83,7 @@ void RegExpParser::Advance(ScanMode mode) {
} else if (zone()->excess_allocation()) {
ReportError(CStrVector("Regular expression too large"));
} else {
current_ = ReadNext(true, mode);
current_ = ReadNext<true>();
}
} else {
current_ = kEndMarker;
Expand All @@ -101,9 +101,9 @@ void RegExpParser::Reset(int pos) {
Advance();
}

void RegExpParser::Advance(int dist, ScanMode mode) {
void RegExpParser::Advance(int dist) {
next_pos_ += dist - 1;
Advance(mode);
Advance();
}


Expand Down Expand Up @@ -326,6 +326,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
if (FLAG_harmony_regexp_named_captures) {
has_named_captures_ = true;
is_named_capture = true;
Advance();
break;
}
// Fall through.
Expand Down Expand Up @@ -761,24 +762,18 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {

const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
DCHECK(FLAG_harmony_regexp_named_captures);
DCHECK_EQ(current(), '<');

ZoneVector<uc16>* name =
new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());

// Capture names can always contain surrogate pairs, and we need to scan
// accordingly.
const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
Advance(scan_mode);

bool at_start = true;
while (true) {
uc32 c = current();
Advance(scan_mode);
Advance();

// Convert unicode escapes.
if (c == '\\' && current() == 'u') {
Advance(scan_mode);
Advance();
if (!ParseUnicodeEscape(&c)) {
ReportError(CStrVector("Invalid Unicode escape sequence"));
return nullptr;
Expand Down Expand Up @@ -849,6 +844,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
return false;
}

Advance();
const ZoneVector<uc16>* name = ParseCaptureGroupName();
if (name == nullptr) {
return false;
Expand Down
14 changes: 4 additions & 10 deletions src/regexp/regexp-parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,18 +184,11 @@ class RegExpParser BASE_EMBEDDED {
// can be reparsed.
bool ParseBackReferenceIndex(int* index_out);

// The default behavior is to combine surrogate pairs in unicode mode and
// don't combine them otherwise (a quantifier after a surrogate pair would
// then apply only to the trailing surrogate). Forcing combination is required
// when parsing capture names since they can always legally contain surrogate
// pairs.
enum class ScanMode { DEFAULT, FORCE_COMBINE_SURROGATE_PAIRS };

bool ParseClassProperty(ZoneList<CharacterRange>* result);
CharacterRange ParseClassAtom(uc16* char_class);
RegExpTree* ReportError(Vector<const char> message);
void Advance(ScanMode mode = ScanMode::DEFAULT);
void Advance(int dist, ScanMode mode = ScanMode::DEFAULT);
void Advance();
void Advance(int dist);
void Reset(int pos);

// Reports whether the pattern might be used as a literal search string.
Expand Down Expand Up @@ -311,7 +304,8 @@ class RegExpParser BASE_EMBEDDED {
bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
uc32 ReadNext(bool update_position, ScanMode mode);
template <bool update_position>
uc32 ReadNext();
FlatStringReader* in() { return in_; }
void ScanForCaptures();

Expand Down
4 changes: 2 additions & 2 deletions test/mjsunit/harmony/regexp-named-captures.js
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start.
assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
assertEquals("a", /(?<_>a)/.exec("bab").groups._);
assertEquals("a", /(?<$𐒤>a)/.exec("bab").groups.$𐒤);
assertThrows("/(?<$𐒤>a)/", SyntaxError);
assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ);
assertThrows('/(?<❤>a)/', SyntaxError);
assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start.
Expand Down Expand Up @@ -219,7 +219,7 @@ assertThrows("/(?<a\uDCA4>.)/", SyntaxError); // Trail
assertThrows("/(?<\\u{0041}>.)/", SyntaxError); // Non-surrogate
assertThrows("/(?<a\\u{104A4}>.)/", SyntaxError); // Surrogate, ID_Continue
assertTrue(RegExp("(?<\u{0041}>.)").test("a")); // Non-surrogate
assertTrue(RegExp("(?<a\u{104A4}>.)").test("a")); // Surrogate, ID_Continue
assertThrows("(?<a\u{104A4}>.)", SyntaxError); // Surrogate, ID_Continue
assertTrue(RegExp("(?<\\u0041>.)").test("a")); // Non-surrogate

// @@replace with a callable replacement argument (no named captures).
Expand Down

0 comments on commit f918404

Please sign in to comment.