Revert of [regexp] Support unicode capture names in non-unicode patte…

…rns (patchset #3 id:40001 of https://codereview.chromium.org/2791163003/ ) Reason for revert: The decision for the specification was to not have this syntax, and instead the syntax before this patch. Original issue's description: > [regexp] Support unicode capture names in non-unicode patterns > > This ensures that capture names containing surrogate pairs are parsed > correctly even in non-unicode RegExp patterns by introducing a new > scanning mode which unconditionally combines surrogate pairs. > > BUG=v8:5437,v8:6192 > > Review-Url: https://codereview.chromium.org/2791163003 > Cr-Commit-Position: refs/heads/master@{#44466} > Committed: https://chromium.googlesource.com/v8/v8/+/a8651c5671dd6e41595ffb438e7ea013082f2d38 R=yangguo@chromium.org,jgruber@chromium.org # Not skipping CQ checks because original CL landed more than 1 days ago. BUG=v8:5437,v8:6192 Review-Url: https://codereview.chromium.org/2859933003 Cr-Commit-Position: refs/heads/master@{#45088}
zhuzhuaicoding · May 4, 2017 · f918404 · f918404
1 parent 6b4e8c2
commit f918404
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 29 deletions.
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
@@ -46,13 +46,13 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
   Advance();
 }
 
-inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
+template <bool update_position>
+inline uc32 RegExpParser::ReadNext() {
   int position = next_pos_;
   uc32 c0 = in()->Get(position);
   position++;
-  const bool try_combine_surrogate_pairs =
-      (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
-  if (try_combine_surrogate_pairs && position < in()->length() &&
+  // Read the whole surrogate pair in case of unicode flag, if possible.
+  if (unicode() && position < in()->length() &&
       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
     uc16 c1 = in()->Get(position);
     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
@@ -67,13 +67,13 @@ inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
 
 uc32 RegExpParser::Next() {
   if (has_next()) {
-    return ReadNext(false, ScanMode::DEFAULT);
+    return ReadNext<false>();
   } else {
     return kEndMarker;
   }
 }
 
-void RegExpParser::Advance(ScanMode mode) {
+void RegExpParser::Advance() {
   if (has_next()) {
     StackLimitCheck check(isolate());
     if (check.HasOverflowed()) {
@@ -83,7 +83,7 @@ void RegExpParser::Advance(ScanMode mode) {
     } else if (zone()->excess_allocation()) {
       ReportError(CStrVector("Regular expression too large"));
     } else {
-      current_ = ReadNext(true, mode);
+      current_ = ReadNext<true>();
     }
   } else {
     current_ = kEndMarker;
@@ -101,9 +101,9 @@ void RegExpParser::Reset(int pos) {
   Advance();
 }
 
-void RegExpParser::Advance(int dist, ScanMode mode) {
+void RegExpParser::Advance(int dist) {
   next_pos_ += dist - 1;
-  Advance(mode);
+  Advance();
 }
 
 
@@ -326,6 +326,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
               if (FLAG_harmony_regexp_named_captures) {
                 has_named_captures_ = true;
                 is_named_capture = true;
+                Advance();
                 break;
               }
             // Fall through.
@@ -761,24 +762,18 @@ static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
 
 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
   DCHECK(FLAG_harmony_regexp_named_captures);
-  DCHECK_EQ(current(), '<');
 
   ZoneVector<uc16>* name =
       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
 
-  // Capture names can always contain surrogate pairs, and we need to scan
-  // accordingly.
-  const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
-  Advance(scan_mode);
-
   bool at_start = true;
   while (true) {
     uc32 c = current();
-    Advance(scan_mode);
+    Advance();
 
     // Convert unicode escapes.
     if (c == '\\' && current() == 'u') {
-      Advance(scan_mode);
+      Advance();
       if (!ParseUnicodeEscape(&c)) {
         ReportError(CStrVector("Invalid Unicode escape sequence"));
         return nullptr;
@@ -849,6 +844,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
     return false;
   }
 
+  Advance();
   const ZoneVector<uc16>* name = ParseCaptureGroupName();
   if (name == nullptr) {
     return false;

diff --git a/src/regexp/regexp-parser.h b/src/regexp/regexp-parser.h
@@ -184,18 +184,11 @@ class RegExpParser BASE_EMBEDDED {
   // can be reparsed.
   bool ParseBackReferenceIndex(int* index_out);
 
-  // The default behavior is to combine surrogate pairs in unicode mode and
-  // don't combine them otherwise (a quantifier after a surrogate pair would
-  // then apply only to the trailing surrogate). Forcing combination is required
-  // when parsing capture names since they can always legally contain surrogate
-  // pairs.
-  enum class ScanMode { DEFAULT, FORCE_COMBINE_SURROGATE_PAIRS };
-
   bool ParseClassProperty(ZoneList<CharacterRange>* result);
   CharacterRange ParseClassAtom(uc16* char_class);
   RegExpTree* ReportError(Vector<const char> message);
-  void Advance(ScanMode mode = ScanMode::DEFAULT);
-  void Advance(int dist, ScanMode mode = ScanMode::DEFAULT);
+  void Advance();
+  void Advance(int dist);
   void Reset(int pos);
 
   // Reports whether the pattern might be used as a literal search string.
@@ -311,7 +304,8 @@ class RegExpParser BASE_EMBEDDED {
   bool has_more() { return has_more_; }
   bool has_next() { return next_pos_ < in()->length(); }
   uc32 Next();
-  uc32 ReadNext(bool update_position, ScanMode mode);
+  template <bool update_position>
+  uc32 ReadNext();
   FlatStringReader* in() { return in_; }
   void ScanForCaptures();
 

diff --git a/test/mjsunit/harmony/regexp-named-captures.js b/test/mjsunit/harmony/regexp-named-captures.js
@@ -147,7 +147,7 @@ assertThrows('/(?<𐒤>a)/u', SyntaxError);  // ID_Continue but not ID_Start.
 assertEquals("a", /(?<π>a)/.exec("bab").groups.π);
 assertEquals("a", /(?<$>a)/.exec("bab").groups.$);
 assertEquals("a", /(?<_>a)/.exec("bab").groups._);
-assertEquals("a", /(?<$𐒤>a)/.exec("bab").groups.$𐒤);
+assertThrows("/(?<$𐒤>a)/", SyntaxError);
 assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ);
 assertThrows('/(?<❤>a)/', SyntaxError);
 assertThrows('/(?<𐒤>a)/', SyntaxError);  // ID_Continue but not ID_Start.
@@ -219,7 +219,7 @@ assertThrows("/(?<a\uDCA4>.)/", SyntaxError);  // Trail
 assertThrows("/(?<\\u{0041}>.)/", SyntaxError);  // Non-surrogate
 assertThrows("/(?<a\\u{104A4}>.)/", SyntaxError);  // Surrogate, ID_Continue
 assertTrue(RegExp("(?<\u{0041}>.)").test("a"));  // Non-surrogate
-assertTrue(RegExp("(?<a\u{104A4}>.)").test("a"));  // Surrogate, ID_Continue
+assertThrows("(?<a\u{104A4}>.)", SyntaxError);  // Surrogate, ID_Continue
 assertTrue(RegExp("(?<\\u0041>.)").test("a"));  // Non-surrogate
 
 // @@replace with a callable replacement argument (no named captures).