atom · maxbrunsfeld · Sep 20, 2016 · Aug 26, 2016
diff --git a/spec/onig-scanner-spec.coffee b/spec/onig-scanner-spec.coffee
@@ -63,6 +63,43 @@ describe "OnigScanner", ->
       match = scanner.findNextMatchSync('Возврат long_var_name;', 0)
       expect(match.captureIndices).toEqual [{index: 0, start: 0, end: 7, length: 7}]
 
+  describe "when the input string contains invalid surrogate pairs", ->
+    it "interprets them as a code point", ->
+      # Inputs are "X", one or two surrogate code units, then "X"; expected indices count UTF-16 code units.
+      scanner = new OnigScanner(["X"])
+      match = scanner.findNextMatchSync('X' + String.fromCharCode(0xd83c) + 'X', 0)
+      expect(match.captureIndices).toEqual [{index: 0, start: 0, end: 1, length: 1}]
+
+      # Starting on (offset 1) or just after (offset 2) the lone high surrogate finds the "X" at offset 2.
+      match = scanner.findNextMatchSync('X' + String.fromCharCode(0xd83c) + 'X', 1)
+      expect(match.captureIndices).toEqual [{index: 0, start: 2, end: 3, length: 1}]
+      match = scanner.findNextMatchSync('X' + String.fromCharCode(0xd83c) + 'X', 2)
+      expect(match.captureIndices).toEqual [{index: 0, start: 2, end: 3, length: 1}]
+
+      # Same three start offsets, this time with a lone low surrogate (0xdfff) in the middle.
+      match = scanner.findNextMatchSync('X' + String.fromCharCode(0xdfff) + 'X', 0)
+      expect(match.captureIndices).toEqual [{index: 0, start: 0, end: 1, length: 1}]
+      match = scanner.findNextMatchSync('X' + String.fromCharCode(0xdfff) + 'X', 1)
+      expect(match.captureIndices).toEqual [{index: 0, start: 2, end: 3, length: 1}]
+      match = scanner.findNextMatchSync('X' + String.fromCharCode(0xdfff) + 'X', 2)
+      expect(match.captureIndices).toEqual [{index: 0, start: 2, end: 3, length: 1}]
+
+      # These are actually valid pairs (the min & max); each spans two code units, so the last "X" is at 3.
+      match = scanner.findNextMatchSync('X' + String.fromCharCode(0xd800) + String.fromCharCode(0xdc00) + 'X', 2)
+      expect(match.captureIndices).toEqual [{index: 0, start: 3, end: 4, length: 1}]
+
+      match = scanner.findNextMatchSync('X' + String.fromCharCode(0xdbff) + String.fromCharCode(0xdfff) + 'X', 2)
+      expect(match.captureIndices).toEqual [{index: 0, start: 3, end: 4, length: 1}]
+
+  describe "when the start offset is out of bounds", ->
+    it "gets clamped", ->
+      scanner = new OnigScanner(["X"])
+      match = scanner.findNextMatchSync('X💻X', -1000)
+      expect(match.captureIndices).toEqual [{index: 0, start: 0, end: 1, length: 1}]
+      # A negative offset behaves like 0 (asserted above); one far past the end yields no match.
+      match = scanner.findNextMatchSync('X💻X', 1000)
+      expect(match).toEqual null
+
   describe "::findNextMatch", ->
     matchCallback = null
 

diff --git a/src/onig-string.cc b/src/onig-string.cc
@@ -35,50 +35,46 @@ OnigString::OnigString(Local<String> value)
     utf8OffsetToUtf16[utf8_length_] = utf16_length_;
 
     // http://stackoverflow.com/a/148766
-    unsigned int codepoint = 0;
-    int i16_codepoint_start = 0;
     int i8 = 0;
     for (int i16 = 0, len = utf16_length_; i16 < len; i16++) {
       uint16_t in = (*utf16Value)[i16];
 
-      utf16OffsetToUtf8[i16] = i8;
+      unsigned int codepoint = in;
+      bool wasSurrogatePair = false;
 
       if (in >= 0xd800 && in <= 0xdbff) {
-        codepoint = ((in - 0xd800) << 10) + 0x10000;
-      } else {
-        if (in >= 0xdc00 && in <= 0xdfff) {
-          codepoint |= in - 0xdc00;
-        } else {
-          codepoint = in;
+        // Hit a high surrogate, try to look for a matching low surrogate
+        if (i16 + 1 < len) {
+          uint16_t next = (*utf16Value)[i16 + 1];
+          if (next >= 0xdc00 && next <= 0xdfff) {
+            // Found the matching low surrogate
+            codepoint = (((in - 0xd800) << 10) + 0x10000) | (next - 0xdc00);
+            wasSurrogatePair = true;
+          }
         }
+      }
 
-        if (codepoint <= 0x7f) {
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-        } else if (codepoint <= 0x7ff) {
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-        } else if (codepoint <= 0xffff) {
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-        } else {
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-          utf8OffsetToUtf16[i8] = i16_codepoint_start;
-          i8++;
-        }
-        codepoint = 0;
-        i16_codepoint_start = i16 + 1;
+      utf16OffsetToUtf8[i16] = i8;
+
+      if (codepoint <= 0x7f) {
+        utf8OffsetToUtf16[i8++] = i16;
+      } else if (codepoint <= 0x7ff) {
+        utf8OffsetToUtf16[i8++] = i16;
+        utf8OffsetToUtf16[i8++] = i16;
+      } else if (codepoint <= 0xffff) {
+        utf8OffsetToUtf16[i8++] = i16;
+        utf8OffsetToUtf16[i8++] = i16;
+        utf8OffsetToUtf16[i8++] = i16;
+      } else {
+        utf8OffsetToUtf16[i8++] = i16;
+        utf8OffsetToUtf16[i8++] = i16;
+        utf8OffsetToUtf16[i8++] = i16;
+        utf8OffsetToUtf16[i8++] = i16;
+      }
+
+      if (wasSurrogatePair) {
+        utf16OffsetToUtf8[i16 + 1] = utf16OffsetToUtf8[i16];
+        i16++;
       }
     }
   }
@@ -93,13 +89,25 @@ OnigString::~OnigString() {
 
 int OnigString::ConvertUtf8OffsetToUtf16(int utf8Offset) {
   if (hasMultiByteChars) {
+    // Clamp offsets outside [0, utf8_length_] instead of indexing with them.
+    if (utf8Offset < 0) {
+      return 0;  // before the start of the string
+    } else if (static_cast<size_t>(utf8Offset) > utf8_length_) {
+      return utf16_length_;  // past the end of the string
+    }
     return utf8OffsetToUtf16[utf8Offset];
   }
   return utf8Offset;
 }
 
 int OnigString::ConvertUtf16OffsetToUtf8(int utf16Offset) {
   if (hasMultiByteChars) {
+    if (utf16Offset < 0) {
+      return 0;
+    }
+    if ((size_t)utf16Offset > utf16_length_) {
+      return utf8_length_;
+    }
     return utf16OffsetToUtf8[utf16Offset];
   }
   return utf16Offset;