Skip to content
This repository has been archived by the owner on Dec 15, 2022. It is now read-only.

Fixes #56: Do not ignore unmatched high surrogates, check index is within bounds #57

Merged
merged 1 commit into from
Sep 20, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions spec/onig-scanner-spec.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,43 @@ describe "OnigScanner", ->
match = scanner.findNextMatchSync('Возврат long_var_name;', 0)
expect(match.captureIndices).toEqual [{index: 0, start: 0, end: 7, length: 7}]

# Spec group: matching against input that contains UTF-16 surrogate code
# units, both unpaired (invalid) and correctly paired (valid boundary values).
# Every expectation checks the UTF-16 start/end indices reported for the "X"
# pattern, so an unpaired surrogate must still occupy exactly one index.
describe "when the input string contains invalid surrogate pairs", ->
it "interprets them as a code point", ->
scanner = new OnigScanner(["X"])
# 0xd83c is a high surrogate with no low surrogate following it.
# The string is: 'X' (index 0), lone surrogate (index 1), 'X' (index 2).
match = scanner.findNextMatchSync('X' + String.fromCharCode(0xd83c) + 'X', 0)
expect(match.captureIndices).toEqual [{index: 0, start: 0, end: 1, length: 1}]

# Starting on the lone surrogate itself: the next "X" is the one at index 2.
match = scanner.findNextMatchSync('X' + String.fromCharCode(0xd83c) + 'X', 1)
expect(match.captureIndices).toEqual [{index: 0, start: 2, end: 3, length: 1}]

# Starting directly on the second "X".
match = scanner.findNextMatchSync('X' + String.fromCharCode(0xd83c) + 'X', 2)
expect(match.captureIndices).toEqual [{index: 0, start: 2, end: 3, length: 1}]

# 0xdfff is a low surrogate with no high surrogate preceding it; the same
# three start offsets are expected to give the same results as above.
match = scanner.findNextMatchSync('X' + String.fromCharCode(0xdfff) + 'X', 0)
expect(match.captureIndices).toEqual [{index: 0, start: 0, end: 1, length: 1}]

match = scanner.findNextMatchSync('X' + String.fromCharCode(0xdfff) + 'X', 1)
expect(match.captureIndices).toEqual [{index: 0, start: 2, end: 3, length: 1}]

match = scanner.findNextMatchSync('X' + String.fromCharCode(0xdfff) + 'X', 2)
expect(match.captureIndices).toEqual [{index: 0, start: 2, end: 3, length: 1}]

# These are actually valid, just testing the min & max
# (0xd800 + 0xdc00 are the lowest high/low surrogates, 0xdbff + 0xdfff the
# highest). The pair takes indices 1 and 2, so the second "X" sits at index 3;
# start offset 2 points at the low-surrogate half of the pair.
match = scanner.findNextMatchSync('X' + String.fromCharCode(0xd800) + String.fromCharCode(0xdc00) + 'X', 2)
expect(match.captureIndices).toEqual [{index: 0, start: 3, end: 4, length: 1}]

match = scanner.findNextMatchSync('X' + String.fromCharCode(0xdbff) + String.fromCharCode(0xdfff) + 'X', 2)
expect(match.captureIndices).toEqual [{index: 0, start: 3, end: 4, length: 1}]

# Spec group: start offsets that fall outside the input string must not crash
# or read out of range.
# NOTE(review): the emoji presumably ensures the string has multi-byte
# characters, since the offset clamping visible in onig-string.cc only happens
# on that path — confirm.
describe "when the start offset is out of bounds", ->
it "it gets clamped", ->
scanner = new OnigScanner(["X"])
# A negative start offset is expected to behave like starting at index 0,
# so the first "X" is found.
match = scanner.findNextMatchSync('X💻X', -1000)
expect(match.captureIndices).toEqual [{index: 0, start: 0, end: 1, length: 1}]

# A start offset past the end of the string leaves nothing to search,
# so no match is expected.
match = scanner.findNextMatchSync('X💻X', 1000)
expect(match).toEqual null

describe "::findNextMatch", ->
matchCallback = null

Expand Down
80 changes: 44 additions & 36 deletions src/onig-string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,50 +35,46 @@ OnigString::OnigString(Local<String> value)
utf8OffsetToUtf16[utf8_length_] = utf16_length_;

// http://stackoverflow.com/a/148766
unsigned int codepoint = 0;
int i16_codepoint_start = 0;
int i8 = 0;
for (int i16 = 0, len = utf16_length_; i16 < len; i16++) {
uint16_t in = (*utf16Value)[i16];

utf16OffsetToUtf8[i16] = i8;
unsigned int codepoint = in;
bool wasSurrogatePair = false;

if (in >= 0xd800 && in <= 0xdbff) {
codepoint = ((in - 0xd800) << 10) + 0x10000;
} else {
if (in >= 0xdc00 && in <= 0xdfff) {
codepoint |= in - 0xdc00;
} else {
codepoint = in;
// Hit a high surrogate, try to look for a matching low surrogate
if (i16 + 1 < len) {
uint16_t next = (*utf16Value)[i16 + 1];
if (next >= 0xdc00 && next <= 0xdfff) {
// Found the matching low surrogate
codepoint = (((in - 0xd800) << 10) + 0x10000) | (next - 0xdc00);
wasSurrogatePair = true;
}
}
}

if (codepoint <= 0x7f) {
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
} else if (codepoint <= 0x7ff) {
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
} else if (codepoint <= 0xffff) {
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
} else {
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
}
codepoint = 0;
i16_codepoint_start = i16 + 1;
utf16OffsetToUtf8[i16] = i8;

if (codepoint <= 0x7f) {
utf8OffsetToUtf16[i8++] = i16;
} else if (codepoint <= 0x7ff) {
utf8OffsetToUtf16[i8++] = i16;
utf8OffsetToUtf16[i8++] = i16;
} else if (codepoint <= 0xffff) {
utf8OffsetToUtf16[i8++] = i16;
utf8OffsetToUtf16[i8++] = i16;
utf8OffsetToUtf16[i8++] = i16;
} else {
utf8OffsetToUtf16[i8++] = i16;
utf8OffsetToUtf16[i8++] = i16;
utf8OffsetToUtf16[i8++] = i16;
utf8OffsetToUtf16[i8++] = i16;
}

if (wasSurrogatePair) {
utf16OffsetToUtf8[i16 + 1] = utf16OffsetToUtf8[i16];
i16++;
}
}
}
Expand All @@ -93,13 +89,25 @@ OnigString::~OnigString() {

// Translates an offset into the UTF-8 form of the string to the matching
// offset into its UTF-16 form.
//
// When the string has no multi-byte characters the offset is returned
// untouched. Otherwise the offset is looked up in utf8OffsetToUtf16, with
// out-of-range inputs clamped: negative offsets yield 0 and offsets beyond
// utf8_length_ yield utf16_length_.
int OnigString::ConvertUtf8OffsetToUtf16(int utf8Offset) {
  // No lookup table is consulted on this path; the offset passes through.
  if (!hasMultiByteChars) {
    return utf8Offset;
  }

  // Clamp below the start of the string.
  if (utf8Offset < 0) {
    return 0;
  }

  // Clamp past the end of the string. An offset equal to utf8_length_ is
  // still looked up in the table rather than clamped.
  const bool isPastEnd = static_cast<size_t>(utf8Offset) > utf8_length_;
  if (isPastEnd) {
    return utf16_length_;
  }

  return utf8OffsetToUtf16[utf8Offset];
}

int OnigString::ConvertUtf16OffsetToUtf8(int utf16Offset) {
if (hasMultiByteChars) {
if (utf16Offset < 0) {
return 0;
}
if ((size_t)utf16Offset > utf16_length_) {
return utf8_length_;
}
return utf16OffsetToUtf8[utf16Offset];
}
return utf16Offset;
Expand Down