From 884ec8e6b1bc1265ad6e37434b0c860606637ae4 Mon Sep 17 00:00:00 2001 From: Mathias Bynens Date: Sat, 24 Jan 2015 15:48:50 +0100 Subject: [PATCH] Avoid matching surrogate halves when lone surrogates are to be matched Fixes #28. --- README.md | 6 +- regenerate.js | 160 +++++++++++++++++++++++++++++++++++-------------- tests/tests.js | 29 +++++---- 3 files changed, 136 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 5f5eb7d..bffb2a3 100644 --- a/README.md +++ b/README.md @@ -160,13 +160,13 @@ regenerate() .addRange(0x000000, 0x10FFFF) // add all Unicode code points .removeRange('A', 'z') // remove all symbols from `A` to `z` .toString(); -// → '[\\0-@\\{-\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF]' +// → '[\\0-@\\{-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]' regenerate() .addRange(0x000000, 0x10FFFF) // add all Unicode code points .removeRange(0x0041, 0x007A) // remove all code points from U+0041 to U+007A .toString(); -// → '[\\0-@\\{-\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF]' +// → '[\\0-@\\{-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]' ``` ### `regenerate.prototype.intersection(codePoints)` @@ -295,7 +295,7 @@ regenerate(codePoints).toString(); ## Support -Regenerate supports at least Chrome 27+, Firefox 3+, Safari 4+, Opera 10+, IE 6+, Node.js v0.10.0+, Narwhal 0.3.2+, RingoJS 0.8+, PhantomJS 1.9.0+, and Rhino 1.7RC4+. +Regenerate supports at least Chrome 27+, Firefox 3+, Safari 4+, Opera 10+, IE 6+, Node.js v0.10.0+, io.js v1.0.0+, Narwhal 0.3.2+, RingoJS 0.8+, PhantomJS 1.9.0+, and Rhino 1.7RC4+. ## Unit tests & code coverage diff --git a/regenerate.js b/regenerate.js index e8aeba6..decbe14 100644 --- a/regenerate.js +++ b/regenerate.js @@ -626,6 +626,7 @@ var splitAtBMP = function(data) { // Iterate over the data per `(start, end)` pair. var loneHighSurrogates = []; + var loneLowSurrogates = []; var bmp = []; var astral = []; var index = 0; @@ -635,51 +636,115 @@ while (index < length) { start = data[index]; end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive. - if (start <= 0xFFFF && end <= 0xFFFF) { - // Both `start` and `end` are within the BMP range. - if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) { - // `start` lies in the high surrogates range. - if (end <= HIGH_SURROGATE_MAX) { - loneHighSurrogates.push(start, end + 1); - } else { - loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1); - bmp.push(HIGH_SURROGATE_MAX + 1, end + 1); - } - } else if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) { + + if (start < HIGH_SURROGATE_MIN) { + + // The range starts and ends before the high surrogate range. + // E.g. (0, 0x10). + if (end < HIGH_SURROGATE_MIN) { + bmp.push(start, end + 1); + } + + // The range starts before the high surrogate range and ends within it. + // E.g. (0, 0xD855). + if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) { bmp.push(start, HIGH_SURROGATE_MIN); loneHighSurrogates.push(HIGH_SURROGATE_MIN, end + 1); - } else if (start < HIGH_SURROGATE_MIN && end > HIGH_SURROGATE_MAX) { - bmp.push(start, HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1, end + 1); + } + + // The range starts before the high surrogate range and ends in the low + // surrogate range. E.g. (0, 0xDCFF). + if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) { + bmp.push(start, HIGH_SURROGATE_MIN); loneHighSurrogates.push(HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1); - } else { - bmp.push(start, end + 1); + loneLowSurrogates.push(LOW_SURROGATE_MIN, end + 1); } - } - else if (start <= 0xFFFF && end > 0xFFFF) { - // `start` is in the BMP range, but `end` lies within the astral range. - if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) { - // `start` lies in the high surrogates range. Since `end` is astral, - // we can just add all high surrogates starting from `start` to - // `loneHighSurrogates`, any other BMP code points to `bmp`, and the - // remaining symbols to `astral`. - loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1); - bmp.push(HIGH_SURROGATE_MAX + 1, 0xFFFF + 1); - } else if (start < HIGH_SURROGATE_MIN) { - bmp.push(start, HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1, 0xFFFF + 1); + + // The range starts before the high surrogate range and ends after the + // low surrogate range. E.g. (0, 0x10FFFF). + if (end > LOW_SURROGATE_MAX) { + bmp.push(start, HIGH_SURROGATE_MIN); loneHighSurrogates.push(HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1); - } else { // `start > HIGH_SURROGATE_MAX` holds true. + loneLowSurrogates.push(LOW_SURROGATE_MIN, LOW_SURROGATE_MAX + 1); + if (end <= 0xFFFF) { + bmp.push(LOW_SURROGATE_MAX + 1, end + 1); + } else { + bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1); + astral.push(0xFFFF + 1, end + 1); + } + } + + } else if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) { + + // The range starts and ends in the high surrogate range. + // E.g. (0xD855, 0xD866). + if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) { + loneHighSurrogates.push(start, end + 1); + } + + // The range starts in the high surrogate range and ends in the low + // surrogate range. E.g. (0xD855, 0xDCFF). + if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) { + loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1); + loneLowSurrogates.push(LOW_SURROGATE_MIN, end + 1); + } + + // The range starts in the high surrogate range and ends after the low + // surrogate range. E.g. (0xD855, 0x10FFFF). + if (end > LOW_SURROGATE_MAX) { + loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1); + loneLowSurrogates.push(LOW_SURROGATE_MIN, LOW_SURROGATE_MAX + 1); + if (end <= 0xFFFF) { + bmp.push(LOW_SURROGATE_MAX + 1, end + 1); + } else { + bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1); + astral.push(0xFFFF + 1, end + 1); + } + } + + } else if (start >= LOW_SURROGATE_MIN && start <= LOW_SURROGATE_MAX) { + + // The range starts and ends in the low surrogate range. + // E.g. (0xDCFF, 0xDDFF). + if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) { + loneLowSurrogates.push(start, end + 1); + } + + // The range starts in the low surrogate range and ends after the low + // surrogate range. E.g. (0xDCFF, 0x10FFFF). + if (end > LOW_SURROGATE_MAX) { + loneLowSurrogates.push(start, LOW_SURROGATE_MAX + 1); + if (end <= 0xFFFF) { + bmp.push(LOW_SURROGATE_MAX + 1, end + 1); + } else { + bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1); + astral.push(0xFFFF + 1, end + 1); + } + } + + } else if (start > LOW_SURROGATE_MAX && start <= 0xFFFF) { + + // The range starts and ends after the low surrogate range. + // E.g. (0xFFAA, 0x10FFFF). + if (end <= 0xFFFF) { + bmp.push(start, end + 1); + } else { bmp.push(start, 0xFFFF + 1); + astral.push(0xFFFF + 1, end + 1); } - astral.push(0xFFFF + 1, end + 1); - } - else { - // Both `start` and `end` are in the astral range. + + } else { + + // The range starts and ends in the astral range. astral.push(start, end + 1); + } + index += 2; } return { 'loneHighSurrogates': loneHighSurrogates, + 'loneLowSurrogates': loneLowSurrogates, 'bmp': bmp, 'astral': astral }; @@ -898,19 +963,15 @@ var parts = splitAtBMP(data); var loneHighSurrogates = parts.loneHighSurrogates; + var loneLowSurrogates = parts.loneLowSurrogates; var bmp = parts.bmp; var astral = parts.astral; var hasAstral = !dataIsEmpty(parts.astral); - var hasLoneSurrogates = !dataIsEmpty(loneHighSurrogates); + var hasLoneHighSurrogates = !dataIsEmpty(loneHighSurrogates); + var hasLoneLowSurrogates = !dataIsEmpty(loneLowSurrogates); var surrogateMappings = surrogateSet(astral); - // If we’re not dealing with any astral symbols, there’s no need to move - // individual code points that are high surrogates to the end of the regex. - if (!hasAstral && hasLoneSurrogates) { - bmp = dataAddData(bmp, loneHighSurrogates); - } - if (!dataIsEmpty(bmp)) { // The data set contains BMP code points that are not high surrogates // needed for astral code points in the set. @@ -921,11 +982,20 @@ // based on their surrogate pairs. result.push(createSurrogateCharacterClasses(surrogateMappings)); } - if (hasAstral && hasLoneSurrogates) { - // The data set contains lone high surrogates; append these. Lone high - // surrogates must go at the end of the regex if astral symbols are to be - // matched as well. - result.push(createBMPCharacterClasses(loneHighSurrogates)); + // https://gist.github.com/mathiasbynens/bbe7f870208abcfec860 + if (hasLoneHighSurrogates) { + result.push( + createBMPCharacterClasses(loneHighSurrogates) + + // Make sure the high surrogates aren’t part of a surrogate pair. + '(?![\\uDC00-\\uDFFF])' + ); + } + if (hasLoneLowSurrogates) { + result.push( + // Make sure the low surrogates aren’t part of a surrogate pair. + '(?:[^\\uD800-\\uDBFF]|^)' + + createBMPCharacterClasses(loneLowSurrogates) + ); } return result.join('|'); }; @@ -1066,7 +1136,7 @@ return regenerate; }); } else if (freeExports && !freeExports.nodeType) { - if (freeModule) { // in Node.js or RingoJS v0.8.0+ + if (freeModule) { // in Node.js, io.js, or RingoJS v0.8.0+ freeModule.exports = regenerate; } else { // in Narwhal or RingoJS v0.7.0- freeExports.regenerate = regenerate; diff --git a/tests/tests.js b/tests/tests.js index 115e8f0..7ced955 100644 --- a/tests/tests.js +++ b/tests/tests.js @@ -358,12 +358,12 @@ ); equal( regenerate(0xD800, 0xD801, 0xD802, 0xD803, 0xDBFF).toString(), - '[\\uD800-\\uD803\\uDBFF]', + '[\\uD800-\\uD803\\uDBFF](?![\\uDC00-\\uDFFF])', 'Unmatched high surrogates' ); equal( regenerate(0xDC00, 0xDC01, 0xDC02, 0xDC03, 0xDC04, 0xDC05, 0xDFFB, 0xDFFD, 0xDFFE, 0xDFFF).toString(), - '[\\uDC00-\\uDC05\\uDFFB\\uDFFD-\\uDFFF]', + '(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDC05\\uDFFB\\uDFFD-\\uDFFF]', 'Unmatched low surrogates' ); equal( @@ -388,7 +388,7 @@ ); equal( regenerate().addRange(0x0, 0xFFFF).toString(), - '[\\0-\\uFFFF]', + '[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'All BMP code points' ); equal( @@ -398,7 +398,7 @@ ); equal( regenerate().addRange(0x0, 0x10FFFF).toString(), - '[\\0-\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF]', + '[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'All Unicode code points' ); raises( @@ -501,12 +501,12 @@ ); equal( regenerate('\uD800', '\uD801', '\uD802', '\uD803', '\uDBFF').toString(), - '[\\uD800-\\uD803\\uDBFF]', + '[\\uD800-\\uD803\\uDBFF](?![\\uDC00-\\uDFFF])', 'Unmatched high surrogates, using symbols as input' ); equal( regenerate('\uDC00', '\uDC01', '\uDC02', '\uDC03', '\uDC04', '\uDC05', '\uDFFB', '\uDFFD', '\uDFFE', '\uDFFF').toString(), - '[\\uDC00-\\uDC05\\uDFFB\\uDFFD-\\uDFFF]', + '(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDC05\\uDFFB\\uDFFD-\\uDFFF]', 'Unmatched low surrogates, using symbols as input' ); equal( @@ -620,12 +620,12 @@ ); equal( regenerate().addRange(0xD800, 0xDBFF).addRange(0xDC00, 0xDFFF).add(0xFFFF).toString(), - '[\\uD800-\\uDFFF\\uFFFF]', + '\\uFFFF|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'BMP-only symbols incl. lone surrogates but with higher code points too' ); equal( regenerate().addRange(0xD800, 0xDBFF).addRange(0xDC00, 0xDFFF).add(0xFFFF, 0x1D306).toString(), - '[\\uDC00-\\uDFFF\\uFFFF]|\\uD834\\uDF06|[\\uD800-\\uDBFF]', + '\\uFFFF|\\uD834\\uDF06|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'BMP-only symbols incl. lone surrogates but with higher code points and an astral code point too' ); equal( @@ -655,17 +655,17 @@ ); equal( regenerate().addRange(0x20, 0xD900).toString(), - '[ -\\uD900]', + '[ -\\uD7FF]|[\\uD800-\\uD900](?![\\uDC00-\\uDFFF])', 'adding a range that starts in ASCII and ends in the high surrogate range' ); equal( regenerate().addRange(0x20, 0x1D306).toString(), - '[ -\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uD833][\\uDC00-\\uDFFF]|\\uD834[\\uDC00-\\uDF06]|[\\uD800-\\uDBFF]', + '[ -\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD833][\\uDC00-\\uDFFF]|\\uD834[\\uDC00-\\uDF06]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'adding a range that starts in ASCII and ends in the astral range' ); equal( regenerate().addRange(0xD900, 0x1D306).toString(), - '[\\uDC00-\\uFFFF]|[\\uD800-\\uD833][\\uDC00-\\uDFFF]|\\uD834[\\uDC00-\\uDF06]|[\\uD900-\\uDBFF]', + '[\\uE000-\\uFFFF]|[\\uD800-\\uD833][\\uDC00-\\uDFFF]|\\uD834[\\uDC00-\\uDF06]|[\\uD900-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'adding a range that starts in the high surrogate range and ends in the astral range' ); equal( @@ -678,6 +678,13 @@ '[\\uFFEF-\\uFFF8\\uFFFE\\uFFFF]|\\uD800[\\uDC00-\\uDEFF]', 'mixed BMP + astral code points' ); + equal( + '\uD834\uDF06'.match( + RegExp('(' + regenerate().addRange(0xD800, 0xDBFF).addRange(0xDC00, 0xDFFF).toString() + ')') + ), + null, + 'https://github.com/mathiasbynens/regenerate/issues/28' + ); equal( regenerate.prototype.valueOf, regenerate.prototype.toArray,