diff --git a/README.md b/README.md index 5f5eb7d..bffb2a3 100644 --- a/README.md +++ b/README.md @@ -160,13 +160,13 @@ regenerate() .addRange(0x000000, 0x10FFFF) // add all Unicode code points .removeRange('A', 'z') // remove all symbols from `A` to `z` .toString(); -// → '[\\0-@\\{-\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF]' +// → '[\\0-@\\{-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]' regenerate() .addRange(0x000000, 0x10FFFF) // add all Unicode code points .removeRange(0x0041, 0x007A) // remove all code points from U+0041 to U+007A .toString(); -// → '[\\0-@\\{-\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF]' +// → '[\\0-@\\{-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]' ``` ### `regenerate.prototype.intersection(codePoints)` @@ -295,7 +295,7 @@ regenerate(codePoints).toString(); ## Support -Regenerate supports at least Chrome 27+, Firefox 3+, Safari 4+, Opera 10+, IE 6+, Node.js v0.10.0+, Narwhal 0.3.2+, RingoJS 0.8+, PhantomJS 1.9.0+, and Rhino 1.7RC4+. +Regenerate supports at least Chrome 27+, Firefox 3+, Safari 4+, Opera 10+, IE 6+, Node.js v0.10.0+, io.js v1.0.0+, Narwhal 0.3.2+, RingoJS 0.8+, PhantomJS 1.9.0+, and Rhino 1.7RC4+. ## Unit tests & code coverage diff --git a/regenerate.js b/regenerate.js index e8aeba6..d4dc94c 100644 --- a/regenerate.js +++ b/regenerate.js @@ -626,6 +626,7 @@ var splitAtBMP = function(data) { // Iterate over the data per `(start, end)` pair. var loneHighSurrogates = []; + var loneLowSurrogates = []; var bmp = []; var astral = []; var index = 0; @@ -635,51 +636,105 @@ while (index < length) { start = data[index]; end = data[index + 1] - 1; // Note: the `- 1` makes `end` inclusive. - if (start <= 0xFFFF && end <= 0xFFFF) { - // Both `start` and `end` are within the BMP range. 
- if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) { - // `start` lies in the high surrogates range. - if (end <= HIGH_SURROGATE_MAX) { - loneHighSurrogates.push(start, end + 1); - } else { - loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1); - bmp.push(HIGH_SURROGATE_MAX + 1, end + 1); - } - } else if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) { + + if (start < HIGH_SURROGATE_MIN) { + + // starts before high surr & ends before high surr e.g. (0, 0x10) + if (end < HIGH_SURROGATE_MIN) { + bmp.push(start, end + 1); + } + + // starts before high surr & ends in high surr range e.g. (0, 0xD855) + if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) { bmp.push(start, HIGH_SURROGATE_MIN); loneHighSurrogates.push(HIGH_SURROGATE_MIN, end + 1); - } else if (start < HIGH_SURROGATE_MIN && end > HIGH_SURROGATE_MAX) { - bmp.push(start, HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1, end + 1); + } + + // starts before high surr & ends in low surr range e.g. (0, 0xDCFF) + if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) { + bmp.push(start, HIGH_SURROGATE_MIN); loneHighSurrogates.push(HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1); - } else { - bmp.push(start, end + 1); + loneLowSurrogates.push(LOW_SURROGATE_MIN, end + 1); } - } - else if (start <= 0xFFFF && end > 0xFFFF) { - // `start` is in the BMP range, but `end` lies within the astral range. - if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) { - // `start` lies in the high surrogates range. Since `end` is astral, - // we can just add all high surrogates starting from `start` to - // `loneHighSurrogates`, any other BMP code points to `bmp`, and the - // remaining symbols to `astral`. - loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1); - bmp.push(HIGH_SURROGATE_MAX + 1, 0xFFFF + 1); - } else if (start < HIGH_SURROGATE_MIN) { - bmp.push(start, HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1, 0xFFFF + 1); + + // starts before high surr & ends after low surr e.g. 
(0, 0x10FFFF) + if (end > LOW_SURROGATE_MAX) { + bmp.push(start, HIGH_SURROGATE_MIN); loneHighSurrogates.push(HIGH_SURROGATE_MIN, HIGH_SURROGATE_MAX + 1); - } else { // `start > HIGH_SURROGATE_MAX` holds true. + loneLowSurrogates.push(LOW_SURROGATE_MIN, LOW_SURROGATE_MAX + 1); + if (end <= 0xFFFF) { + bmp.push(LOW_SURROGATE_MAX + 1, end + 1); + } else { + bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1); + astral.push(0xFFFF + 1, end + 1); + } + } + + } else if (start >= HIGH_SURROGATE_MIN && start <= HIGH_SURROGATE_MAX) { + + // starts in high surr range & ends in high surr range e.g. (0xD855, 0xD866) + if (end >= HIGH_SURROGATE_MIN && end <= HIGH_SURROGATE_MAX) { + loneHighSurrogates.push(start, end + 1); + } + + // starts in high surr range & ends in low surr range e.g. (0xD855, 0xDCFF) + if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) { + loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1); + loneLowSurrogates.push(LOW_SURROGATE_MIN, end + 1); + } + + // starts in high surr range & ends after low surr e.g. (0xD855, 0x10FFFF) + if (end > LOW_SURROGATE_MAX) { + loneHighSurrogates.push(start, HIGH_SURROGATE_MAX + 1); + loneLowSurrogates.push(LOW_SURROGATE_MIN, LOW_SURROGATE_MAX + 1); + if (end <= 0xFFFF) { + bmp.push(LOW_SURROGATE_MAX + 1, end + 1); + } else { + bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1); + astral.push(0xFFFF + 1, end + 1); + } + } + + } else if (start >= LOW_SURROGATE_MIN && start <= LOW_SURROGATE_MAX) { + + // starts in low surr range & ends in low surr range e.g. (0xDCFF, 0xDDFF) + if (end >= LOW_SURROGATE_MIN && end <= LOW_SURROGATE_MAX) { + loneLowSurrogates.push(start, end + 1); + } + + // starts in low surr range & ends after low surr in BMP e.g. 
(0xDCFF, 0xFFFF) + if (end > LOW_SURROGATE_MAX) { + loneLowSurrogates.push(start, LOW_SURROGATE_MAX + 1); + if (end <= 0xFFFF) { + bmp.push(LOW_SURROGATE_MAX + 1, end + 1); + } else { + bmp.push(LOW_SURROGATE_MAX + 1, 0xFFFF + 1); + astral.push(0xFFFF + 1, end + 1); + } + } + + } else if (start > LOW_SURROGATE_MAX && start <= 0xFFFF) { + + // starts after low surr in BMP & ends after low surr e.g. (0xFFAA, 0x10FFFF) + if (end <= 0xFFFF) { + bmp.push(start, end + 1); + } else { bmp.push(start, 0xFFFF + 1); + astral.push(0xFFFF + 1, end + 1); } - astral.push(0xFFFF + 1, end + 1); - } - else { + + } else { + // Both `start` and `end` are in the astral range. astral.push(start, end + 1); + } + index += 2; } return { 'loneHighSurrogates': loneHighSurrogates, + 'loneLowSurrogates': loneLowSurrogates, 'bmp': bmp, 'astral': astral }; @@ -898,19 +953,15 @@ var parts = splitAtBMP(data); var loneHighSurrogates = parts.loneHighSurrogates; + var loneLowSurrogates = parts.loneLowSurrogates; var bmp = parts.bmp; var astral = parts.astral; var hasAstral = !dataIsEmpty(parts.astral); - var hasLoneSurrogates = !dataIsEmpty(loneHighSurrogates); + var hasLoneHighSurrogates = !dataIsEmpty(loneHighSurrogates); + var hasLoneLowSurrogates = !dataIsEmpty(loneLowSurrogates); var surrogateMappings = surrogateSet(astral); - // If we’re not dealing with any astral symbols, there’s no need to move - // individual code points that are high surrogates to the end of the regex. - if (!hasAstral && hasLoneSurrogates) { - bmp = dataAddData(bmp, loneHighSurrogates); - } - if (!dataIsEmpty(bmp)) { // The data set contains BMP code points that are not high surrogates // needed for astral code points in the set. @@ -921,11 +972,20 @@ // based on their surrogate pairs. result.push(createSurrogateCharacterClasses(surrogateMappings)); } - if (hasAstral && hasLoneSurrogates) { - // The data set contains lone high surrogates; append these. 
Lone high - // surrogates must go at the end of the regex if astral symbols are to be - // matched as well. - result.push(createBMPCharacterClasses(loneHighSurrogates)); + // https://gist.github.com/mathiasbynens/bbe7f870208abcfec860 + if (hasLoneHighSurrogates) { + result.push( + createBMPCharacterClasses(loneHighSurrogates) + + // Make sure the high surrogates aren’t part of a surrogate pair. + '(?![\\uDC00-\\uDFFF])' + ); + } + if (hasLoneLowSurrogates) { + result.push( + // Make sure the low surrogates aren’t part of a surrogate pair. + '(?:[^\\uD800-\\uDBFF]|^)' + + createBMPCharacterClasses(loneLowSurrogates) + ); } return result.join('|'); }; @@ -1066,7 +1126,7 @@ return regenerate; }); } else if (freeExports && !freeExports.nodeType) { - if (freeModule) { // in Node.js or RingoJS v0.8.0+ + if (freeModule) { // in Node.js, io.js, or RingoJS v0.8.0+ freeModule.exports = regenerate; } else { // in Narwhal or RingoJS v0.7.0- freeExports.regenerate = regenerate; diff --git a/tests/tests.js b/tests/tests.js index 115e8f0..55dfb8e 100644 --- a/tests/tests.js +++ b/tests/tests.js @@ -358,12 +358,12 @@ ); equal( regenerate(0xD800, 0xD801, 0xD802, 0xD803, 0xDBFF).toString(), - '[\\uD800-\\uD803\\uDBFF]', + '[\\uD800-\\uD803\\uDBFF](?![\\uDC00-\\uDFFF])', 'Unmatched high surrogates' ); equal( regenerate(0xDC00, 0xDC01, 0xDC02, 0xDC03, 0xDC04, 0xDC05, 0xDFFB, 0xDFFD, 0xDFFE, 0xDFFF).toString(), - '[\\uDC00-\\uDC05\\uDFFB\\uDFFD-\\uDFFF]', + '(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDC05\\uDFFB\\uDFFD-\\uDFFF]', 'Unmatched low surrogates' ); equal( @@ -388,7 +388,7 @@ ); equal( regenerate().addRange(0x0, 0xFFFF).toString(), - '[\\0-\\uFFFF]', + '[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'All BMP code points' ); equal( @@ -398,7 +398,7 @@ ); equal( regenerate().addRange(0x0, 0x10FFFF).toString(), - '[\\0-\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF]', + 
'[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'All Unicode code points' ); raises( @@ -501,12 +501,12 @@ ); equal( regenerate('\uD800', '\uD801', '\uD802', '\uD803', '\uDBFF').toString(), - '[\\uD800-\\uD803\\uDBFF]', + '[\\uD800-\\uD803\\uDBFF](?![\\uDC00-\\uDFFF])', 'Unmatched high surrogates, using symbols as input' ); equal( regenerate('\uDC00', '\uDC01', '\uDC02', '\uDC03', '\uDC04', '\uDC05', '\uDFFB', '\uDFFD', '\uDFFE', '\uDFFF').toString(), - '[\\uDC00-\\uDC05\\uDFFB\\uDFFD-\\uDFFF]', + '(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDC05\\uDFFB\\uDFFD-\\uDFFF]', 'Unmatched low surrogates, using symbols as input' ); equal( @@ -620,12 +620,12 @@ ); equal( regenerate().addRange(0xD800, 0xDBFF).addRange(0xDC00, 0xDFFF).add(0xFFFF).toString(), - '[\\uD800-\\uDFFF\\uFFFF]', + '\\uFFFF|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'BMP-only symbols incl. lone surrogates but with higher code points too' ); equal( regenerate().addRange(0xD800, 0xDBFF).addRange(0xDC00, 0xDFFF).add(0xFFFF, 0x1D306).toString(), - '[\\uDC00-\\uDFFF\\uFFFF]|\\uD834\\uDF06|[\\uD800-\\uDBFF]', + '\\uFFFF|\\uD834\\uDF06|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'BMP-only symbols incl. 
lone surrogates but with higher code points and an astral code point too' ); equal( @@ -655,17 +655,17 @@ ); equal( regenerate().addRange(0x20, 0xD900).toString(), - '[ -\\uD900]', + '[ -\\uD7FF]|[\\uD800-\\uD900](?![\\uDC00-\\uDFFF])', 'adding a range that starts in ASCII and ends in the high surrogate range' ); equal( regenerate().addRange(0x20, 0x1D306).toString(), - '[ -\\uD7FF\\uDC00-\\uFFFF]|[\\uD800-\\uD833][\\uDC00-\\uDFFF]|\\uD834[\\uDC00-\\uDF06]|[\\uD800-\\uDBFF]', + '[ -\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD833][\\uDC00-\\uDFFF]|\\uD834[\\uDC00-\\uDF06]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'adding a range that starts in ASCII and ends in the astral range' ); equal( regenerate().addRange(0xD900, 0x1D306).toString(), - '[\\uDC00-\\uFFFF]|[\\uD800-\\uD833][\\uDC00-\\uDFFF]|\\uD834[\\uDC00-\\uDF06]|[\\uD900-\\uDBFF]', + '[\\uE000-\\uFFFF]|[\\uD800-\\uD833][\\uDC00-\\uDFFF]|\\uD834[\\uDC00-\\uDF06]|[\\uD900-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]', 'adding a range that starts in the high surrogate range and ends in the astral range' ); equal(