From 24ef1e67757514db9ee1aeded555d4fb336ca817 Mon Sep 17 00:00:00 2001 From: Brian White Date: Thu, 29 Dec 2016 12:02:31 -0500 Subject: [PATCH] string_decoder: align UTF-8 handling with V8 V8 5.5 changed how invalid characters are handled and it now appears to follow the WHATWG Encoding standard, where all of an invalid character's bytes are replaced by a single replacement character (\ufffd) instead of replacing each invalid byte with separate replacement characters. Example: the byte sequence 0xF0,0xB8,0x41 is decoded as '\ufffdA' in V8 5.5, but is decoded as '\ufffd\ufffdA' in previous versions of V8. PR-URL: https://github.com/nodejs/node/pull/9618 Reviewed-By: Ali Ijaz Sheikh Reviewed-By: Ben Noordhuis --- lib/string_decoder.js | 22 +++++++++++----------- test/parallel/test-string-decoder-end.js | 7 ------- test/parallel/test-string-decoder.js | 15 +++++---------- 3 files changed, 16 insertions(+), 28 deletions(-) diff --git a/lib/string_decoder.js b/lib/string_decoder.js index cb9fbe409fa68e..672ba185cc2821 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -81,7 +81,7 @@ StringDecoder.prototype.fillLast = function(buf) { }; // Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a -// continuation byte. +// continuation byte. If an invalid byte is detected, -2 is returned. function utf8CheckByte(byte) { if (byte <= 0x7F) return 0; @@ -91,7 +91,7 @@ function utf8CheckByte(byte) { return 3; else if (byte >> 3 === 0x1E) return 4; - return -1; + return (byte >> 6 === 0x02 ? -1 : -2); } // Checks at most 3 bytes at the end of a Buffer in order to detect an @@ -107,7 +107,7 @@ function utf8CheckIncomplete(self, buf, i) { self.lastNeed = nb - 1; return nb; } - if (--j < i) + if (--j < i || nb === -2) return 0; nb = utf8CheckByte(buf[j]); if (nb >= 0) { @@ -115,7 +115,7 @@ function utf8CheckIncomplete(self, buf, i) { self.lastNeed = nb - 2; return nb; } - if (--j < i) + if (--j < i || nb === -2) return 0; nb = utf8CheckByte(buf[j]); if (nb >= 0) { @@ -133,7 +133,7 @@ function utf8CheckIncomplete(self, buf, i) { // Validates as many continuation bytes for a multi-byte UTF-8 character as // needed or are available. If we see a non-continuation byte where we expect // one, we "replace" the validated continuation bytes we've seen so far with -// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding +// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding // behavior. The continuation byte check is included three times in the case // where all of the continuation bytes for a character exist in the same buffer. // It is also done this way as a slight performance increase instead of using a @@ -141,17 +141,17 @@ function utf8CheckIncomplete(self, buf, i) { function utf8CheckExtraBytes(self, buf, p) { if ((buf[0] & 0xC0) !== 0x80) { self.lastNeed = 0; - return '\ufffd'.repeat(p); + return '\ufffd'; } if (self.lastNeed > 1 && buf.length > 1) { if ((buf[1] & 0xC0) !== 0x80) { self.lastNeed = 1; - return '\ufffd'.repeat(p + 1); + return '\ufffd'; } if (self.lastNeed > 2 && buf.length > 2) { if ((buf[2] & 0xC0) !== 0x80) { self.lastNeed = 2; - return '\ufffd'.repeat(p + 2); + return '\ufffd'; } } } @@ -184,12 +184,12 @@ function utf8Text(buf, i) { return buf.toString('utf8', i, end); } -// For UTF-8, a replacement character for each buffered byte of a (partial) -// character needs to be added to the output. +// For UTF-8, a replacement character is added when ending on a partial +// character. function utf8End(buf) { const r = (buf && buf.length ? this.write(buf) : ''); if (this.lastNeed) - return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed); + return r + '\ufffd'; return r; } diff --git a/test/parallel/test-string-decoder-end.js b/test/parallel/test-string-decoder-end.js index 6e9eea3193e163..921a7a138ad8c9 100644 --- a/test/parallel/test-string-decoder-end.js +++ b/test/parallel/test-string-decoder-end.js @@ -18,8 +18,6 @@ for (let i = 1; i <= 16; i++) { encodings.forEach(testEncoding); -console.log('ok'); - function testEncoding(encoding) { bufs.forEach((buf) => { testBuf(encoding, buf); @@ -27,8 +25,6 @@ function testEncoding(encoding) { } function testBuf(encoding, buf) { - console.error('# %s', encoding, buf); - // write one byte at a time. let s = new SD(encoding); let res1 = ''; @@ -46,9 +42,6 @@ function testBuf(encoding, buf) { // .toString() on the buffer const res3 = buf.toString(encoding); - console.log('expect=%j', res3); - console.log('res1=%j', res1); - console.log('res2=%j', res2); assert.strictEqual(res1, res3, 'one byte at a time should match toString'); assert.strictEqual(res2, res3, 'all bytes at once should match toString'); } diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index a8fdb999a0923f..4d22d670f63d6d 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -8,8 +8,6 @@ const StringDecoder = require('string_decoder').StringDecoder; let decoder = new StringDecoder(); assert.strictEqual(decoder.encoding, 'utf8'); -process.stdout.write('scanning '); - // UTF-8 test('utf-8', Buffer.from('$', 'utf-8'), '$'); test('utf-8', Buffer.from('¢', 'utf-8'), '¢'); @@ -42,15 +40,15 @@ test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA'); test('utf-8', Buffer.from('E2', 'hex'), '\ufffd'); test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA'); test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338'); -test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA'); +test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffdA'); test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338'); test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0'); test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38'); -test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338'); +test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\u0338'); test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001'); -test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D - '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd'); test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379'); +// CESU-8 of U+1D40D +test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), '\ufffd\ufffd'); // UCS-2 test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc'); @@ -58,8 +56,6 @@ test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc'); // UTF-16LE test('utf16le', Buffer.from('3DD84DDC', 'hex'), '\ud83d\udc4d'); // thumbs up -console.log(' crayon!'); - // Additional UTF-8 tests decoder = new StringDecoder('utf8'); assert.strictEqual(decoder.write(Buffer.from('E1', 'hex')), ''); @@ -67,7 +63,7 @@ assert.strictEqual(decoder.end(), '\ufffd'); decoder = new StringDecoder('utf8'); assert.strictEqual(decoder.write(Buffer.from('E18B', 'hex')), ''); -assert.strictEqual(decoder.end(), '\ufffd\ufffd'); +assert.strictEqual(decoder.end(), '\ufffd'); decoder = new StringDecoder('utf8'); assert.strictEqual(decoder.write(Buffer.from('\ufffd')), '\ufffd'); @@ -131,7 +127,6 @@ function test(encoding, input, expected, singleSequence) { output += decoder.write(input.slice(write[0], write[1])); }); output += decoder.end(); - process.stdout.write('.'); if (output !== expected) { const message = 'Expected "' + unicodeEscape(expected) + '", ' +