-
Notifications
You must be signed in to change notification settings - Fork 30k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
string_decoder: fix bad utf8 character handling #7310
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,8 +25,10 @@ function StringDecoder(encoding) { | |
case 'utf16le': | ||
this.text = utf16Text; | ||
this.end = utf16End; | ||
// fall through | ||
nb = 4; | ||
break; | ||
case 'utf8': | ||
this.fillLast = utf8FillLast; | ||
nb = 4; | ||
break; | ||
case 'base64': | ||
|
@@ -68,7 +70,7 @@ StringDecoder.prototype.end = utf8End; | |
// Returns only complete characters in a Buffer | ||
StringDecoder.prototype.text = utf8Text; | ||
|
||
// Attempts to complete a partial character using bytes from a Buffer | ||
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer | ||
StringDecoder.prototype.fillLast = function(buf) { | ||
if (this.lastNeed <= buf.length) { | ||
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); | ||
|
@@ -92,38 +94,83 @@ function utf8CheckByte(byte) { | |
return -1; | ||
} | ||
|
||
// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8 | ||
// character, returning the total number of bytes needed to complete the partial | ||
// character (if applicable). | ||
// Checks at most 3 bytes at the end of a Buffer in order to detect an | ||
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4) | ||
// needed to complete the UTF-8 character (if applicable) are returned. | ||
function utf8CheckIncomplete(self, buf, i) { | ||
var j = buf.length - 1; | ||
if (j < i) | ||
return 0; | ||
var nb = utf8CheckByte(buf[j--]); | ||
var nb = utf8CheckByte(buf[j]); | ||
if (nb >= 0) { | ||
if (nb > 0) | ||
self.lastNeed = nb + 1 - (buf.length - j); | ||
self.lastNeed = nb - 1; | ||
return nb; | ||
} | ||
if (j < i) | ||
if (--j < i) | ||
return 0; | ||
nb = utf8CheckByte(buf[j--]); | ||
nb = utf8CheckByte(buf[j]); | ||
if (nb >= 0) { | ||
if (nb > 0) | ||
self.lastNeed = nb + 1 - (buf.length - j); | ||
self.lastNeed = nb - 2; | ||
return nb; | ||
} | ||
if (j < i) | ||
if (--j < i) | ||
return 0; | ||
nb = utf8CheckByte(buf[j--]); | ||
nb = utf8CheckByte(buf[j]); | ||
if (nb >= 0) { | ||
if (nb > 0) | ||
self.lastNeed = nb + 1 - (buf.length - j); | ||
if (nb > 0) { | ||
if (nb === 2) | ||
nb = 0; | ||
else | ||
self.lastNeed = nb - 3; | ||
} | ||
return nb; | ||
} | ||
return 0; | ||
} | ||
|
||
// Validates as many continuation bytes for a multi-byte UTF-8 character as | ||
// needed or are available. If we see a non-continuation byte where we expect | ||
// one, we "replace" the validated continuation bytes we've seen so far with | ||
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding | ||
// behavior. The continuation byte check is included three times in the case | ||
// where all of the continuation bytes for a character exist in the same buffer. | ||
// It is also done this way as a slight performance increase instead of using a | ||
// loop. | ||
function utf8CheckExtraBytes(self, buf, p) { | ||
if ((buf[0] & 0xC0) !== 0x80) { | ||
self.lastNeed = 0; | ||
return '\ufffd'.repeat(p); | ||
} | ||
if (self.lastNeed > 1 && buf.length > 1) { | ||
if ((buf[1] & 0xC0) !== 0x80) { | ||
self.lastNeed = 1; | ||
return '\ufffd'.repeat(p + 1); | ||
} | ||
if (self.lastNeed > 2 && buf.length > 2) { | ||
if ((buf[2] & 0xC0) !== 0x80) { | ||
self.lastNeed = 2; | ||
return '\ufffd'.repeat(p + 2); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the reason to repeat it two more times? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because of the two preceding (valid) continuation bytes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, but it is still not obvious to me. |
||
} | ||
} | ||
} | ||
} | ||
|
||
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer. | ||
function utf8FillLast(buf) { | ||
const p = this.lastTotal - this.lastNeed; | ||
var r = utf8CheckExtraBytes(this, buf, p); | ||
if (r !== undefined) | ||
return r; | ||
if (this.lastNeed <= buf.length) { | ||
buf.copy(this.lastChar, p, 0, this.lastNeed); | ||
return this.lastChar.toString(this.encoding, 0, this.lastTotal); | ||
} | ||
buf.copy(this.lastChar, p, 0, buf.length); | ||
this.lastNeed -= buf.length; | ||
} | ||
|
||
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a | ||
// partial character, the character's bytes are buffered until the required | ||
// number of bytes are available. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,30 @@ test( | |
'\u02e4\u0064\u12e4\u0030\u3045' | ||
); | ||
|
||
// Some invalid input, known to have caused trouble with chunking | ||
// in https://github.com/nodejs/node/pull/7310#issuecomment-226445923 | ||
// 00: |00000000 ASCII | ||
// 41: |01000001 ASCII | ||
// B8: 10|111000 continuation | ||
// CC: 110|01100 two-byte head | ||
// E2: 1110|0010 three-byte head | ||
// F0: 11110|000 four-byte head | ||
// F1: 11110|001'another four-byte head | ||
// FB: 111110|11 "five-byte head", not UTF-8 | ||
test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA'); | ||
test('utf-8', Buffer.from('E2', 'hex'), '\ufffd'); | ||
test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA'); | ||
test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338'); | ||
test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA'); | ||
test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338'); | ||
test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0'); | ||
test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38'); | ||
test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338'); | ||
test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001'); | ||
test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D | ||
'\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd'); | ||
test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379'); | ||
|
||
// UCS-2 | ||
test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc'); | ||
|
||
|
@@ -55,7 +79,12 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')), | |
assert.strictEqual(decoder.end(), ''); | ||
|
||
decoder = new StringDecoder('utf8'); | ||
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd'); | ||
assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd'); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why this change? And if needed, maybe leave the old test too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's just making capitalization consistent with all of the other hex buffers. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd be inclined to leave it just to make sure that lower-case is tested. (I'm sure that under the hood, While I have an opinion, I don't feel strongly enough about this to protest or anything. If you do feel strongly, feel free to leave it as you have it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand. The hex string is being passed to the Buffer creation function ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mscdex Argh! You're right, of course. Ignore me. |
||
assert.strictEqual(decoder.end(), '\ufffd'); | ||
|
||
decoder = new StringDecoder('utf8'); | ||
assert.strictEqual(decoder.write(Buffer.from('F1', 'hex')), ''); | ||
assert.strictEqual(decoder.write(Buffer.from('41F2', 'hex')), '\ufffdA'); | ||
assert.strictEqual(decoder.end(), '\ufffd'); | ||
|
||
|
||
|
@@ -93,6 +122,7 @@ function test(encoding, input, expected, singleSequence) { | |
sequence.forEach(function(write) { | ||
output += decoder.write(input.slice(write[0], write[1])); | ||
}); | ||
output += decoder.end(); | ||
process.stdout.write('.'); | ||
if (output !== expected) { | ||
var message = | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Couldn't it still go negative here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, because if execution has reached here,
nb
would have to be a 2, 3, or 4-byte character indicator. Soself.lastNeed
would range from 0 to 2.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh right,
nb
can't be 1.