-
Notifications
You must be signed in to change notification settings - Fork 30k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
string_decoder: support Uint8Array input to methods #11613
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,52 +24,121 @@ | |
const Buffer = require('buffer').Buffer; | ||
const internalUtil = require('internal/util'); | ||
const isEncoding = Buffer[internalUtil.kIsEncodingSymbol]; | ||
const { | ||
copy, latin1Slice, asciiSlice, hexSlice, utf8Slice, ucs2Slice, base64Slice | ||
} = process.binding('buffer'); | ||
|
||
// Do not cache `Buffer.isEncoding` when checking encoding names as some | ||
// modules monkey-patch it to support additional encodings | ||
function normalizeEncoding(enc) { | ||
const nenc = internalUtil.normalizeEncoding(enc); | ||
if (typeof nenc !== 'string' && | ||
(Buffer.isEncoding === isEncoding || !Buffer.isEncoding(enc))) | ||
throw new Error(`Unknown encoding: ${enc}`); | ||
return nenc || enc; | ||
// Table of the encodings StringDecoder handles natively, indexed by the value
// returned from translateEncoding(). Each entry is a three-element tuple:
//   [0] normalized encoding name, stored by the constructor as `this.encoding`
//   [1] size in bytes of the `lastChar` buffer the constructor allocates;
//       0 means the constructor returns without allocating it
//   [2] initializer called with the new decoder, which installs the
//       encoding-specific methods on that instance
const encodings = [
  // 0
  [
    'utf8',
    4,
    (self) => { self.fillLast = utf8FillLast; }
  ],
  // 1
  [
    'utf16le',
    4,
    (self) => { self.text = utf16Text; self.end = utf16End; }
  ],
  // 2
  [
    'latin1',
    0,
    (self) => { self.text = latin1Text; self.end = simpleEnd; }
  ],
  // 3
  [
    'base64',
    3,
    (self) => { self.text = base64Text; self.end = base64End; }
  ],
  // 4
  [
    'ascii',
    0,
    (self) => { self.text = asciiText; self.end = simpleEnd; }
  ],
  // 5
  [
    'hex',
    0,
    (self) => { self.text = hexText; self.end = simpleEnd; }
  ]
];
|
||
// Maps a user-supplied encoding name to its index in the `encodings` table.
//
// A falsy `enc` (undefined, null, '', 0, ...) selects the default, utf8
// (index 0). Any other value is coerced to a string and matched
// case-insensitively against the supported names and aliases:
//   0: utf8, utf-8
//   1: ucs2, ucs-2, utf16le, utf-16le
//   2: latin1, binary
//   3: base64
//   4: ascii
//   5: hex
// Returns -1 when the name is not recognized.
//
// Candidates are bucketed by string length, and the exact spelling is compared
// before falling back to toLowerCase(), so an already-lowercase name is
// resolved without creating a lowercased copy.
function translateEncoding(enc) {
  if (!enc) return 0;
  // Coerce without reassigning the parameter.
  let name = enc + '';
  switch (name.length) {
    case 3:
      return (name === 'hex' || name.toLowerCase() === 'hex' ? 5 : -1);
    case 4:
      if (name === 'utf8') return 0;
      if (name === 'ucs2') return 1;
      name = name.toLowerCase();
      if (name === 'utf8') return 0;
      if (name === 'ucs2') return 1;
      break;
    case 5:
      if (name === 'utf-8') return 0;
      if (name === 'ascii') return 4;
      if (name === 'ucs-2') return 1;
      name = name.toLowerCase();
      if (name === 'utf-8') return 0;
      if (name === 'ascii') return 4;
      if (name === 'ucs-2') return 1;
      break;
    case 6:
      if (name === 'latin1') return 2;
      if (name === 'binary') return 2;
      if (name === 'base64') return 3;
      name = name.toLowerCase();
      if (name === 'latin1') return 2;
      if (name === 'binary') return 2;
      if (name === 'base64') return 3;
      break;
    case 7:
      return (name === 'utf16le' || name.toLowerCase() === 'utf16le' ? 1 : -1);
    case 8:
      return (name === 'utf-16le' || name.toLowerCase() === 'utf-16le' ?
        1 : -1);
  }
  return -1;
}
|
||
// StringDecoder provides an interface for efficiently splitting a series of | ||
// buffers into a series of JS strings without breaking apart multi-byte | ||
// characters. | ||
// Do not cache `Buffer.isEncoding` when checking encoding names as some | ||
// modules monkey-patch it to support additional encodings | ||
exports.StringDecoder = StringDecoder; | ||
function StringDecoder(encoding) { | ||
this.encoding = normalizeEncoding(encoding); | ||
var nb; | ||
switch (this.encoding) { | ||
case 'utf16le': | ||
this.text = utf16Text; | ||
this.end = utf16End; | ||
nb = 4; | ||
break; | ||
case 'utf8': | ||
this.fillLast = utf8FillLast; | ||
nb = 4; | ||
break; | ||
case 'base64': | ||
this.text = base64Text; | ||
this.end = base64End; | ||
nb = 3; | ||
break; | ||
default: | ||
this.write = simpleWrite; | ||
this.end = simpleEnd; | ||
return; | ||
function StringDecoder(enc) { | ||
var info; | ||
const encIdx = translateEncoding(enc); | ||
if (encIdx === -1) { | ||
if (Buffer.isEncoding === isEncoding || !Buffer.isEncoding(enc)) | ||
throw new Error(`Unknown encoding: ${enc}`); | ||
this.encoding = enc; | ||
return; | ||
} else { | ||
info = encodings[encIdx]; | ||
} | ||
this.encoding = info[0]; | ||
const nb = info[1]; | ||
info[2](this); | ||
if (nb === 0) | ||
return; | ||
this.lastNeed = 0; | ||
this.lastTotal = 0; | ||
this.lastChar = Buffer.allocUnsafe(nb); | ||
} | ||
|
||
// TODO(addaleax): This method should not accept strings as input. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't understand the comment here. Is the suggestion that in the future it should throw on a string? Otherwise the comment seems at odds with the string check below. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yes… do you have different thoughts? It doesn’t really make sense to pass in a string here, does it? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe just leave the behavior for string inputs as it is and make it throw in another PR? (That would be semver-major, I guess.) (EDIT: OK, this PR is already semver-major.) |
||
StringDecoder.prototype.write = function(buf) { | ||
if (buf.length === 0) | ||
return ''; | ||
if (typeof buf === 'string') | ||
return buf; | ||
var r; | ||
var i; | ||
if (this.lastNeed) { | ||
|
@@ -94,10 +163,10 @@ StringDecoder.prototype.text = utf8Text; | |
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer | ||
StringDecoder.prototype.fillLast = function(buf) { | ||
if (this.lastNeed <= buf.length) { | ||
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); | ||
copy(buf, this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); | ||
return this.lastChar.toString(this.encoding, 0, this.lastTotal); | ||
} | ||
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length); | ||
copy(buf, this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length); | ||
this.lastNeed -= buf.length; | ||
}; | ||
|
||
|
@@ -185,10 +254,10 @@ function utf8FillLast(buf) { | |
if (r !== undefined) | ||
return r; | ||
if (this.lastNeed <= buf.length) { | ||
buf.copy(this.lastChar, p, 0, this.lastNeed); | ||
return this.lastChar.toString(this.encoding, 0, this.lastTotal); | ||
copy(buf, this.lastChar, p, 0, this.lastNeed); | ||
return utf8Slice(this.lastChar, 0, this.lastTotal); | ||
} | ||
buf.copy(this.lastChar, p, 0, buf.length); | ||
copy(buf, this.lastChar, p, 0, buf.length); | ||
this.lastNeed -= buf.length; | ||
} | ||
|
||
|
@@ -198,11 +267,11 @@ function utf8FillLast(buf) { | |
function utf8Text(buf, i) { | ||
const total = utf8CheckIncomplete(this, buf, i); | ||
if (!this.lastNeed) | ||
return buf.toString('utf8', i); | ||
return utf8Slice(buf, i, buf.length); | ||
this.lastTotal = total; | ||
const end = buf.length - (total - this.lastNeed); | ||
buf.copy(this.lastChar, 0, end); | ||
return buf.toString('utf8', i, end); | ||
copy(buf, this.lastChar, 0, end); | ||
return utf8Slice(buf, i, end); | ||
} | ||
|
||
// For UTF-8, a replacement character is added when ending on a partial | ||
|
@@ -220,7 +289,7 @@ function utf8End(buf) { | |
// decode the last character properly. | ||
function utf16Text(buf, i) { | ||
if ((buf.length - i) % 2 === 0) { | ||
const r = buf.toString('utf16le', i); | ||
const r = ucs2Slice(buf, i, buf.length); | ||
if (r) { | ||
const c = r.charCodeAt(r.length - 1); | ||
if (c >= 0xD800 && c <= 0xDBFF) { | ||
|
@@ -236,7 +305,7 @@ function utf16Text(buf, i) { | |
this.lastNeed = 1; | ||
this.lastTotal = 2; | ||
this.lastChar[0] = buf[buf.length - 1]; | ||
return buf.toString('utf16le', i, buf.length - 1); | ||
return ucs2Slice(buf, i, buf.length - 1); | ||
} | ||
|
||
// For UTF-16LE we do not explicitly append special replacement characters if we | ||
|
@@ -245,15 +314,15 @@ function utf16End(buf) { | |
const r = (buf && buf.length ? this.write(buf) : ''); | ||
if (this.lastNeed) { | ||
const end = this.lastTotal - this.lastNeed; | ||
return r + this.lastChar.toString('utf16le', 0, end); | ||
return r + ucs2Slice(this.lastChar, 0, end); | ||
} | ||
return r; | ||
} | ||
|
||
function base64Text(buf, i) { | ||
const n = (buf.length - i) % 3; | ||
if (n === 0) | ||
return buf.toString('base64', i); | ||
return base64Slice(buf, i, buf.length); | ||
this.lastNeed = 3 - n; | ||
this.lastTotal = 3; | ||
if (n === 1) { | ||
|
@@ -262,20 +331,28 @@ function base64Text(buf, i) { | |
this.lastChar[0] = buf[buf.length - 2]; | ||
this.lastChar[1] = buf[buf.length - 1]; | ||
} | ||
return buf.toString('base64', i, buf.length - n); | ||
return base64Slice(buf, i, buf.length - n); | ||
} | ||
|
||
|
||
function base64End(buf) { | ||
const r = (buf && buf.length ? this.write(buf) : ''); | ||
if (this.lastNeed) | ||
return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed); | ||
return r + base64Slice(this.lastChar, 0, 3 - this.lastNeed); | ||
return r; | ||
} | ||
|
||
// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex) | ||
function simpleWrite(buf) { | ||
return buf.toString(this.encoding); | ||
// Decodes all of `buf` through the latin1Slice binding. No decoder state is
// read or written, so no bytes are held back between calls.
function latin1Text(buf) {
  const end = buf.length;
  return latin1Slice(buf, 0, end);
}
|
||
// Decodes all of `buf` through the asciiSlice binding. No decoder state is
// read or written, so no bytes are held back between calls.
function asciiText(buf) {
  const end = buf.length;
  return asciiSlice(buf, 0, end);
}
|
||
// Decodes all of `buf` through the hexSlice binding. No decoder state is read
// or written, so no bytes are held back between calls.
function hexText(buf) {
  const end = buf.length;
  return hexSlice(buf, 0, end);
}
|
||
function simpleEnd(buf) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it affect performance if we use constants with names instead of number literals for indices?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIRC yes.