Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

string_decoder: support Uint8Array input to methods #11613

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions doc/api/string_decoder.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

> Stability: 2 - Stable

The `string_decoder` module provides an API for decoding `Buffer` objects into
strings in a manner that preserves encoded multi-byte UTF-8 and UTF-16
characters. It can be accessed using:
The `string_decoder` module provides an API for decoding `Buffer` and
`Uint8Array` objects into strings in a manner that preserves encoded multi-byte
UTF-8 and UTF-16 characters. It can be accessed using:

```js
const StringDecoder = require('string_decoder').StringDecoder;
Expand Down Expand Up @@ -53,9 +53,14 @@ Creates a new `StringDecoder` instance.
### stringDecoder.end([buffer])
<!-- YAML
added: v0.9.3
changes:
- version: REPLACEME
pr-url: https://github.com/nodejs/node/pull/11613
description: The `buffer` argument can now be a `Uint8Array` instance.
-->

* `buffer` {Buffer} A `Buffer` containing the bytes to decode.
* `buffer` {Buffer|Uint8Array} A `Buffer` or `Uint8Array` containing the bytes
to decode.

Returns any remaining input stored in the internal buffer as a string. Bytes
representing incomplete UTF-8 and UTF-16 characters will be replaced with
Expand All @@ -68,13 +73,17 @@ is performed before returning the remaining input.
<!-- YAML
added: v0.1.99
changes:
- version: REPLACEME
pr-url: https://github.com/nodejs/node/pull/11613
description: The `buffer` argument can now be a `Uint8Array` instance.
- version: REPLACEME
pr-url: https://github.com/nodejs/node/pull/9618
description: Each invalid character is now replaced by a single replacement
character instead of one for each individual byte.
-->

* `buffer` {Buffer} A `Buffer` containing the bytes to decode.
* `buffer` {Buffer|Uint8Array} A `Buffer` or `Uint8Array` containing the bytes
to decode.

Returns a decoded string, ensuring that any incomplete multibyte characters at
the end of the `Buffer` or `Uint8Array` are omitted from the returned string
and stored in an
Expand Down
169 changes: 123 additions & 46 deletions lib/string_decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,52 +24,121 @@
const Buffer = require('buffer').Buffer;
const internalUtil = require('internal/util');
const isEncoding = Buffer[internalUtil.kIsEncodingSymbol];
const {
copy, latin1Slice, asciiSlice, hexSlice, utf8Slice, ucs2Slice, base64Slice
} = process.binding('buffer');

// Do not cache `Buffer.isEncoding` when checking encoding names as some
// modules monkey-patch it to support additional encodings
function normalizeEncoding(enc) {
const nenc = internalUtil.normalizeEncoding(enc);
if (typeof nenc !== 'string' &&
(Buffer.isEncoding === isEncoding || !Buffer.isEncoding(enc)))
throw new Error(`Unknown encoding: ${enc}`);
return nenc || enc;
// Per-encoding configuration table. The StringDecoder constructor indexes it
// with the value returned by translateEncoding(). Each row is a tuple of:
//   [0] the normalized encoding name,
//   [1] the size in bytes of the decoder's `lastChar` buffer (0 means the
//       constructor allocates none),
//   [2] a setup callback that installs encoding-specific methods on a decoder.
const encodings = [
  /* 0 */ ['utf8', 4, (decoder) => {
    decoder.fillLast = utf8FillLast;
  }],
  /* 1 */ ['utf16le', 4, (decoder) => {
    decoder.text = utf16Text;
    decoder.end = utf16End;
  }],
  /* 2 */ ['latin1', 0, (decoder) => {
    decoder.text = latin1Text;
    decoder.end = simpleEnd;
  }],
  /* 3 */ ['base64', 3, (decoder) => {
    decoder.text = base64Text;
    decoder.end = base64End;
  }],
  /* 4 */ ['ascii', 0, (decoder) => {
    decoder.text = asciiText;
    decoder.end = simpleEnd;
  }],
  /* 5 */ ['hex', 0, (decoder) => {
    decoder.text = hexText;
    decoder.end = simpleEnd;
  }]
];

function translateEncoding(enc) {
if (!enc) return 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it affect performance if we use constants with names instead of number literals for indices?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC yes.

enc += '';
switch (enc.length) {
case 4:
if (enc === 'utf8') return 0;
if (enc === 'ucs2') return 1;
enc = enc.toLowerCase();
if (enc === 'utf8') return 0;
if (enc === 'ucs2') return 1;
break;
case 5:
if (enc === 'utf-8') return 0;
if (enc === 'ascii') return 4;
if (enc === 'ucs-2') return 1;
enc = enc.toLowerCase();
if (enc === 'utf-8') return 0;
if (enc === 'ascii') return 4;
if (enc === 'ucs-2') return 1;
break;
case 7:
return (enc === 'utf16le' || enc.toLowerCase() === 'utf16le' ? 1 : -1);
case 8:
return (enc === 'utf-16le' || enc.toLowerCase() === 'utf-16le' ? 1 : -1);
case 6:
if (enc === 'latin1') return 2;
if (enc === 'binary') return 2;
if (enc === 'base64') return 3;
enc = enc.toLowerCase();
if (enc === 'latin1') return 2;
if (enc === 'binary') return 2;
if (enc === 'base64') return 3;
break;
case 3:
return (enc === 'hex' || enc.toLowerCase() === 'hex' ? 5 : -1);
}
return -1;
}

// StringDecoder provides an interface for efficiently splitting a series of
// buffers into a series of JS strings without breaking apart multi-byte
// characters.
// Do not cache `Buffer.isEncoding` when checking encoding names as some
// modules monkey-patch it to support additional encodings
exports.StringDecoder = StringDecoder;
function StringDecoder(encoding) {
this.encoding = normalizeEncoding(encoding);
var nb;
switch (this.encoding) {
case 'utf16le':
this.text = utf16Text;
this.end = utf16End;
nb = 4;
break;
case 'utf8':
this.fillLast = utf8FillLast;
nb = 4;
break;
case 'base64':
this.text = base64Text;
this.end = base64End;
nb = 3;
break;
default:
this.write = simpleWrite;
this.end = simpleEnd;
return;
function StringDecoder(enc) {
var info;
const encIdx = translateEncoding(enc);
if (encIdx === -1) {
if (Buffer.isEncoding === isEncoding || !Buffer.isEncoding(enc))
throw new Error(`Unknown encoding: ${enc}`);
this.encoding = enc;
return;
} else {
info = encodings[encIdx];
}
this.encoding = info[0];
const nb = info[1];
info[2](this);
if (nb === 0)
return;
this.lastNeed = 0;
this.lastTotal = 0;
this.lastChar = Buffer.allocUnsafe(nb);
}

// TODO(addaleax): In a future semver-major change, this method should throw on
// string input instead of passing it through unchanged as it does below.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand the comment here. Is the suggestion that in the future it should throw on a string? Otherwise the comment seems at odds with the string check below.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand the comment here. Is the suggestion that in the future it should throw on a string? Otherwise the comment seems at odds with the string check below.

Yes… do you have different thoughts? It doesn’t really make sense to pass in a string here, does it?

Copy link
Member

@joyeecheung joyeecheung May 5, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe just leave the behavior about string inputs as it is and make it throw in another PR? (that would be semver-major, I guess) (EDIT: OK this PR is already semver-major..)

StringDecoder.prototype.write = function(buf) {
if (buf.length === 0)
return '';
if (typeof buf === 'string')
return buf;
var r;
var i;
if (this.lastNeed) {
Expand All @@ -94,10 +163,10 @@ StringDecoder.prototype.text = utf8Text;
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
StringDecoder.prototype.fillLast = function(buf) {
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
copy(buf, this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
}
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
copy(buf, this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
this.lastNeed -= buf.length;
};

Expand Down Expand Up @@ -185,10 +254,10 @@ function utf8FillLast(buf) {
if (r !== undefined)
return r;
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, p, 0, this.lastNeed);
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
copy(buf, this.lastChar, p, 0, this.lastNeed);
return utf8Slice(this.lastChar, 0, this.lastTotal);
}
buf.copy(this.lastChar, p, 0, buf.length);
copy(buf, this.lastChar, p, 0, buf.length);
this.lastNeed -= buf.length;
}

Expand All @@ -198,11 +267,11 @@ function utf8FillLast(buf) {
function utf8Text(buf, i) {
const total = utf8CheckIncomplete(this, buf, i);
if (!this.lastNeed)
return buf.toString('utf8', i);
return utf8Slice(buf, i, buf.length);
this.lastTotal = total;
const end = buf.length - (total - this.lastNeed);
buf.copy(this.lastChar, 0, end);
return buf.toString('utf8', i, end);
copy(buf, this.lastChar, 0, end);
return utf8Slice(buf, i, end);
}

// For UTF-8, a replacement character is added when ending on a partial
Expand All @@ -220,7 +289,7 @@ function utf8End(buf) {
// decode the last character properly.
function utf16Text(buf, i) {
if ((buf.length - i) % 2 === 0) {
const r = buf.toString('utf16le', i);
const r = ucs2Slice(buf, i, buf.length);
if (r) {
const c = r.charCodeAt(r.length - 1);
if (c >= 0xD800 && c <= 0xDBFF) {
Expand All @@ -236,7 +305,7 @@ function utf16Text(buf, i) {
this.lastNeed = 1;
this.lastTotal = 2;
this.lastChar[0] = buf[buf.length - 1];
return buf.toString('utf16le', i, buf.length - 1);
return ucs2Slice(buf, i, buf.length - 1);
}

// For UTF-16LE we do not explicitly append special replacement characters if we
Expand All @@ -245,15 +314,15 @@ function utf16End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed) {
const end = this.lastTotal - this.lastNeed;
return r + this.lastChar.toString('utf16le', 0, end);
return r + ucs2Slice(this.lastChar, 0, end);
}
return r;
}

function base64Text(buf, i) {
const n = (buf.length - i) % 3;
if (n === 0)
return buf.toString('base64', i);
return base64Slice(buf, i, buf.length);
this.lastNeed = 3 - n;
this.lastTotal = 3;
if (n === 1) {
Expand All @@ -262,20 +331,28 @@ function base64Text(buf, i) {
this.lastChar[0] = buf[buf.length - 2];
this.lastChar[1] = buf[buf.length - 1];
}
return buf.toString('base64', i, buf.length - n);
return base64Slice(buf, i, buf.length - n);
}


function base64End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed)
return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
return r + base64Slice(this.lastChar, 0, 3 - this.lastNeed);
return r;
}

// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
function simpleWrite(buf) {
return buf.toString(this.encoding);
// Decodes all of `buf` as latin1 in a single call to latin1Slice. Only the
// first argument is used; the whole input is always decoded from byte 0.
function latin1Text(buf) {
  const end = buf.length;
  return latin1Slice(buf, 0, end);
}

// Decodes all of `buf` as ASCII in a single call to asciiSlice. Only the
// first argument is used; the whole input is always decoded from byte 0.
function asciiText(buf) {
  const end = buf.length;
  return asciiSlice(buf, 0, end);
}

// Renders all of `buf` as a hex string in a single call to hexSlice. Only the
// first argument is used; the whole input is always converted from byte 0.
function hexText(buf) {
  const end = buf.length;
  return hexSlice(buf, 0, end);
}

function simpleEnd(buf) {
Expand Down
Loading