Skip to content

Commit

Permalink
string_decoder: support Uint8Array input to methods
Browse files Browse the repository at this point in the history
This is a bit odd since `string_decoder` does not currently
perform any type checking. Also, this adds an explicit check
for `string` input, which does not really make sense but is relied upon
by our test suite.
  • Loading branch information
addaleax committed May 5, 2017
1 parent 8b79a17 commit f4e7b55
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 40 deletions.
19 changes: 14 additions & 5 deletions doc/api/string_decoder.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

> Stability: 2 - Stable
The `string_decoder` module provides an API for decoding `Buffer` objects into
strings in a manner that preserves encoded multi-byte UTF-8 and UTF-16
characters. It can be accessed using:
The `string_decoder` module provides an API for decoding `Buffer` and
`Uint8Array` objects into strings in a manner that preserves encoded multi-byte
UTF-8 and UTF-16 characters. It can be accessed using:

```js
const StringDecoder = require('string_decoder').StringDecoder;
Expand Down Expand Up @@ -53,9 +53,14 @@ Creates a new `StringDecoder` instance.
### stringDecoder.end([buffer])
<!-- YAML
added: v0.9.3
changes:
- version: REPLACEME
pr-url: https://github.com/nodejs/node/pull/11613
description: The `buffer` argument can now be a `Uint8Array` instance.
-->

* `buffer` {Buffer} A `Buffer` containing the bytes to decode.
* `buffer` {Buffer|Uint8Array} A `Buffer` or `Uint8Array` containing the bytes
to decode.

Returns any remaining input stored in the internal buffer as a string. Bytes
representing incomplete UTF-8 and UTF-16 characters will be replaced with
Expand All @@ -68,13 +73,17 @@ is performed before returning the remaining input.
<!-- YAML
added: v0.1.99
changes:
- version: REPLACEME
pr-url: https://github.com/nodejs/node/pull/11613
description: The `buffer` argument can now be a `Uint8Array` instance.
- version: REPLACEME
pr-url: https://github.com/nodejs/node/pull/9618
description: Each invalid character is now replaced by a single replacement
character instead of one for each individual byte.
-->

* `buffer` {Buffer} A `Buffer` containing the bytes to decode.
* `buffer` {Buffer|Uint8Array} A `Buffer` or `Uint8Array` containing the bytes
to decode.

Returns a decoded string, ensuring that any incomplete multibyte characters at
the end of the `Buffer` or `Uint8Array` are omitted from the returned string and stored in an
Expand Down
58 changes: 40 additions & 18 deletions lib/string_decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
const Buffer = require('buffer').Buffer;
const internalUtil = require('internal/util');
const isEncoding = Buffer[internalUtil.kIsEncodingSymbol];
const {
copy, latin1Slice, asciiSlice, hexSlice, utf8Slice, ucs2Slice, base64Slice
} = process.binding('buffer');

// Do not cache `Buffer.isEncoding` when checking encoding names as some
// modules monkey-patch it to support additional encodings
Expand Down Expand Up @@ -57,8 +60,16 @@ function StringDecoder(encoding) {
this.end = base64End;
nb = 3;
break;
default:
this.write = simpleWrite;
case 'hex':
this.write = hexText;
this.end = simpleEnd;
return;
case 'latin1':
this.write = latin1Text;
this.end = simpleEnd;
return;
case 'ascii':
this.write = asciiText;
this.end = simpleEnd;
return;
}
Expand All @@ -67,9 +78,12 @@ function StringDecoder(encoding) {
this.lastChar = Buffer.allocUnsafe(nb);
}

// TODO(addaleax): This method should not accept strings as input.
StringDecoder.prototype.write = function(buf) {
if (buf.length === 0)
return '';
if (typeof buf === 'string')
return buf;
var r;
var i;
if (this.lastNeed) {
Expand All @@ -94,10 +108,10 @@ StringDecoder.prototype.text = utf8Text;
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
// or Uint8Array
StringDecoder.prototype.fillLast = function(buf) {
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
copy(buf, this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
}
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
copy(buf, this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
this.lastNeed -= buf.length;
};

Expand Down Expand Up @@ -185,10 +199,10 @@ function utf8FillLast(buf) {
if (r !== undefined)
return r;
if (this.lastNeed <= buf.length) {
buf.copy(this.lastChar, p, 0, this.lastNeed);
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
copy(buf, this.lastChar, p, 0, this.lastNeed);
return utf8Slice(this.lastChar, 0, this.lastTotal);
}
buf.copy(this.lastChar, p, 0, buf.length);
copy(buf, this.lastChar, p, 0, buf.length);
this.lastNeed -= buf.length;
}

Expand All @@ -198,11 +212,11 @@ function utf8FillLast(buf) {
function utf8Text(buf, i) {
const total = utf8CheckIncomplete(this, buf, i);
if (!this.lastNeed)
return buf.toString('utf8', i);
return utf8Slice(buf, i, buf.length);
this.lastTotal = total;
const end = buf.length - (total - this.lastNeed);
buf.copy(this.lastChar, 0, end);
return buf.toString('utf8', i, end);
copy(buf, this.lastChar, 0, end);
return utf8Slice(buf, i, end);
}

// For UTF-8, a replacement character is added when ending on a partial
Expand All @@ -220,7 +234,7 @@ function utf8End(buf) {
// decode the last character properly.
function utf16Text(buf, i) {
if ((buf.length - i) % 2 === 0) {
const r = buf.toString('utf16le', i);
const r = ucs2Slice(buf, i, buf.length);
if (r) {
const c = r.charCodeAt(r.length - 1);
if (c >= 0xD800 && c <= 0xDBFF) {
Expand All @@ -236,7 +250,7 @@ function utf16Text(buf, i) {
this.lastNeed = 1;
this.lastTotal = 2;
this.lastChar[0] = buf[buf.length - 1];
return buf.toString('utf16le', i, buf.length - 1);
return ucs2Slice(buf, i, buf.length - 1);
}

// For UTF-16LE we do not explicitly append special replacement characters if we
Expand All @@ -245,15 +259,15 @@ function utf16End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed) {
const end = this.lastTotal - this.lastNeed;
return r + this.lastChar.toString('utf16le', 0, end);
return r + ucs2Slice(this.lastChar, 0, end);
}
return r;
}

function base64Text(buf, i) {
const n = (buf.length - i) % 3;
if (n === 0)
return buf.toString('base64', i);
return base64Slice(buf, i, buf.length);
this.lastNeed = 3 - n;
this.lastTotal = 3;
if (n === 1) {
Expand All @@ -262,20 +276,28 @@ function base64Text(buf, i) {
this.lastChar[0] = buf[buf.length - 2];
this.lastChar[1] = buf[buf.length - 1];
}
return buf.toString('base64', i, buf.length - n);
return base64Slice(buf, i, buf.length - n);
}


function base64End(buf) {
const r = (buf && buf.length ? this.write(buf) : '');
if (this.lastNeed)
return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed);
return r + base64Slice(this.lastChar, 0, 3 - this.lastNeed);
return r;
}

// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
function simpleWrite(buf) {
return buf.toString(this.encoding);
// `write` implementation installed by the StringDecoder constructor for the
// 'latin1' encoding. Decodes all of `buf` (index 0 up to `buf.length`) with
// the native `latin1Slice` binding and returns the resulting string.
// The binding is called directly rather than `buf.toString()`, presumably so
// that plain Uint8Array input (which lacks Buffer's encoding-aware
// `toString()`) is handled as well.
function latin1Text(buf) {
return latin1Slice(buf, 0, buf.length);
}

// `write` implementation installed by the StringDecoder constructor for the
// 'ascii' encoding. Decodes all of `buf` (index 0 up to `buf.length`) with
// the native `asciiSlice` binding and returns the resulting string.
// The binding is called directly rather than `buf.toString()`, presumably so
// that plain Uint8Array input is handled as well.
function asciiText(buf) {
return asciiSlice(buf, 0, buf.length);
}

// `write` implementation installed by the StringDecoder constructor for the
// 'hex' encoding. Converts all of `buf` (index 0 up to `buf.length`) to a
// string with the native `hexSlice` binding.
// The binding is called directly rather than `buf.toString()`, presumably so
// that plain Uint8Array input is handled as well.
function hexText(buf) {
return hexSlice(buf, 0, buf.length);
}

function simpleEnd(buf) {
Expand Down
39 changes: 22 additions & 17 deletions test/parallel/test-string-decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ assert.strictEqual(decoder.end(), '\ufffd');

// Additional utf8Text test
decoder = new StringDecoder('utf8');
assert.strictEqual(decoder.text(Buffer.from([0x41]), 2), '');
assert.strictEqual(decoder.text(Buffer.from([0x41]), 1), '');

// Additional UTF-16LE surrogate pair tests
decoder = new StringDecoder('utf16le');
Expand Down Expand Up @@ -144,23 +144,28 @@ function test(encoding, input, expected, singleSequence) {
} else {
sequences = [singleSequence];
}
sequences.forEach((sequence) => {
const decoder = new StringDecoder(encoding);
let output = '';
sequence.forEach((write) => {
output += decoder.write(input.slice(write[0], write[1]));
for (const useUint8array of [ false, true ]) {
sequences.forEach((sequence) => {
const decoder = new StringDecoder(encoding);
let output = '';
sequence.forEach((write) => {
let slice = input.slice(write[0], write[1]);
if (useUint8array)
slice = new Uint8Array(slice);
output += decoder.write(slice);
});
output += decoder.end();
if (output !== expected) {
const message =
'Expected "' + unicodeEscape(expected) + '", ' +
'but got "' + unicodeEscape(output) + '"\n' +
'input: ' + input.toString('hex').match(/.{2}/g) + '\n' +
'Write sequence: ' + JSON.stringify(sequence) + '\n' +
'Full Decoder State: ' + inspect(decoder);
assert.fail(output, expected, message);
}
});
output += decoder.end();
if (output !== expected) {
const message =
'Expected "' + unicodeEscape(expected) + '", ' +
'but got "' + unicodeEscape(output) + '"\n' +
'input: ' + input.toString('hex').match(/.{2}/g) + '\n' +
'Write sequence: ' + JSON.stringify(sequence) + '\n' +
'Full Decoder State: ' + inspect(decoder);
assert.fail(output, expected, message);
}
});
}
}

// unicodeEscape prints the str contents as unicode escape codes.
Expand Down

0 comments on commit f4e7b55

Please sign in to comment.