From 8b79a170f34db9e6d653573115949954665ebeb6 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Tue, 28 Feb 2017 17:57:58 +0100 Subject: [PATCH 1/3] buffer: make `*Slice` methods available on binding Makes the string slice methods of buffers available on the binding object in addition to the `Buffer` prototype. This enables subsequent `string_decoder` changes to use these methods directly without performance loss, since all parameters are known by the string decoder in those cases. --- src/node_buffer.cc | 54 +++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/src/node_buffer.cc b/src/node_buffer.cc index 367af6592ff39b..40504abfc0fbc2 100644 --- a/src/node_buffer.cc +++ b/src/node_buffer.cc @@ -453,17 +453,20 @@ void CreateFromString(const FunctionCallbackInfo& args) { template -void StringSlice(const FunctionCallbackInfo& args) { +inline void StringSlice(const FunctionCallbackInfo& args, + Local buffer_arg, + Local start_arg, + Local end_arg) { Environment* env = Environment::GetCurrent(args); Isolate* isolate = env->isolate(); - THROW_AND_RETURN_UNLESS_BUFFER(env, args.This()); - SPREAD_BUFFER_ARG(args.This(), ts_obj); + THROW_AND_RETURN_UNLESS_BUFFER(env, buffer_arg); + SPREAD_BUFFER_ARG(buffer_arg, ts_obj); if (ts_obj_length == 0) return args.GetReturnValue().SetEmptyString(); - SLICE_START_END(args[0], args[1], ts_obj_length) + SLICE_START_END(start_arg, end_arg, ts_obj_length) Local error; MaybeLocal ret = @@ -482,17 +485,20 @@ void StringSlice(const FunctionCallbackInfo& args) { template <> -void StringSlice(const FunctionCallbackInfo& args) { +inline void StringSlice(const FunctionCallbackInfo& args, + Local buffer_arg, + Local start_arg, + Local end_arg) { Isolate* isolate = args.GetIsolate(); - Environment* env = Environment::GetCurrent(isolate); + Environment* env = Environment::GetCurrent(args); - THROW_AND_RETURN_UNLESS_BUFFER(env, args.This()); - SPREAD_BUFFER_ARG(args.This(), ts_obj); + THROW_AND_RETURN_UNLESS_BUFFER(env, buffer_arg); + SPREAD_BUFFER_ARG(buffer_arg, ts_obj); if (ts_obj_length == 0) return args.GetReturnValue().SetEmptyString(); - SLICE_START_END(args[0], args[1], ts_obj_length) + SLICE_START_END(start_arg, end_arg, ts_obj_length) length /= 2; const char* data = ts_obj_data + start; @@ -540,6 +546,17 @@ void StringSlice(const FunctionCallbackInfo& args) { } +template +void StringSliceProto(const FunctionCallbackInfo& args) { + return StringSlice(args, args.This(), args[0], args[1]); +} + +template +void StringSliceStandalone(const FunctionCallbackInfo& args) { + return StringSlice(args, args[0], args[1], args[2]); +} + + // bytesCopied = copy(buffer, target[, targetStart][, sourceStart][, sourceEnd]) void Copy(const FunctionCallbackInfo &args) { Environment* env = Environment::GetCurrent(args); @@ -1186,12 +1203,12 @@ void SetupBufferJS(const FunctionCallbackInfo& args) { Local proto = args[0].As(); env->set_buffer_prototype_object(proto); - env->SetMethod(proto, "asciiSlice", StringSlice); - env->SetMethod(proto, "base64Slice", StringSlice); - env->SetMethod(proto, "latin1Slice", StringSlice); - env->SetMethod(proto, "hexSlice", StringSlice); - env->SetMethod(proto, "ucs2Slice", StringSlice); - env->SetMethod(proto, "utf8Slice", StringSlice); + env->SetMethod(proto, "asciiSlice", StringSliceProto); + env->SetMethod(proto, "base64Slice", StringSliceProto); + env->SetMethod(proto, "latin1Slice", StringSliceProto); + env->SetMethod(proto, "hexSlice", StringSliceProto); + env->SetMethod(proto, "ucs2Slice", StringSliceProto); + env->SetMethod(proto, "utf8Slice", StringSliceProto); env->SetMethod(proto, "asciiWrite", StringWrite); env->SetMethod(proto, "base64Write", StringWrite); @@ -1244,6 +1261,13 @@ void Initialize(Local target, env->SetMethod(target, "swap32", Swap32); env->SetMethod(target, "swap64", Swap64); + env->SetMethod(target, "asciiSlice", StringSliceStandalone); + env->SetMethod(target, "base64Slice", StringSliceStandalone); + env->SetMethod(target, "latin1Slice", StringSliceStandalone); + env->SetMethod(target, "hexSlice", StringSliceStandalone); + env->SetMethod(target, "ucs2Slice", StringSliceStandalone); + env->SetMethod(target, "utf8Slice", StringSliceStandalone); + target->Set(env->context(), FIXED_ONE_BYTE_STRING(env->isolate(), "kMaxLength"), Integer::NewFromUnsigned(env->isolate(), kMaxLength)).FromJust(); From f4e7b557a8219751b7810b9e16e98ec482dfebb3 Mon Sep 17 00:00:00 2001 From: Anna Henningsen Date: Fri, 23 Dec 2016 13:50:57 +0100 Subject: [PATCH 2/3] string_decoder: support Uint8Array input to methods This is a bit odd since `string_decoder` does currently not perform any type checking. Also, this adds an explicit check for `string` input, which does not really make sense but is relied upon by our test suite. --- doc/api/string_decoder.md | 19 ++++++--- lib/string_decoder.js | 58 +++++++++++++++++++--------- test/parallel/test-string-decoder.js | 39 +++++++++++-------- 3 files changed, 76 insertions(+), 40 deletions(-) diff --git a/doc/api/string_decoder.md b/doc/api/string_decoder.md index a92f62ebcdfea9..75506dc564cf30 100644 --- a/doc/api/string_decoder.md +++ b/doc/api/string_decoder.md @@ -2,9 +2,9 @@ > Stability: 2 - Stable -The `string_decoder` module provides an API for decoding `Buffer` objects into -strings in a manner that preserves encoded multi-byte UTF-8 and UTF-16 -characters. It can be accessed using: +The `string_decoder` module provides an API for decoding `Buffer` and +`Uint8Array` objects into strings in a manner that preserves encoded multi-byte +UTF-8 and UTF-16 characters. It can be accessed using: ```js const StringDecoder = require('string_decoder').StringDecoder; @@ -53,9 +53,14 @@ Creates a new `StringDecoder` instance. ### stringDecoder.end([buffer]) -* `buffer` {Buffer} A `Buffer` containing the bytes to decode. +* `buffer` {Buffer|Uint8Array} A `Buffer` or `Uint8Array` containing the bytes + to decode. Returns any remaining input stored in the internal buffer as a string. Bytes representing incomplete UTF-8 and UTF-16 characters will be replaced with @@ -68,13 +73,17 @@ is performed before returning the remaining input. -* `buffer` {Buffer} A `Buffer` containing the bytes to decode. +* `buffer` {Buffer|Uint8Array} A `Buffer` or `Uint8Array` containing the bytes + to decode. Returns a decoded string, ensuring that any incomplete multibyte characters at the end of the `Buffer` are omitted from the returned string and stored in an diff --git a/lib/string_decoder.js b/lib/string_decoder.js index eee9a7d9273ebf..23bd2ad66546ce 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -24,6 +24,9 @@ const Buffer = require('buffer').Buffer; const internalUtil = require('internal/util'); const isEncoding = Buffer[internalUtil.kIsEncodingSymbol]; +const { + copy, latin1Slice, asciiSlice, hexSlice, utf8Slice, ucs2Slice, base64Slice +} = process.binding('buffer'); // Do not cache `Buffer.isEncoding` when checking encoding names as some // modules monkey-patch it to support additional encodings @@ -57,8 +60,16 @@ function StringDecoder(encoding) { this.end = base64End; nb = 3; break; - default: - this.write = simpleWrite; + case 'hex': + this.write = hexText; + this.end = simpleEnd; + return; + case 'latin1': + this.write = latin1Text; + this.end = simpleEnd; + return; + case 'ascii': + this.write = asciiText; this.end = simpleEnd; return; } @@ -67,9 +78,12 @@ function StringDecoder(encoding) { this.lastChar = Buffer.allocUnsafe(nb); } +// TODO(addaleax): This method should not accept strings as input. StringDecoder.prototype.write = function(buf) { if (buf.length === 0) return ''; + if (typeof buf === 'string') + return buf; var r; var i; if (this.lastNeed) { @@ -94,10 +108,10 @@ StringDecoder.prototype.text = utf8Text; // Attempts to complete a partial non-UTF-8 character using bytes from a Buffer StringDecoder.prototype.fillLast = function(buf) { if (this.lastNeed <= buf.length) { - buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); + copy(buf, this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed); return this.lastChar.toString(this.encoding, 0, this.lastTotal); } - buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length); + copy(buf, this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length); this.lastNeed -= buf.length; }; @@ -185,10 +199,10 @@ function utf8FillLast(buf) { if (r !== undefined) return r; if (this.lastNeed <= buf.length) { - buf.copy(this.lastChar, p, 0, this.lastNeed); - return this.lastChar.toString(this.encoding, 0, this.lastTotal); + copy(buf, this.lastChar, p, 0, this.lastNeed); + return utf8Slice(this.lastChar, 0, this.lastTotal); } - buf.copy(this.lastChar, p, 0, buf.length); + copy(buf, this.lastChar, p, 0, buf.length); this.lastNeed -= buf.length; } @@ -198,11 +212,11 @@ function utf8FillLast(buf) { function utf8Text(buf, i) { const total = utf8CheckIncomplete(this, buf, i); if (!this.lastNeed) - return buf.toString('utf8', i); + return utf8Slice(buf, i, buf.length); this.lastTotal = total; const end = buf.length - (total - this.lastNeed); - buf.copy(this.lastChar, 0, end); - return buf.toString('utf8', i, end); + copy(buf, this.lastChar, 0, end); + return utf8Slice(buf, i, end); } // For UTF-8, a replacement character is added when ending on a partial @@ -220,7 +234,7 @@ function utf8End(buf) { // decode the last character properly. function utf16Text(buf, i) { if ((buf.length - i) % 2 === 0) { - const r = buf.toString('utf16le', i); + const r = ucs2Slice(buf, i, buf.length); if (r) { const c = r.charCodeAt(r.length - 1); if (c >= 0xD800 && c <= 0xDBFF) { @@ -236,7 +250,7 @@ function utf16Text(buf, i) { this.lastNeed = 1; this.lastTotal = 2; this.lastChar[0] = buf[buf.length - 1]; - return buf.toString('utf16le', i, buf.length - 1); + return ucs2Slice(buf, i, buf.length - 1); } // For UTF-16LE we do not explicitly append special replacement characters if we @@ -245,7 +259,7 @@ function utf16End(buf) { const r = (buf && buf.length ? this.write(buf) : ''); if (this.lastNeed) { const end = this.lastTotal - this.lastNeed; - return r + this.lastChar.toString('utf16le', 0, end); + return r + ucs2Slice(this.lastChar, 0, end); } return r; } @@ -253,7 +267,7 @@ function utf16End(buf) { function base64Text(buf, i) { const n = (buf.length - i) % 3; if (n === 0) - return buf.toString('base64', i); + return base64Slice(buf, i, buf.length); this.lastNeed = 3 - n; this.lastTotal = 3; if (n === 1) { @@ -262,20 +276,28 @@ function base64Text(buf, i) { this.lastChar[0] = buf[buf.length - 2]; this.lastChar[1] = buf[buf.length - 1]; } - return buf.toString('base64', i, buf.length - n); + return base64Slice(buf, i, buf.length - n); } function base64End(buf) { const r = (buf && buf.length ? this.write(buf) : ''); if (this.lastNeed) - return r + this.lastChar.toString('base64', 0, 3 - this.lastNeed); + return r + base64Slice(this.lastChar, 0, 3 - this.lastNeed); return r; } // Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex) -function simpleWrite(buf) { - return buf.toString(this.encoding); +function latin1Text(buf) { + return latin1Slice(buf, 0, buf.length); +} + +function asciiText(buf) { + return asciiSlice(buf, 0, buf.length); +} + +function hexText(buf) { + return hexSlice(buf, 0, buf.length); } function simpleEnd(buf) { diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index af88b809b89640..aec92cc78a76e9 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -106,7 +106,7 @@ assert.strictEqual(decoder.end(), '\ufffd'); // Additional utf8Text test decoder = new StringDecoder('utf8'); -assert.strictEqual(decoder.text(Buffer.from([0x41]), 2), ''); +assert.strictEqual(decoder.text(Buffer.from([0x41]), 1), ''); // Additional UTF-16LE surrogate pair tests decoder = new StringDecoder('utf16le'); @@ -144,23 +144,28 @@ function test(encoding, input, expected, singleSequence) { } else { sequences = [singleSequence]; } - sequences.forEach((sequence) => { - const decoder = new StringDecoder(encoding); - let output = ''; - sequence.forEach((write) => { - output += decoder.write(input.slice(write[0], write[1])); + for (const useUint8array of [ false, true ]) { + sequences.forEach((sequence) => { + const decoder = new StringDecoder(encoding); + let output = ''; + sequence.forEach((write) => { + let slice = input.slice(write[0], write[1]); + if (useUint8array) + slice = new Uint8Array(slice); + output += decoder.write(slice); + }); + output += decoder.end(); + if (output !== expected) { + const message = + 'Expected "' + unicodeEscape(expected) + '", ' + + 'but got "' + unicodeEscape(output) + '"\n' + + 'input: ' + input.toString('hex').match(/.{2}/g) + '\n' + + 'Write sequence: ' + JSON.stringify(sequence) + '\n' + + 'Full Decoder State: ' + inspect(decoder); + assert.fail(output, expected, message); + } }); - output += decoder.end(); - if (output !== expected) { - const message = - 'Expected "' + unicodeEscape(expected) + '", ' + - 'but got "' + unicodeEscape(output) + '"\n' + - 'input: ' + input.toString('hex').match(/.{2}/g) + '\n' + - 'Write sequence: ' + JSON.stringify(sequence) + '\n' + - 'Full Decoder State: ' + inspect(decoder); - assert.fail(output, expected, message); - } - }); + } } // unicodeEscape prints the str contents as unicode escape codes. From 525fabd7ed6d610d09a00e7271796be7276b6217 Mon Sep 17 00:00:00 2001 From: Brian White Date: Wed, 12 Apr 2017 21:04:52 -0400 Subject: [PATCH 3/3] string_decoder: refactor encoding normalization --- lib/string_decoder.js | 131 ++++++++++++++++++++++++++++++------------ 1 file changed, 93 insertions(+), 38 deletions(-) diff --git a/lib/string_decoder.js b/lib/string_decoder.js index 23bd2ad66546ce..65d91bc63d5cde 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -28,51 +28,106 @@ const { copy, latin1Slice, asciiSlice, hexSlice, utf8Slice, ucs2Slice, base64Slice } = process.binding('buffer'); -// Do not cache `Buffer.isEncoding` when checking encoding names as some -// modules monkey-patch it to support additional encodings -function normalizeEncoding(enc) { - const nenc = internalUtil.normalizeEncoding(enc); - if (typeof nenc !== 'string' && - (Buffer.isEncoding === isEncoding || !Buffer.isEncoding(enc))) - throw new Error(`Unknown encoding: ${enc}`); - return nenc || enc; +const encodings = [ + // 0 + [ + 'utf8', // normalized encoding name string + 4, // buffer size + (self) => { self.fillLast = utf8FillLast; } // StringDecoder initialization + ], + // 1 + [ + 'utf16le', + 4, + (self) => { self.text = utf16Text; self.end = utf16End; } + ], + // 2 + [ + 'latin1', + 0, + (self) => { self.text = latin1Text; self.end = simpleEnd; } + ], + // 3 + [ + 'base64', + 3, + (self) => { self.text = base64Text; self.end = base64End; } + ], + // 4 + [ + 'ascii', + 0, + (self) => { self.text = asciiText; self.end = simpleEnd; } + ], + // 5 + [ + 'hex', + 0, + (self) => { self.text = hexText; self.end = simpleEnd; } + ] +]; + +function translateEncoding(enc) { + if (!enc) return 0; + enc += ''; + switch (enc.length) { + case 4: + if (enc === 'utf8') return 0; + if (enc === 'ucs2') return 1; + enc = enc.toLowerCase(); + if (enc === 'utf8') return 0; + if (enc === 'ucs2') return 1; + break; + case 5: + if (enc === 'utf-8') return 0; + if (enc === 'ascii') return 4; + if (enc === 'ucs-2') return 1; + enc = enc.toLowerCase(); + if (enc === 'utf-8') return 0; + if (enc === 'ascii') return 4; + if (enc === 'ucs-2') return 1; + break; + case 7: + return (enc === 'utf16le' || enc.toLowerCase() === 'utf16le' ? 1 : -1); + case 8: + return (enc === 'utf-16le' || enc.toLowerCase() === 'utf-16le' ? 1 : -1); + case 6: + if (enc === 'latin1') return 2; + if (enc === 'binary') return 2; + if (enc === 'base64') return 3; + enc = enc.toLowerCase(); + if (enc === 'latin1') return 2; + if (enc === 'binary') return 2; + if (enc === 'base64') return 3; + break; + case 3: + return (enc === 'hex' || enc.toLowerCase() === 'hex' ? 5 : -1); + } + return -1; } // StringDecoder provides an interface for efficiently splitting a series of // buffers into a series of JS strings without breaking apart multi-byte // characters. +// Do not cache `Buffer.isEncoding` when checking encoding names as some +// modules monkey-patch it to support additional encodings exports.StringDecoder = StringDecoder; -function StringDecoder(encoding) { - this.encoding = normalizeEncoding(encoding); - var nb; - switch (this.encoding) { - case 'utf16le': - this.text = utf16Text; - this.end = utf16End; - nb = 4; - break; - case 'utf8': - this.fillLast = utf8FillLast; - nb = 4; - break; - case 'base64': - this.text = base64Text; - this.end = base64End; - nb = 3; - break; - case 'hex': - this.write = hexText; - this.end = simpleEnd; - return; - case 'latin1': - this.write = latin1Text; - this.end = simpleEnd; - return; - case 'ascii': - this.write = asciiText; - this.end = simpleEnd; - return; +function StringDecoder(enc) { + var info; + const encIdx = translateEncoding(enc); + if (encIdx === -1) { + if (Buffer.isEncoding === isEncoding || !Buffer.isEncoding(enc)) + throw new Error(`Unknown encoding: ${enc}`); + this.encoding = enc; + return; + } else { + info = encodings[encIdx]; } + this.encoding = info[0]; + const nb = info[1]; + info[2](this); + if (nb === 0) + return; this.lastNeed = 0; this.lastTotal = 0; this.lastChar = Buffer.allocUnsafe(nb);