Skip to content

Commit

Permalink
string_decoder: refactor encoding normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
mscdex authored and addaleax committed May 5, 2017
1 parent f4e7b55 commit 525fabd
Showing 1 changed file with 93 additions and 38 deletions.
131 changes: 93 additions & 38 deletions lib/string_decoder.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,51 +28,106 @@ const {
copy, latin1Slice, asciiSlice, hexSlice, utf8Slice, ucs2Slice, base64Slice
} = process.binding('buffer');

// Do not cache `Buffer.isEncoding` when checking encoding names as some
// modules monkey-patch it to support additional encodings
function normalizeEncoding(enc) {
const nenc = internalUtil.normalizeEncoding(enc);
if (typeof nenc !== 'string' &&
(Buffer.isEncoding === isEncoding || !Buffer.isEncoding(enc)))
throw new Error(`Unknown encoding: ${enc}`);
return nenc || enc;
// Table of the encodings handled natively by StringDecoder. The position of
// each entry is the index that translateEncoding() returns for that encoding.
//
// Every entry is a triple:
//   [0] normalized encoding name,
//   [1] size of the decoder's internal buffer in bytes (0 means the decoder
//       constructor returns before allocating one),
//   [2] initializer that installs the encoding-specific methods on the
//       decoder instance passed to it.
const encodings = [
  /* 0 */ [
    'utf8',
    4,
    (decoder) => {
      decoder.fillLast = utf8FillLast;
    }
  ],
  /* 1 */ [
    'utf16le',
    4,
    (decoder) => {
      decoder.text = utf16Text;
      decoder.end = utf16End;
    }
  ],
  /* 2 */ [
    'latin1',
    0,
    (decoder) => {
      decoder.text = latin1Text;
      decoder.end = simpleEnd;
    }
  ],
  /* 3 */ [
    'base64',
    3,
    (decoder) => {
      decoder.text = base64Text;
      decoder.end = base64End;
    }
  ],
  /* 4 */ [
    'ascii',
    0,
    (decoder) => {
      decoder.text = asciiText;
      decoder.end = simpleEnd;
    }
  ],
  /* 5 */ [
    'hex',
    0,
    (decoder) => {
      decoder.text = hexText;
      decoder.end = simpleEnd;
    }
  ]
];

// Every accepted encoding label (in lower case) mapped to the index of its
// entry in the `encodings` table. Several labels alias the same entry.
const encodingIndexes = new Map([
  ['utf8', 0],
  ['utf-8', 0],
  ['ucs2', 1],
  ['ucs-2', 1],
  ['utf16le', 1],
  ['utf-16le', 1],
  ['latin1', 2],
  ['binary', 2],
  ['base64', 3],
  ['ascii', 4],
  ['hex', 5]
]);

// Translates a user-supplied encoding label into an index into `encodings`.
//
// - Any falsy value (undefined, null, '', 0, ...) selects index 0 (utf8).
// - Otherwise the value is coerced to a string and matched
//   case-insensitively against the known labels.
// - Returns -1 when the label is not recognized.
function translateEncoding(enc) {
  if (!enc) return 0;

  // String concatenation is used deliberately for the coercion: unlike
  // String(enc) it throws a TypeError for Symbols.
  const label = (enc + '').toLowerCase();

  const index = encodingIndexes.get(label);
  return index === undefined ? -1 : index;
}

// StringDecoder provides an interface for efficiently splitting a series of
// buffers into a series of JS strings without breaking apart multi-byte
// characters.
// Do not cache `Buffer.isEncoding` when checking encoding names as some
// modules monkey-patch it to support additional encodings
exports.StringDecoder = StringDecoder;
function StringDecoder(encoding) {
this.encoding = normalizeEncoding(encoding);
var nb;
switch (this.encoding) {
case 'utf16le':
this.text = utf16Text;
this.end = utf16End;
nb = 4;
break;
case 'utf8':
this.fillLast = utf8FillLast;
nb = 4;
break;
case 'base64':
this.text = base64Text;
this.end = base64End;
nb = 3;
break;
case 'hex':
this.write = hexText;
this.end = simpleEnd;
return;
case 'latin1':
this.write = latin1Text;
this.end = simpleEnd;
return;
case 'ascii':
this.write = asciiText;
this.end = simpleEnd;
return;
function StringDecoder(enc) {
var info;
const encIdx = translateEncoding(enc);
if (encIdx === -1) {
if (Buffer.isEncoding === isEncoding || !Buffer.isEncoding(enc))
throw new Error(`Unknown encoding: ${enc}`);
this.encoding = enc;
return;
} else {
info = encodings[encIdx];
}
this.encoding = info[0];
const nb = info[1];
info[2](this);
if (nb === 0)
return;
this.lastNeed = 0;
this.lastTotal = 0;
this.lastChar = Buffer.allocUnsafe(nb);
Expand Down

0 comments on commit 525fabd

Please sign in to comment.