From f266161c2587321cef56c214485241c2cc881c1a Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 16 Jan 2020 11:05:16 +0100 Subject: [PATCH] Add 'utf-8-lossy' encoding mode Also expand README with guidance on when to use which encoding mode. --- README.md | 17 +++++- luaparse.js | 149 ++++++++++++++++++++++++++++++++++++++++++++----- test/runner.js | 17 ++++++ 3 files changed, 165 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index ce7fa32c..481e5015 100644 --- a/README.md +++ b/README.md @@ -141,15 +141,22 @@ Possible values are as follows: - `'none'`: Source code characters all pass through as-is and string literals are not interpreted at all; the string literal nodes contain - the value `null`. This is the default mode. + the value `null`. This is the default mode. Use it when you are not + interested in the concrete values of Lua string literals. - `'x-user-defined'`: Source code has been decoded with the WHATWG `x-user-defined` encoding; escapes of bytes in the range \[0x80, 0xff] - are mapped to the Unicode range \[U+F780, U+F7FF]. + are mapped to the Unicode range \[U+F780, U+F7FF]. Use this mode to parse + files decoded with the WHATWG [`TextDecoder`] API. - `'pseudo-latin1'`: Source code has been decoded with the IANA `iso-8859-1` encoding; escapes of bytes in the range \[0x80, 0xff] are mapped to Unicode range \[U+0080, U+00FF]. Note that this is **not** the same as how WHATWG standards define the `iso-8859-1` - encoding, which is to say, as a synonym of `windows-1252`. + encoding, which is to say, as a synonym of `windows-1252`. Use this mode + to interoperate with an API like [`readAsBinaryString`], [`atob`] or + [`Buffer`]. +- `'utf-8-lossy'`: Source code has been decoded as UTF-8. Use this mode + when you expect most string literals to be UTF-8, but do not require + exact faithful representation in all circumstances. 
// Pure-JavaScript replacement for `String.fromCodePoint`, limited to a single
// code point. Code points above U+FFFF are returned as a UTF-16 surrogate pair.
//
// Throws a RangeError when `codepoint` is greater than 0x10FFFF.
var fromCodePointFallback = /* istanbul ignore next */ function (codepoint) {
  if (codepoint > 0x10ffff)
    throw new RangeError(codepoint);
  if (codepoint > 0xffff) {
    // Surrogate pairs encode the 20-bit offset from U+10000, not the code
    // point itself; the subtraction must happen before the value is split.
    var offset = codepoint - 0x10000;
    return String.fromCharCode(
      0xd800 | (offset >> 10),
      0xdc00 | (offset & 0x3ff)
    );
  }
  return String.fromCharCode(codepoint);
};

// Use the native implementation when the engine provides one.
var fromCodePoint = fromCodePointFallback;

/* istanbul ignore else */
if (String.fromCodePoint)
  fromCodePoint = String.fromCodePoint;

// Build an encoding mode object that interprets byte escapes as UTF-8.
//
// Parameters:
//   fallbackFuncByte(value, raw)        called for every byte that is not part
//                                       of a well-formed UTF-8 sequence; its
//                                       return value is inserted in the output.
//                                       `raw` is the concatenated source text
//                                       of all byte escapes pending at flush
//                                       time, not only the offending one.
//   fallbackFuncUnicode(codepoint, raw) called for surrogate code units found
//                                       in the source text (`raw` is null) and
//                                       for `\u{...}` escapes that denote a
//                                       surrogate or exceed U+10FFFF.
//
// Returns an object with `fixup`, `initState`, `encodeByte` and `encodeUTF8`.
//
// Byte escapes are buffered in the state object and only turned into
// characters when flushed. `encodeByte(null, null, state)` flushes, and so does
// `encodeUTF8`. A caller that appends any other text to its output (for
// example literal source text passed through `fixup`) has to flush first,
// otherwise pending bytes come out after that text.
function makeUTF8Mode(fallbackFuncByte, fallbackFuncUnicode) {
  // Decode an array of byte values as UTF-8. Each byte that does not start a
  // well-formed sequence is handed to `fallbackFuncByte` on its own and
  // decoding resumes at the following byte.
  function decodeBytes(bytes, rawBuffer) {
    var result = '';
    var off = 0;

    while (off < bytes.length) {
      var lead = bytes[off];
      var len = 1;          // sequence length announced by the lead byte
      var min = 0;          // smallest code point that needs `len` bytes
      var codepoint = null; // stays null when the sequence is rejected

      if (lead < 0x80) {
        codepoint = lead;
      } else if (0xc2 <= lead && lead <= 0xdf) {
        // 0xc0 and 0xc1 could only start overlong forms, hence 0xc2.
        codepoint = lead & 0x1f;
        len = 2;
        min = 0x80;
      } else if (0xe0 <= lead && lead <= 0xef) {
        // 0xe0 is a valid lead byte (U+0800..U+0FFF); its overlong forms
        // are caught by the `min` check below.
        codepoint = lead & 0x0f;
        len = 3;
        min = 0x800;
      } else if (0xf0 <= lead && lead <= 0xf4) {
        codepoint = lead & 0x07;
        len = 4;
        min = 0x10000;
      }

      if (codepoint !== null) {
        if (off + len > bytes.length) {
          // Sequence is cut short by the end of the buffer.
          codepoint = null;
        } else {
          for (var j = 1; j < len; ++j) {
            var next = bytes[off + j];
            if ((next & 0xc0) !== 0x80) {
              codepoint = null;
              break;
            }
            codepoint = (codepoint << 6) | (next & 0x3f);
          }
        }
      }

      if (codepoint !== null) {
        var overlong = codepoint < min;
        var outOfRange = codepoint > 0x10ffff;
        // UTF-8 must not encode surrogates; `fixup` and `encodeUTF8`
        // replace them too, so the output never holds a lone surrogate.
        var surrogate = 0xd800 <= codepoint && codepoint <= 0xdfff;
        if (overlong || outOfRange || surrogate)
          codepoint = null;
      }

      if (codepoint === null) {
        result += fallbackFuncByte(lead, rawBuffer);
        off += 1;
      } else {
        result += fromCodePoint(codepoint);
        off += len;
      }
    }

    return result;
  }

  // Empty the pending byte buffer of `encodingState` and return its decoding.
  function flush(encodingState) {
    var bytes = encodingState.byteBuffer;
    var rawBuffer = encodingState.rawBuffer;

    encodingState.byteBuffer = [];
    encodingState.rawBuffer = '';

    return decodeBytes(bytes, rawBuffer);
  }

  return {
    // Replace every unpaired surrogate code unit in `s` with the result of
    // `fallbackFuncUnicode`; well-formed surrogate pairs are kept.
    fixup: function (s) {
      return s.replace(/[\ud800-\udbff][\udc00-\udfff]|[\ud800-\udfff]/g, function (m) {
        if (m.length === 2)
          return m;
        return fallbackFuncUnicode(m.charCodeAt(0), null);
      });
    },

    // Create the per-literal state: pending byte values and their source text.
    initState: function () {
      return {
        byteBuffer: [],
        rawBuffer: ''
      };
    },

    // Queue one byte escape (`value` is the byte, `raw` its source text) and
    // return ''. When `value` is null, flush the queue instead and return the
    // decoded text.
    encodeByte: function (value, raw, encodingState) {
      if (value === null)
        return flush(encodingState);

      encodingState.byteBuffer.push(value);
      encodingState.rawBuffer += raw;
      return '';
    },

    // Encode the code point of a `\u{...}` escape. Pending byte escapes are
    // flushed first so that output order matches source order and bytes on
    // either side of the escape cannot be merged into one sequence.
    encodeUTF8: function (codepoint, raw, encodingState) {
      var pending = encodingState ? flush(encodingState) : '';
      /* istanbul ignore if */
      if (codepoint > 0x10ffff)
        return pending + fallbackFuncUnicode(codepoint, raw);
      if (0xd800 <= codepoint && codepoint <= 0xdfff)
        return pending + fallbackFuncUnicode(codepoint, raw);
      return pending + fromCodePoint(codepoint);
    }
  };
}
invalidUTF8Escape: 'UTF-8 escape \'%1\' is not allowed in the current encoding mode' }; // ### Abstract Syntax Tree @@ -891,6 +1006,7 @@ , beginLineStart = lineStart , stringStart = index , string = encodingMode.discardStrings ? null : '' + , encodingState = encodingMode.initState ? encodingMode.initState() : null , charCode; for (;;) { @@ -899,22 +1015,21 @@ // EOF or `\n` terminates a string literal. If we haven't found the // ending delimiter by now, raise an exception. if (index > length || isLineTerminator(charCode)) { - string += input.slice(stringStart, index - 1); raise(null, errors.unfinishedString, input.slice(tokenStart, index - 1)); } if (92 === charCode) { // backslash - if (!encodingMode.discardStrings) { + if (string !== null) { var beforeEscape = input.slice(stringStart, index - 1); string += encodingMode.fixup(beforeEscape); } - var escapeValue = readEscapeSequence(); - if (!encodingMode.discardStrings) + var escapeValue = readEscapeSequence(encodingState); + if (string !== null) string += escapeValue; stringStart = index; } } - if (!encodingMode.discardStrings) { - string += encodingMode.encodeByte(null); + if (string !== null) { + string += encodingMode.encodeByte(null, null, encodingState); string += encodingMode.fixup(input.slice(stringStart, index - 1)); } @@ -1131,7 +1246,7 @@ }; } - function readUnicodeEscapeSequence() { + function readUnicodeEscapeSequence(encodingState) { var sequenceStart = index++; if (input.charAt(index++) !== '{') @@ -1160,14 +1275,14 @@ var frag = '\\' + input.slice(sequenceStart, index); if (codepoint > 0x10ffff) { - raise(null, errors.tooLargeCodepoint, frag); + raise(null, errors.tooLargeCodepoint, frag, encodingState); } - return encodingMode.encodeUTF8(codepoint, frag); + return encodingMode.encodeUTF8(codepoint, frag, encodingState); } // Translate escape sequences to the actual characters. 
- function readEscapeSequence() { + function readEscapeSequence(encodingState) { var sequenceStart = index; switch (input.charAt(index)) { // Lua allow the following escape sequences. @@ -1195,9 +1310,9 @@ var frag = input.slice(sequenceStart, index); var ddd = parseInt(frag, 10); if (ddd > 255) { - raise(null, errors.decimalEscapeTooLarge, '\\' + ddd); + raise(null, errors.decimalEscapeTooLarge, '\\' + ddd, encodingState); } - return encodingMode.encodeByte(ddd, '\\' + frag); + return encodingMode.encodeByte(ddd, '\\' + frag, encodingState); case 'z': if (features.skipWhitespaceEscape) { @@ -1213,7 +1328,11 @@ if (isHexDigit(input.charCodeAt(index + 1)) && isHexDigit(input.charCodeAt(index + 2))) { index += 3; - return encodingMode.encodeByte(parseInt(input.slice(sequenceStart + 1, index), 16), '\\' + input.slice(sequenceStart, index)); + return encodingMode.encodeByte( + parseInt(input.slice(sequenceStart + 1, index), 16), + '\\' + input.slice(sequenceStart, index), + encodingState + ); } raise(null, errors.hexadecimalDigitExpected, '\\' + input.slice(sequenceStart, index + 2)); } @@ -1221,7 +1340,7 @@ case 'u': if (features.unicodeEscapes) - return readUnicodeEscapeSequence(); + return readUnicodeEscapeSequence(encodingState); break; case '\\': case '"': case "'": @@ -1287,7 +1406,7 @@ // Read a multiline string by calculating the depth of `=` characters and // then appending until an equal depth is found. 
- function readLongString(isComment) { + function readLongString(isComment, encodingState) { var level = 0 , content = '' , terminator = false diff --git a/test/runner.js b/test/runner.js index f9c162ec..0174e75e 100644 --- a/test/runner.js +++ b/test/runner.js @@ -630,6 +630,23 @@ { mode: 'pseudo-latin1', src: '\x80', name: '\x80' }, { mode: 'pseudo-latin1', src: 'a', name: 'a' }, + { mode: 'utf-8-lossy', src: 'a', name: 'a' }, + { mode: 'utf-8-lossy', src: '\x80', name: '\x80' }, + { mode: 'utf-8-lossy', src: '\uf780', name: '\uf780' }, + { mode: 'utf-8-lossy', src: '"a"', value: 'a' }, + { mode: 'utf-8-lossy', src: '"\u0080"', value: '\u0080' }, + { mode: 'utf-8-lossy', src: '"\ud83d\udca9"', value: '\ud83d\udca9' }, + { mode: 'utf-8-lossy', src: '"\uf780"', value: '\uf780' }, + { mode: 'utf-8-lossy', src: '"\\xf4\\x8f\\xbf\\xbf"', value: '\udbff\udfff' }, + { mode: 'utf-8-lossy', src: '"\\xef\\xbf\\xbf"', value: '\uffff' }, + { mode: 'utf-8-lossy', src: '"\\xdf\\xbf"', value: '\u07ff' }, + { mode: 'utf-8-lossy', src: '"\\x61"', value: 'a' }, + { mode: 'utf-8-lossy', src: '"\\u{80}"', value: '\u0080' }, + { mode: 'utf-8-lossy', src: '"\\u{d800}"', value: '\ufffd' }, + { mode: 'utf-8-lossy', src: '"\\x80"', value: '\ufffd' }, + { mode: 'utf-8-lossy', src: '"\\128"', value: '\ufffd' }, + { mode: 'utf-8-lossy', src: '"\ud800"', value: '\ufffd' }, + { mode: 'none', src: '"\\u{80}"', value: null }, { mode: 'none', src: '"\\u{d800}"', value: null }, { mode: 'none', src: '"\\x80"', value: null },