Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add 'utf-8-lossy' encoding mode #100

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,15 +141,22 @@ Possible values are as follows:

- `'none'`: Source code characters all pass through as-is and string
literals are not interpreted at all; the string literal nodes contain
the value `null`. This is the default mode.
the value `null`. This is the default mode. Use it when you are not
interested in the concrete values of Lua string literals.
- `'x-user-defined'`: Source code has been decoded with the WHATWG
`x-user-defined` encoding; escapes of bytes in the range \[0x80, 0xff]
are mapped to the Unicode range \[U+F780, U+F7FF].
are mapped to the Unicode range \[U+F780, U+F7FF]. Use this mode to parse
files decoded with the WHATWG [`TextDecoder`] API.
- `'pseudo-latin1'`: Source code has been decoded with the IANA
`iso-8859-1` encoding; escapes of bytes in the range \[0x80, 0xff]
are mapped to the Unicode range \[U+0080, U+00FF]. Note that this is
**not** the same as how WHATWG standards define the `iso-8859-1`
encoding, which is to say, as a synonym of `windows-1252`.
encoding, which is to say, as a synonym of `windows-1252`. Use this mode
to interoperate with an API like [`readAsBinaryString`], [`atob`] or
[`Buffer`].
- `'utf-8-lossy'`: Source code has been decoded as UTF-8. Use this mode
when you expect most string literals to be UTF-8, but do not require
exact faithful representation in all circumstances.

### Custom AST

Expand Down Expand Up @@ -307,3 +314,7 @@ MIT
[lua]: https://www.lua.org
[esprima]: http://esprima.org
[wtf8]: https://simonsapin.github.io/wtf-8/
[`TextDecoder`]: https://encoding.spec.whatwg.org/#interface-textdecoder
[`readAsBinaryString`]: https://w3c.github.io/FileAPI/#dfn-readAsBinaryString
[`atob`]: https://html.spec.whatwg.org/multipage/webappapis.html#atob
[`Buffer`]: https://nodejs.org/api/buffer.html
149 changes: 134 additions & 15 deletions luaparse.js
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,21 @@
}
}

// Convert a single Unicode code point into a JavaScript (UTF-16) string.
//
// This is the fallback used when the engine has no native
// `String.fromCodePoint`. Code points above U+FFFF are emitted as a
// surrogate pair; anything above U+10FFFF raises a RangeError.
var fromCodePoint = /* istanbul ignore next */ function (codepoint) {
  if (codepoint > 0x10ffff)
    throw new RangeError(codepoint);
  if (codepoint > 0xffff) {
    // A surrogate pair encodes the 20-bit offset from U+10000, not the raw
    // code point; without the subtraction the high surrogate is wrong.
    var offset = codepoint - 0x10000;
    return String.fromCharCode(
      0xd800 | (offset >> 10),
      0xdc00 | (offset & 0x3ff)
    );
  }
  return String.fromCharCode(codepoint);
};

/* istanbul ignore else */
if (String.fromCodePoint) {
  // Prefer the engine's native implementation whenever it is available;
  // the hand-written version above is only kept as a fallback.
  fromCodePoint = String.fromCodePoint;
}

function toHex(num, digits) {
var result = num.toString(16);
while (result.length < digits)
Expand All @@ -135,6 +150,97 @@
};
}

// Build an encoding mode that interprets string escapes as UTF-8.
//
// Byte escapes (`\xXX`, `\ddd`) are buffered in the per-literal state and
// decoded together as UTF-8 once the buffer is flushed; `\u{...}` escapes
// are converted directly to the corresponding code point.
//
// - `fallbackFuncByte(value, raw)` is called for every byte that is not part
//   of a well-formed UTF-8 sequence; `raw` is the source text of all the
//   byte escapes that were pending when the flush happened.
// - `fallbackFuncUnicode(codepoint, raw)` is called for code points that
//   cannot be represented (surrogates, values above U+10FFFF); `raw` is
//   `null` when the offending code unit came from literal source text.
//
// Both callbacks return the replacement string to emit.
function makeUTF8Mode(fallbackFuncByte, fallbackFuncUnicode) {
  // Decode and drain every byte pending in `encodingState`, returning the
  // resulting string. Ill-formed bytes are replaced one at a time through
  // `fallbackFuncByte`.
  function flushBytes(encodingState) {
    var buffer = encodingState.byteBuffer;
    var rawBuffer = encodingState.rawBuffer;
    var result = '';
    var off = 0;

    encodingState.byteBuffer = [];
    encodingState.rawBuffer = '';

    while (off < buffer.length) {
      var lead = buffer[off];
      var l = 1;            // length of the sequence announced by the lead byte
      var m = 0;            // smallest code point that legitimately needs `l` bytes
      var codepoint = null; // stays null when the sequence is ill-formed

      if (lead < 0x80) {
        codepoint = lead;
      } else if (0xc2 <= lead && lead <= 0xdf) {
        // 0xc0 and 0xc1 could only start overlong forms, so they are
        // never accepted as lead bytes.
        codepoint = lead & 0x1f;
        l = 2;
        m = 0x80;
      } else if (0xe0 <= lead && lead <= 0xef) {
        // 0xe0 is a valid lead byte for U+0800..U+0FFF; its overlong forms
        // are rejected by the `codepoint < m` check below.
        codepoint = lead & 0x0f;
        l = 3;
        m = 0x800;
      } else if (0xf0 <= lead && lead <= 0xf4) {
        codepoint = lead & 0x07;
        l = 4;
        m = 0x10000;
      }

      if (codepoint !== null) {
        if (off + l > buffer.length) {
          // Truncated sequence at the end of the buffer.
          codepoint = null;
        } else {
          for (var j = 1; j < l; ++j) {
            if ((buffer[off + j] & 0xc0) !== 0x80) {
              codepoint = null;
              break;
            }
            codepoint <<= 6;
            codepoint |= buffer[off + j] & 0x3f;
          }
        }
      }

      if (codepoint !== null) {
        if (codepoint > 0x10ffff || codepoint < m) {
          // Out of Unicode range, or an overlong encoding.
          codepoint = null;
        } else if (0xd800 <= codepoint && codepoint <= 0xdfff) {
          // UTF-8 must not encode surrogates; keep this consistent with
          // `encodeUTF8` and `fixup`, which reject them as well.
          codepoint = null;
        }
      }

      if (codepoint === null) {
        // Replace only the offending byte and resynchronise on the next one.
        result += fallbackFuncByte(lead, rawBuffer);
        off++;
      } else {
        result += fromCodePoint(codepoint);
        off += l;
      }
    }

    return result;
  }

  return {
    // Sanitise literal (non-escape) source text: well-formed surrogate pairs
    // pass through unchanged, lone surrogates go through the Unicode fallback.
    fixup: function (s) {
      return s.replace(/[\ud800-\udbff][\udc00-\udfff]|[\ud800-\udfff]/g, function (m) {
        if (m.length === 2)
          return m;
        return fallbackFuncUnicode(m.charCodeAt(0), null);
      });
    },

    // Create the per-string-literal state holding the pending byte escapes.
    initState: function () {
      return {
        byteBuffer: [],
        rawBuffer: ''
      };
    },

    // Record one byte escape. Returns '' while buffering; calling it with
    // `value === null` flushes the buffer and returns the decoded text.
    encodeByte: function (value, raw, encodingState) {
      if (value === null)
        return flushBytes(encodingState);

      encodingState.byteBuffer.push(value);
      encodingState.rawBuffer += raw;
      return '';
    },

    // Convert a `\u{...}` escape. Any byte escapes still pending are
    // flushed first so that the output keeps source order.
    encodeUTF8: function (codepoint, raw, encodingState) {
      var pending = '';
      if (encodingState && encodingState.byteBuffer.length > 0)
        pending = flushBytes(encodingState);

      /* istanbul ignore if */
      if (codepoint > 0x10ffff)
        return pending + fallbackFuncUnicode(codepoint, raw);
      if (0xd800 <= codepoint && codepoint <= 0xdfff)
        return pending + fallbackFuncUnicode(codepoint, raw);
      return pending + fromCodePoint(codepoint);
    }
  };
}

var encodingModes = {
// `pseudo-latin1` encoding mode: assume the input was decoded with the latin1 encoding
// WARNING: latin1 does **NOT** mean cp1252 here like in the bone-headed WHATWG standard;
Expand Down Expand Up @@ -166,6 +272,13 @@
}
},

// `utf-8-lossy` encoding mode: replace non-UTF-8 escapes with U+FFFD
'utf-8-lossy': makeUTF8Mode(function (value, raw) {
return '\ufffd';
}, function (codepoint, raw) {
return '\ufffd';
}),

// `none` encoding mode: disregard interpretation of string literals, leave identifiers as-is
'none': {
discardStrings: true,
Expand Down Expand Up @@ -218,6 +331,8 @@
, gotoJumpInLocalScope: '<goto %1> jumps into the scope of local \'%2\''
, cannotUseVararg: 'cannot use \'...\' outside a vararg function near \'%1\''
, invalidCodeUnit: 'code unit U+%1 is not allowed in the current encoding mode'
, invalidByteEscape: 'byte escape sequence \'%1\' is not allowed in the current encoding mode'
, invalidUTF8Escape: 'UTF-8 escape \'%1\' is not allowed in the current encoding mode'
};

// ### Abstract Syntax Tree
Expand Down Expand Up @@ -891,6 +1006,7 @@
, beginLineStart = lineStart
, stringStart = index
, string = encodingMode.discardStrings ? null : ''
, encodingState = encodingMode.initState ? encodingMode.initState() : null
, charCode;

for (;;) {
Expand All @@ -899,22 +1015,21 @@
// EOF or `\n` terminates a string literal. If we haven't found the
// ending delimiter by now, raise an exception.
if (index > length || isLineTerminator(charCode)) {
string += input.slice(stringStart, index - 1);
raise(null, errors.unfinishedString, input.slice(tokenStart, index - 1));
}
if (92 === charCode) { // backslash
if (!encodingMode.discardStrings) {
if (string !== null) {
var beforeEscape = input.slice(stringStart, index - 1);
string += encodingMode.fixup(beforeEscape);
}
var escapeValue = readEscapeSequence();
if (!encodingMode.discardStrings)
var escapeValue = readEscapeSequence(encodingState);
if (string !== null)
string += escapeValue;
stringStart = index;
}
}
if (!encodingMode.discardStrings) {
string += encodingMode.encodeByte(null);
if (string !== null) {
string += encodingMode.encodeByte(null, null, encodingState);
string += encodingMode.fixup(input.slice(stringStart, index - 1));
}

Expand Down Expand Up @@ -1131,7 +1246,7 @@
};
}

function readUnicodeEscapeSequence() {
function readUnicodeEscapeSequence(encodingState) {
var sequenceStart = index++;

if (input.charAt(index++) !== '{')
Expand Down Expand Up @@ -1160,14 +1275,14 @@
var frag = '\\' + input.slice(sequenceStart, index);

if (codepoint > 0x10ffff) {
raise(null, errors.tooLargeCodepoint, frag);
raise(null, errors.tooLargeCodepoint, frag, encodingState);
}

return encodingMode.encodeUTF8(codepoint, frag);
return encodingMode.encodeUTF8(codepoint, frag, encodingState);
}

// Translate escape sequences to the actual characters.
function readEscapeSequence() {
function readEscapeSequence(encodingState) {
var sequenceStart = index;
switch (input.charAt(index)) {
// Lua allow the following escape sequences.
Expand Down Expand Up @@ -1195,9 +1310,9 @@
var frag = input.slice(sequenceStart, index);
var ddd = parseInt(frag, 10);
if (ddd > 255) {
raise(null, errors.decimalEscapeTooLarge, '\\' + ddd);
raise(null, errors.decimalEscapeTooLarge, '\\' + ddd, encodingState);
}
return encodingMode.encodeByte(ddd, '\\' + frag);
return encodingMode.encodeByte(ddd, '\\' + frag, encodingState);

case 'z':
if (features.skipWhitespaceEscape) {
Expand All @@ -1213,15 +1328,19 @@
if (isHexDigit(input.charCodeAt(index + 1)) &&
isHexDigit(input.charCodeAt(index + 2))) {
index += 3;
return encodingMode.encodeByte(parseInt(input.slice(sequenceStart + 1, index), 16), '\\' + input.slice(sequenceStart, index));
return encodingMode.encodeByte(
parseInt(input.slice(sequenceStart + 1, index), 16),
'\\' + input.slice(sequenceStart, index),
encodingState
);
}
raise(null, errors.hexadecimalDigitExpected, '\\' + input.slice(sequenceStart, index + 2));
}
break;

case 'u':
if (features.unicodeEscapes)
return readUnicodeEscapeSequence();
return readUnicodeEscapeSequence(encodingState);
break;

case '\\': case '"': case "'":
Expand Down Expand Up @@ -1287,7 +1406,7 @@
// Read a multiline string by calculating the depth of `=` characters and
// then appending until an equal depth is found.

function readLongString(isComment) {
function readLongString(isComment, encodingState) {
var level = 0
, content = ''
, terminator = false
Expand Down
17 changes: 17 additions & 0 deletions test/runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,23 @@
{ mode: 'pseudo-latin1', src: '\x80', name: '\x80' },
{ mode: 'pseudo-latin1', src: 'a', name: 'a' },

{ mode: 'utf-8-lossy', src: 'a', name: 'a' },
{ mode: 'utf-8-lossy', src: '\x80', name: '\x80' },
{ mode: 'utf-8-lossy', src: '\uf780', name: '\uf780' },
{ mode: 'utf-8-lossy', src: '"a"', value: 'a' },
{ mode: 'utf-8-lossy', src: '"\u0080"', value: '\u0080' },
{ mode: 'utf-8-lossy', src: '"\ud83d\udca9"', value: '\ud83d\udca9' },
{ mode: 'utf-8-lossy', src: '"\uf780"', value: '\uf780' },
{ mode: 'utf-8-lossy', src: '"\\xf4\\x8f\\xbf\\xbf"', value: '\udbff\udfff' },
{ mode: 'utf-8-lossy', src: '"\\xef\\xbf\\xbf"', value: '\uffff' },
{ mode: 'utf-8-lossy', src: '"\\xdf\\xbf"', value: '\u07ff' },
{ mode: 'utf-8-lossy', src: '"\\x61"', value: 'a' },
{ mode: 'utf-8-lossy', src: '"\\u{80}"', value: '\u0080' },
{ mode: 'utf-8-lossy', src: '"\\u{d800}"', value: '\ufffd' },
{ mode: 'utf-8-lossy', src: '"\\x80"', value: '\ufffd' },
{ mode: 'utf-8-lossy', src: '"\\128"', value: '\ufffd' },
{ mode: 'utf-8-lossy', src: '"\ud800"', value: '\ufffd' },

{ mode: 'none', src: '"\\u{80}"', value: null },
{ mode: 'none', src: '"\\u{d800}"', value: null },
{ mode: 'none', src: '"\\x80"', value: null },
Expand Down