Skip to content

Commit

Permalink
Add 'utf-8-lossy' encoding mode
Browse files Browse the repository at this point in the history
Also expand README with guidance on when to use which encoding mode.
  • Loading branch information
fstirlitz committed Apr 28, 2021
1 parent 6f8a7ea commit d2f3480
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 18 deletions.
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,15 +141,22 @@ Possible values are as follows:

- `'none'`: Source code characters all pass through as-is and string
literals are not interpreted at all; the string literal nodes contain
the value `null`. This is the default mode.
the value `null`. This is the default mode. Use it when you are not
interested in the concrete values of Lua string literals.
- `'x-user-defined'`: Source code has been decoded with the WHATWG
`x-user-defined` encoding; escapes of bytes in the range \[0x80, 0xff]
are mapped to the Unicode range \[U+F780, U+F7FF].
are mapped to the Unicode range \[U+F780, U+F7FF]. Use this mode to parse
files decoded with the WHATWG [`TextDecoder`] API.
- `'pseudo-latin1'`: Source code has been decoded with the IANA
`iso-8859-1` encoding; escapes of bytes in the range \[0x80, 0xff]
are mapped to the Unicode range \[U+0080, U+00FF]. Note that this is
**not** the same as how WHATWG standards define the `iso-8859-1`
encoding, which is to say, as a synonym of `windows-1252`.
encoding, which is to say, as a synonym of `windows-1252`. Use this mode
to interoperate with an API like [`readAsBinaryString`], [`atob`] or
[`Buffer`].
- `'utf-8-lossy'`: Source code has been decoded as UTF-8. Use this mode
when you expect most string literals to be UTF-8, but do not require
exact faithful representation in all circumstances.

### Custom AST

Expand Down Expand Up @@ -307,3 +314,7 @@ MIT
[lua]: https://www.lua.org
[esprima]: http://esprima.org
[wtf8]: https://simonsapin.github.io/wtf-8/
[`TextDecoder`]: https://encoding.spec.whatwg.org/#interface-textdecoder
[`readAsBinaryString`]: https://w3c.github.io/FileAPI/#dfn-readAsBinaryString
[`atob`]: https://html.spec.whatwg.org/multipage/webappapis.html#atob
[`Buffer`]: https://nodejs.org/api/buffer.html
149 changes: 134 additions & 15 deletions luaparse.js
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,21 @@
}
}

// Fallback implementation of `String.fromCodePoint` for a single code point.
//
// Returns a one-code-unit string for code points up to U+FFFF and a UTF-16
// surrogate pair for code points in the range [U+10000, U+10FFFF].
// Throws a RangeError for values above U+10FFFF.
var fromCodePoint = /* istanbul ignore next */ function (codepoint) {
  if (codepoint > 0x10ffff)
    throw new RangeError(codepoint);
  if (codepoint > 0xffff) {
    // UTF-16 encodes the 20-bit offset from U+10000, not the code point
    // itself: the top ten bits go into the high surrogate, the bottom ten
    // into the low surrogate.
    var offset = codepoint - 0x10000;
    return String.fromCharCode(
      0xd800 | (offset >> 10),
      0xdc00 | (offset & 0x3ff)
    );
  }
  return String.fromCharCode(codepoint);
};

// Prefer the native `String.fromCodePoint` when the runtime provides it;
// the function assigned to `fromCodePoint` above then only serves as a
// fallback for runtimes without it.
/* istanbul ignore else */
if (String.fromCodePoint)
fromCodePoint = String.fromCodePoint;

function toHex(num, digits) {
var result = num.toString(16);
while (result.length < digits)
Expand All @@ -135,6 +150,97 @@
};
}

// Builds an encoding mode that interprets string literal contents as UTF-8.
//
// Byte escapes are collected in a per-literal state object and decoded as
// UTF-8 when the buffer is flushed; anything that is not well-formed UTF-8
// is handed to a fallback function whose return value is inserted instead.
//
// Parameters:
//   fallbackFuncByte(value, raw)        - called once for every byte that
//     cannot be decoded; `raw` is the concatenated source text of all byte
//     escapes that were in the buffer at the time of the flush.
//   fallbackFuncUnicode(codepoint, raw) - called for surrogate code points
//     (and code points above U+10FFFF); `raw` is the source text of the
//     escape, or null when the code unit came from literal source text.
//
// Returns an object with the methods `fixup`, `initState`, `encodeByte` and
// `encodeUTF8`.
function makeUTF8Mode(fallbackFuncByte, fallbackFuncUnicode) {
  // Decodes and empties the byte buffer held in `encodingState`, returning
  // the decoded string. Each undecodable byte is replaced individually and
  // decoding resumes at the following byte.
  function flushBytes(encodingState) {
    var buffer = encodingState.byteBuffer;
    var rawBuffer = encodingState.rawBuffer;
    var result = '';
    var off = 0;

    encodingState.byteBuffer = [];
    encodingState.rawBuffer = '';

    while (off < buffer.length) {
      var lead = buffer[off];
      var l = 1;            // expected sequence length in bytes
      var m = 0;            // smallest code point allowed for that length
      var codepoint = null; // stays null for an invalid lead byte

      if (lead < 0x80) {
        codepoint = lead;
      } else if (0xc2 <= lead && lead <= 0xdf) {
        // 0xc0 and 0xc1 could only start overlong forms, so they are
        // treated as invalid lead bytes.
        codepoint = lead & 0x1f;
        l = 2;
        m = 0x80;
      } else if (0xe0 <= lead && lead <= 0xef) {
        // 0xe0 is a legitimate lead byte (U+0800..U+0FFF); its overlong
        // forms are rejected by the minimum check below.
        codepoint = lead & 0x0f;
        l = 3;
        m = 0x800;
      } else if (0xf0 <= lead && lead <= 0xf4) {
        codepoint = lead & 0x07;
        l = 4;
        m = 0x10000;
      }

      if (codepoint !== null) {
        if (off + l > buffer.length) {
          // Sequence is truncated by the end of the buffer.
          codepoint = null;
        } else {
          for (var j = 1; j < l; ++j) {
            if ((buffer[off + j] & 0xc0) !== 0x80) {
              codepoint = null;
              break;
            }
            codepoint <<= 6;
            codepoint |= buffer[off + j] & 0x3f;
          }
        }
      }

      if (codepoint !== null) {
        // Reject overlong encodings, values beyond Unicode, and surrogates;
        // the latter keeps byte escapes consistent with `fixup` and
        // `encodeUTF8`, which never let a lone surrogate through.
        if (codepoint < m || codepoint > 0x10ffff ||
            (0xd800 <= codepoint && codepoint <= 0xdfff))
          codepoint = null;
      }

      if (codepoint === null) {
        result += fallbackFuncByte(lead, rawBuffer);
        off++;
      } else {
        result += fromCodePoint(codepoint);
        off += l;
      }
    }

    return result;
  }

  return {
    // Replaces every unpaired surrogate code unit in literal source text
    // `s` with the Unicode fallback; well-formed surrogate pairs are kept.
    // NOTE(review): this method receives no encoding state, so it cannot
    // flush pending byte escapes itself; callers appending literal text
    // after byte escapes presumably need to flush first -- confirm.
    fixup: function (s) {
      return s.replace(/[\ud800-\udbff][\udc00-\udfff]|[\ud800-\udfff]/g, function (m) {
        if (m.length === 2)
          return m;
        return fallbackFuncUnicode(m.charCodeAt(0), null);
      });
    },
    // Creates the per-literal state: the pending bytes and the source text
    // of the escapes that produced them.
    initState: function () {
      return {
        byteBuffer: [],
        rawBuffer: ''
      };
    },
    // Buffers the byte `value` (with escape source text `raw`) and returns
    // an empty string. Passing `value === null` flushes the buffer and
    // returns the decoded text.
    encodeByte: function (value, raw, encodingState) {
      if (value === null)
        return flushBytes(encodingState);

      encodingState.byteBuffer.push(value);
      encodingState.rawBuffer += raw;
      return '';
    },
    // Returns the string for a `\u{...}` escape with value `codepoint`.
    // Any bytes still pending in `encodingState` are decoded and emitted
    // first, so that output stays in source order; the state argument is
    // optional and nothing is flushed when it is omitted.
    encodeUTF8: function (codepoint, raw, encodingState) {
      var pending = encodingState ? flushBytes(encodingState) : '';
      /* istanbul ignore if */
      if (codepoint > 0x10ffff)
        return pending + fallbackFuncUnicode(codepoint, raw);
      if (0xd800 <= codepoint && codepoint <= 0xdfff)
        return pending + fallbackFuncUnicode(codepoint, raw);
      return pending + fromCodePoint(codepoint);
    }
  };
}

var encodingModes = {
// `pseudo-latin1` encoding mode: assume the input was decoded with the latin1 encoding
// WARNING: latin1 does **NOT** mean cp1252 here like in the bone-headed WHATWG standard;
Expand Down Expand Up @@ -166,6 +272,13 @@
}
},

// `utf-8-lossy` encoding mode: replace non-UTF-8 escapes with U+FFFD
'utf-8-lossy': makeUTF8Mode(function (value, raw) {
return '\ufffd';
}, function (codepoint, raw) {
return '\ufffd';
}),

// `none` encoding mode: disregard interpretation of string literals, leave identifiers as-is
'none': {
discardStrings: true,
Expand Down Expand Up @@ -218,6 +331,8 @@
, gotoJumpInLocalScope: '<goto %1> jumps into the scope of local \'%2\''
, cannotUseVararg: 'cannot use \'...\' outside a vararg function near \'%1\''
, invalidCodeUnit: 'code unit U+%1 is not allowed in the current encoding mode'
, invalidByteEscape: 'byte escape sequence \'%1\' is not allowed in the current encoding mode'
, invalidUTF8Escape: 'UTF-8 escape \'%1\' is not allowed in the current encoding mode'
};

// ### Abstract Syntax Tree
Expand Down Expand Up @@ -891,6 +1006,7 @@
, beginLineStart = lineStart
, stringStart = index
, string = encodingMode.discardStrings ? null : ''
, encodingState = encodingMode.initState ? encodingMode.initState() : null
, charCode;

for (;;) {
Expand All @@ -899,22 +1015,21 @@
// EOF or `\n` terminates a string literal. If we haven't found the
// ending delimiter by now, raise an exception.
if (index > length || isLineTerminator(charCode)) {
string += input.slice(stringStart, index - 1);
raise(null, errors.unfinishedString, input.slice(tokenStart, index - 1));
}
if (92 === charCode) { // backslash
if (!encodingMode.discardStrings) {
if (string !== null) {
var beforeEscape = input.slice(stringStart, index - 1);
string += encodingMode.fixup(beforeEscape);
}
var escapeValue = readEscapeSequence();
if (!encodingMode.discardStrings)
var escapeValue = readEscapeSequence(encodingState);
if (string !== null)
string += escapeValue;
stringStart = index;
}
}
if (!encodingMode.discardStrings) {
string += encodingMode.encodeByte(null);
if (string !== null) {
string += encodingMode.encodeByte(null, null, encodingState);
string += encodingMode.fixup(input.slice(stringStart, index - 1));
}

Expand Down Expand Up @@ -1131,7 +1246,7 @@
};
}

function readUnicodeEscapeSequence() {
function readUnicodeEscapeSequence(encodingState) {
var sequenceStart = index++;

if (input.charAt(index++) !== '{')
Expand Down Expand Up @@ -1160,14 +1275,14 @@
var frag = '\\' + input.slice(sequenceStart, index);

if (codepoint > 0x10ffff) {
raise(null, errors.tooLargeCodepoint, frag);
raise(null, errors.tooLargeCodepoint, frag, encodingState);
}

return encodingMode.encodeUTF8(codepoint, frag);
return encodingMode.encodeUTF8(codepoint, frag, encodingState);
}

// Translate escape sequences to the actual characters.
function readEscapeSequence() {
function readEscapeSequence(encodingState) {
var sequenceStart = index;
switch (input.charAt(index)) {
// Lua allow the following escape sequences.
Expand Down Expand Up @@ -1195,9 +1310,9 @@
var frag = input.slice(sequenceStart, index);
var ddd = parseInt(frag, 10);
if (ddd > 255) {
raise(null, errors.decimalEscapeTooLarge, '\\' + ddd);
raise(null, errors.decimalEscapeTooLarge, '\\' + ddd, encodingState);
}
return encodingMode.encodeByte(ddd, '\\' + frag);
return encodingMode.encodeByte(ddd, '\\' + frag, encodingState);

case 'z':
if (features.skipWhitespaceEscape) {
Expand All @@ -1213,15 +1328,19 @@
if (isHexDigit(input.charCodeAt(index + 1)) &&
isHexDigit(input.charCodeAt(index + 2))) {
index += 3;
return encodingMode.encodeByte(parseInt(input.slice(sequenceStart + 1, index), 16), '\\' + input.slice(sequenceStart, index));
return encodingMode.encodeByte(
parseInt(input.slice(sequenceStart + 1, index), 16),
'\\' + input.slice(sequenceStart, index),
encodingState
);
}
raise(null, errors.hexadecimalDigitExpected, '\\' + input.slice(sequenceStart, index + 2));
}
break;

case 'u':
if (features.unicodeEscapes)
return readUnicodeEscapeSequence();
return readUnicodeEscapeSequence(encodingState);
break;

case '\\': case '"': case "'":
Expand Down Expand Up @@ -1287,7 +1406,7 @@
// Read a multiline string by calculating the depth of `=` characters and
// then appending until an equal depth is found.

function readLongString(isComment) {
function readLongString(isComment, encodingState) {
var level = 0
, content = ''
, terminator = false
Expand Down
5 changes: 5 additions & 0 deletions test/runner.js
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,11 @@
{ mode: 'pseudo-latin1', src: '\x80', name: '\x80' },
{ mode: 'pseudo-latin1', src: 'a', name: 'a' },

{ mode: 'utf-8-lossy', src: '"\\u{d800}"', value: '\ufffd' },
{ mode: 'utf-8-lossy', src: '"\\x80"', value: '\ufffd' },
{ mode: 'utf-8-lossy', src: '"\\128"', value: '\ufffd' },
{ mode: 'utf-8-lossy', src: '"\ud800"', value: '\ufffd' },

{ mode: 'none', src: '"\\u{80}"', value: null },
{ mode: 'none', src: '"\\u{d800}"', value: null },
{ mode: 'none', src: '"\\x80"', value: null },
Expand Down

0 comments on commit d2f3480

Please sign in to comment.