diff --git a/.eslintrc b/.eslintrc index b7a87b93..f972bfce 100644 --- a/.eslintrc +++ b/.eslintrc @@ -9,8 +9,8 @@ "func-name-matching": 0, "id-length": [2, { "min": 1, "max": 25, "properties": "never" }], "indent": [2, 4], - "max-params": [2, 12], - "max-statements": [2, 45], + "max-params": [2, 14], + "max-statements": [2, 52], "no-continue": 1, "no-magic-numbers": 0, "no-restricted-syntax": [2, "BreakStatement", "DebuggerStatement", "ForInStatement", "LabeledStatement", "WithStatement"], diff --git a/README.md b/README.md index d8119666..7fbe063a 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,62 @@ var withDots = qs.parse('a.b=c', { allowDots: true }); assert.deepEqual(withDots, { a: { b: 'c' } }); ``` +If you have to deal with legacy browsers or services, there's +also support for decoding percent-encoded octets as iso-8859-1: + +```javascript +var oldCharset = qs.parse('a=%A7', { charset: 'iso-8859-1' }); +assert.deepEqual(oldCharset, { a: '§' }); +``` + +Some services add an initial `utf8=✓` value to forms so that old +Internet Explorer versions are more likely to submit the form as +utf-8. Additionally, the server can check the value against wrong +encodings of the checkmark character and detect that a query string +or `application/x-www-form-urlencoded` body was *not* sent as +utf-8, eg. if the form had an `accept-charset` parameter or the +containing page had a different character set. + +**qs** supports this mechanism via the `charsetSentinel` option. +If specified, the `utf8` parameter will be omitted from the +returned object. It will be used to switch to `iso-8859-1`/`utf-8` +mode depending on how the checkmark is encoded. + +**Important**: When you specify both the `charset` option and the +`charsetSentinel` option, the `charset` will be overridden when +the request contains a `utf8` parameter from which the actual +charset can be deduced. In that sense the `charset` will behave +as the default charset rather than the authoritative charset. + +```javascript +var detectedAsUtf8 = qs.parse('utf8=%E2%9C%93&a=%C3%B8', { + charset: 'iso-8859-1', + charsetSentinel: true +}); +assert.deepEqual(detectedAsUtf8, { a: 'ø' }); + +// Browsers encode the checkmark as ✓ when submitting as iso-8859-1: +var detectedAsIso8859_1 = qs.parse('utf8=%26%2310003%3B&a=%F8', { + charset: 'utf-8', + charsetSentinel: true +}); +assert.deepEqual(detectedAsIso8859_1, { a: 'ø' }); +``` + +If you want to decode the `&#...;` syntax to the actual character, +you can specify the `interpretNumericEntities` option as well: + +```javascript +var detectedAsIso8859_1 = qs.parse('a=%26%239786%3B', { + charset: 'iso-8859-1', + interpretNumericEntities: true +}); +assert.deepEqual(detectedAsIso8859_1, { a: '☺' }); +``` + +It also works when the charset has been detected in `charsetSentinel` +mode. + ### Parsing Arrays **qs** can also parse arrays using a similar `[]` notation: @@ -426,10 +482,40 @@ var nullsSkipped = qs.stringify({ a: 'b', c: null}, { skipNulls: true }); assert.equal(nullsSkipped, 'a=b'); ``` +If you're communicating with legacy systems, you can switch to `iso-8859-1` +using the `charset` option: + +```javascript +var iso = qs.stringify({ æ: 'æ' }, { charset: 'iso-8859-1' }); +assert.equal(iso, '%E6=%E6'); +``` + +Characters that don't exist in `iso-8859-1` will be converted to numeric +entities, similar to what browsers do: + +```javascript +var numeric = qs.stringify({ a: '☺' }, { charset: 'iso-8859-1' }); +assert.equal(numeric, 'a=%26%239786%3B'); +``` + +You can use the `charsetSentinel` option to announce the character by +including an `utf8=✓` parameter with the proper encoding if the checkmark, +similar to what Ruby on Rails and others do when submitting forms. + +```javascript +var sentinel = qs.stringify({ a: '☺' }, { charsetSentinel: true }); +assert.equal(sentinel, 'utf8=%E2%9C%93&a=%E2%98%BA'); + +var isoSentinel = qs.stringify({ a: 'æ' }, { charsetSentinel: true, charset: 'iso-8859-1' }); +assert.equal(isoSentinel, 'utf8=%26%2310003%3B&a=%E6'); +``` + ### Dealing with special character sets -By default the encoding and decoding of characters is done in `utf-8`. If you -wish to encode querystrings to a different character set (i.e. +By default the encoding and decoding of characters is done in `utf-8`, +and `iso-8859-1` support is also built in via the `charset` parameter. + +If you wish to encode querystrings to a different character set (i.e. [Shift JIS](https://en.wikipedia.org/wiki/Shift_JIS)) you can use the [`qs-iconv`](https://github.com/martinheidegger/qs-iconv) library: diff --git a/lib/parse.js b/lib/parse.js index 4abc1a70..582907ea 100644 --- a/lib/parse.js +++ b/lib/parse.js @@ -8,21 +8,64 @@ var defaults = { allowDots: false, allowPrototypes: false, arrayLimit: 20, + charset: 'utf-8', + charsetSentinel: false, decoder: utils.decode, delimiter: '&', depth: 5, + interpretNumericEntities: false, parameterLimit: 1000, plainObjects: false, strictNullHandling: false }; +var interpretNumericEntities = function (str) { + return str.replace(/&#(\d+);/g, function ($0, numberStr) { + return String.fromCharCode(parseInt(numberStr, 10)); + }); +}; + +// This is what browsers will submit when the ✓ character occurs in an +// application/x-www-form-urlencoded body and the encoding of the page containing +// the form is iso-8859-1, or when the submitted form has an accept-charset +// attribute of iso-8859-1. Presumably also with other charsets that do not contain +// the ✓ character, such as us-ascii. +var isoSentinel = 'utf8=%26%2310003%3B'; // encodeURIComponent('✓') + +// These are the percent-encoded utf-8 octets representing a checkmark, indicating +// that the request actually is utf-8 encoded. +var charsetSentinel = 'utf8=%E2%9C%93'; // encodeURIComponent('✓') + var parseValues = function parseQueryStringValues(str, options) { var obj = {}; var cleanStr = options.ignoreQueryPrefix ? str.replace(/^\?/, '') : str; var limit = options.parameterLimit === Infinity ? undefined : options.parameterLimit; var parts = cleanStr.split(options.delimiter, limit); + var charset = options.charset; + var skipIndex = -1; // Keep track of where the utf8 sentinel was found + var i; + + if (charset !== undefined && charset !== 'utf-8' && charset !== 'iso-8859-1') { + throw new Error('The charset option must be either utf-8, iso-8859-1, or undefined'); + } + if (options.charsetSentinel) { + for (i = 0; i < parts.length; ++i) { + if (parts[i].indexOf('utf8=') === 0) { + if (parts[i] === charsetSentinel) { + charset = 'utf-8'; + } else if (parts[i] === isoSentinel) { + charset = 'iso-8859-1'; + } + skipIndex = i; + i = parts.length; // The eslint settings do not allow break; + } + } + } - for (var i = 0; i < parts.length; ++i) { + for (i = 0; i < parts.length; ++i) { + if (i === skipIndex) { + continue; + } var part = parts[i]; var bracketEqualsPos = part.indexOf(']='); @@ -30,11 +73,15 @@ var parseValues = function parseQueryStringValues(str, options) { var key, val; if (pos === -1) { - key = options.decoder(part, defaults.decoder); + key = options.decoder(part, defaults.decoder, charset); val = options.strictNullHandling ? null : ''; } else { - key = options.decoder(part.slice(0, pos), defaults.decoder); - val = options.decoder(part.slice(pos + 1), defaults.decoder); + key = options.decoder(part.slice(0, pos), defaults.decoder, charset); + val = options.decoder(part.slice(pos + 1), defaults.decoder, charset); + } + + if (options.interpretNumericEntities && charset === 'iso-8859-1') { + val = interpretNumericEntities(val); } if (has.call(obj, key)) { obj[key] = [].concat(obj[key]).concat(val); diff --git a/lib/stringify.js b/lib/stringify.js index ab915ac4..5bf5e8b5 100644 --- a/lib/stringify.js +++ b/lib/stringify.js @@ -41,7 +41,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching allowDots, serializeDate, formatter, - encodeValuesOnly + encodeValuesOnly, + charset ) { var obj = object; if (typeof filter === 'function') { @@ -50,7 +51,7 @@ var stringify = function stringify( // eslint-disable-line func-name-matching obj = serializeDate(obj); } else if (obj === null) { if (strictNullHandling) { - return encoder && !encodeValuesOnly ? encoder(prefix, defaults.encoder) : prefix; + return encoder && !encodeValuesOnly ? encoder(prefix, defaults.encoder, charset) : prefix; } obj = ''; @@ -58,8 +59,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching if (typeof obj === 'string' || typeof obj === 'number' || typeof obj === 'boolean' || utils.isBuffer(obj)) { if (encoder) { - var keyValue = encodeValuesOnly ? prefix : encoder(prefix, defaults.encoder); - return [formatter(keyValue) + '=' + formatter(encoder(obj, defaults.encoder))]; + var keyValue = encodeValuesOnly ? prefix : encoder(prefix, defaults.encoder, charset); + return [formatter(keyValue) + '=' + formatter(encoder(obj, defaults.encoder, charset))]; } return [formatter(prefix) + '=' + formatter(String(obj))]; } @@ -98,7 +99,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching allowDots, serializeDate, formatter, - encodeValuesOnly + encodeValuesOnly, + charset )); } else { values = values.concat(stringify( @@ -113,7 +115,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching allowDots, serializeDate, formatter, - encodeValuesOnly + encodeValuesOnly, + charset )); } } @@ -138,6 +141,11 @@ module.exports = function (object, opts) { var allowDots = typeof options.allowDots === 'undefined' ? false : options.allowDots; var serializeDate = typeof options.serializeDate === 'function' ? options.serializeDate : defaults.serializeDate; var encodeValuesOnly = typeof options.encodeValuesOnly === 'boolean' ? options.encodeValuesOnly : defaults.encodeValuesOnly; + var charset = options.charset || 'utf-8'; + if (charset !== undefined && charset !== 'utf-8' && charset !== 'iso-8859-1') { + throw new Error('The charset option must be either utf-8, iso-8859-1, or undefined'); + } + if (typeof options.format === 'undefined') { options.format = formats['default']; } else if (!Object.prototype.hasOwnProperty.call(formats.formatters, options.format)) { @@ -199,12 +207,23 @@ module.exports = function (object, opts) { allowDots, serializeDate, formatter, - encodeValuesOnly + encodeValuesOnly, + charset )); } var joined = keys.join(delimiter); var prefix = options.addQueryPrefix === true ? '?' : ''; + if (options.charsetSentinel) { + if (charset === 'iso-8859-1') { + // encodeURIComponent('✓'), the "numeric entity" representation of a checkmark + prefix += 'utf8=%26%2310003%3B&'; + } else { + // encodeURIComponent('✓') + prefix += 'utf8=%E2%9C%93&'; + } + } + return joined.length > 0 ? prefix + joined : ''; }; diff --git a/lib/utils.js b/lib/utils.js index 8775a327..7390c9ef 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -107,15 +107,21 @@ var assign = function assignSingleSource(target, source) { }, target); }; -var decode = function (str) { +var decode = function (str, decoder, charset) { + var strWithoutPlus = str.replace(/\+/g, ' '); + if (charset === 'iso-8859-1') { + // unescape never throws, no try...catch needed: + return strWithoutPlus.replace(/%[0-9a-f]{2}/gi, unescape); + } + // utf-8 try { - return decodeURIComponent(str.replace(/\+/g, ' ')); + return decodeURIComponent(strWithoutPlus); } catch (e) { - return str; + return strWithoutPlus; } }; -var encode = function encode(str) { +var encode = function encode(str, defaultEncoder, charset) { // This code was originally written by Brian White (mscdex) for the io.js core querystring library. // It has been adapted here for stricter adherence to RFC 3986 if (str.length === 0) { @@ -124,6 +130,12 @@ var encode = function encode(str) { var string = typeof str === 'string' ? str : String(str); + if (charset === 'iso-8859-1') { + return escape(string).replace(/%u[0-9a-f]{4}/gi, function ($0) { + return '%26%23' + parseInt($0.slice(2), 16) + '%3B'; + }); + } + var out = ''; for (var i = 0; i < string.length; ++i) { var c = string.charCodeAt(i); diff --git a/test/parse.js b/test/parse.js index c6d0fbfe..f0ce23b5 100644 --- a/test/parse.js +++ b/test/parse.js @@ -577,5 +577,72 @@ test('parse()', function (t) { st.end(); }); + t.test('throws if an invalid charset is specified', function (st) { + st['throws'](function () { + qs.parse('a=b', { charset: 'foobar' }); + }, new TypeError('The charset option must be either utf-8, iso-8859-1, or undefined')); + st.end(); + }); + + t.test('parses an iso-8859-1 string if asked to', function (st) { + st.deepEqual(qs.parse('%A2=%BD', { charset: 'iso-8859-1' }), { '¢': '½' }); + st.end(); + }); + + var urlEncodedCheckmarkInUtf8 = '%E2%9C%93'; + var urlEncodedOSlashInUtf8 = '%C3%B8'; + var urlEncodedNumCheckmark = '%26%2310003%3B'; + var urlEncodedNumSmiley = '%26%239786%3B'; + + t.test('prefers an utf-8 charset specified by the utf8 sentinel to a default charset of iso-8859-1', function (st) { + st.deepEqual(qs.parse('utf8=' + urlEncodedCheckmarkInUtf8 + '&' + urlEncodedOSlashInUtf8 + '=' + urlEncodedOSlashInUtf8, { charsetSentinel: true, charset: 'iso-8859-1' }), { ø: 'ø' }); + st.end(); + }); + + t.test('prefers an iso-8859-1 charset specified by the utf8 sentinel to a default charset of utf-8', function (st) { + st.deepEqual(qs.parse('utf8=' + urlEncodedNumCheckmark + '&' + urlEncodedOSlashInUtf8 + '=' + urlEncodedOSlashInUtf8, { charsetSentinel: true, charset: 'utf-8' }), { 'ø': 'ø' }); + st.end(); + }); + + t.test('does not require the utf8 sentinel to be defined before the parameters whose decoding it affects', function (st) { + st.deepEqual(qs.parse('a=' + urlEncodedOSlashInUtf8 + '&utf8=' + urlEncodedNumCheckmark, { charsetSentinel: true, charset: 'utf-8' }), { a: 'ø' }); + st.end(); + }); + + t.test('should ignore an utf8 sentinel with an unknown value', function (st) { + st.deepEqual(qs.parse('utf8=foo&' + urlEncodedOSlashInUtf8 + '=' + urlEncodedOSlashInUtf8, { charsetSentinel: true, charset: 'utf-8' }), { ø: 'ø' }); + st.end(); + }); + + t.test('uses the utf8 sentinel to switch to utf-8 when no default charset is given', function (st) { + st.deepEqual(qs.parse('utf8=' + urlEncodedCheckmarkInUtf8 + '&' + urlEncodedOSlashInUtf8 + '=' + urlEncodedOSlashInUtf8, { charsetSentinel: true }), { ø: 'ø' }); + st.end(); + }); + + t.test('uses the utf8 sentinel to switch to iso-8859-1 when no default charset is given', function (st) { + st.deepEqual(qs.parse('utf8=' + urlEncodedNumCheckmark + '&' + urlEncodedOSlashInUtf8 + '=' + urlEncodedOSlashInUtf8, { charsetSentinel: true }), { 'ø': 'ø' }); + st.end(); + }); + + t.test('interprets numeric entities in iso-8859-1 when the interpretNumericEntities option is given', function (st) { + st.deepEqual(qs.parse('foo=' + urlEncodedNumSmiley, { charset: 'iso-8859-1', interpretNumericEntities: true }), { foo: '☺' }); + st.end(); + }); + + t.test('does not interpret numeric entities in iso-8859-1 when the interpretNumericEntities option is not given', function (st) { + st.deepEqual(qs.parse('foo=' + urlEncodedNumSmiley, { charset: 'iso-8859-1' }), { foo: '☺' }); + st.end(); + }); + + t.test('does not interpret numeric entities when the charset is utf-8, even when the interpretNumericEntities option is given', function (st) { + st.deepEqual(qs.parse('foo=' + urlEncodedNumSmiley, { charset: 'utf-8', interpretNumericEntities: true }), { foo: '☺' }); + st.end(); + }); + + t.test('does not interpret %uXXXX syntax in iso-8859-1 mode', function (st) { + st.deepEqual(qs.parse('%u263A=%u263A', { charset: 'iso-8859-1' }), { '%u263A': '%u263A' }); + st.end(); + }); + t.end(); }); diff --git a/test/stringify.js b/test/stringify.js index 165ac621..c54c0df3 100644 --- a/test/stringify.js +++ b/test/stringify.js @@ -586,6 +586,38 @@ test('stringify()', function (t) { st.end(); }); + t.test('throws if an invalid charset is specified', function (st) { + st['throws'](function () { + qs.stringify({ a: 'b' }, { charset: 'foobar' }); + }, new TypeError('The charset option must be either utf-8, iso-8859-1, or undefined')); + st.end(); + }); + + t.test('respects a charset of iso-8859-1', function (st) { + st.equal(qs.stringify({ æ: 'æ' }, { charset: 'iso-8859-1' }), '%E6=%E6'); + st.end(); + }); + + t.test('encodes unrepresentable chars as numeric entities in iso-8859-1 mode', function (st) { + st.equal(qs.stringify({ a: '☺' }, { charset: 'iso-8859-1' }), 'a=%26%239786%3B'); + st.end(); + }); + + t.test('respects an explicit charset of utf-8 (the default)', function (st) { + st.equal(qs.stringify({ a: 'æ' }, { charset: 'utf-8' }), 'a=%C3%A6'); + st.end(); + }); + + t.test('adds the right sentinel when instructed to and the charset is utf-8', function (st) { + st.equal(qs.stringify({ a: 'æ' }, { charsetSentinel: true, charset: 'utf-8' }), 'utf8=%E2%9C%93&a=%C3%A6'); + st.end(); + }); + + t.test('adds the right sentinel when instructed to and the charset is iso-8859-1', function (st) { + st.equal(qs.stringify({ a: 'æ' }, { charsetSentinel: true, charset: 'iso-8859-1' }), 'utf8=%26%2310003%3B&a=%E6'); + st.end(); + }); + t.test('does not mutate the options argument', function (st) { var options = {}; qs.stringify({}, options);