From 79147ae0604c3350fa49686b0ba675dcb10d18f0 Mon Sep 17 00:00:00 2001 From: Timothy Gu Date: Wed, 31 May 2017 17:13:43 -0700 Subject: [PATCH] url: update IDNA handling Remove custom tests for invalid IDNA domains in url-idna.js in favor of the more comprehensive official set. PR-URL: https://github.com/nodejs/node/pull/13362 Refs: https://github.com/whatwg/url/pull/309 Refs: https://github.com/w3c/web-platform-tests/pull/5976 Reviewed-By: Refael Ackermann Reviewed-By: James M Snell Reviewed-By: Daijiro Wachi --- lib/url.js | 5 +- src/node_i18n.cc | 88 +++++++++--- src/node_i18n.h | 24 ++++ test/fixtures/url-idna.js | 215 +++++++++++++++++++++++++++++ test/fixtures/url-toascii.js | 157 +++++++++++++++++++++ test/parallel/test-icu-punycode.js | 100 +++++--------- 6 files changed, 509 insertions(+), 80 deletions(-) create mode 100644 test/fixtures/url-idna.js create mode 100644 test/fixtures/url-toascii.js diff --git a/lib/url.js b/lib/url.js index 214c8a93204e70..f084fd372afaf9 100644 --- a/lib/url.js +++ b/lib/url.js @@ -311,7 +311,10 @@ Url.prototype.parse = function(url, parseQueryString, slashesDenoteHost) { // It only converts parts of the domain name that // have non-ASCII characters, i.e. it doesn't matter if // you call it with a domain that already is ASCII-only. - this.hostname = toASCII(this.hostname); + + // Use lenient mode (`true`) to try to support even non-compliant + // URLs. + this.hostname = toASCII(this.hostname, true); } var p = this.port ? ':' + this.port : ''; diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 8e32f1fe08177b..c013027981755f 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -77,12 +77,11 @@ bool InitializeICUDirectory(const std::string& path) { } } -static int32_t ToUnicode(MaybeStackBuffer* buf, - const char* input, - size_t length) { +int32_t ToUnicode(MaybeStackBuffer* buf, + const char* input, + size_t length) { UErrorCode status = U_ZERO_ERROR; - uint32_t options = UIDNA_DEFAULT; - options |= UIDNA_NONTRANSITIONAL_TO_UNICODE; + uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE; UIDNA* uidna = uidna_openUTS46(options, &status); if (U_FAILURE(status)) return -1; @@ -90,33 +89,52 @@ static int32_t ToUnicode(MaybeStackBuffer* buf, int32_t len = uidna_nameToUnicodeUTF8(uidna, input, length, - **buf, buf->length(), + **buf, buf->capacity(), &info, &status); + // Do not check info.errors like we do with ToASCII since ToUnicode always + // returns a string, despite any possible errors that may have occurred. + if (status == U_BUFFER_OVERFLOW_ERROR) { status = U_ZERO_ERROR; buf->AllocateSufficientStorage(len); len = uidna_nameToUnicodeUTF8(uidna, input, length, - **buf, buf->length(), + **buf, buf->capacity(), &info, &status); } - if (U_FAILURE(status)) + // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode + // string, regardless of whether an error occurred. + + if (U_FAILURE(status)) { len = -1; + buf->SetLength(0); + } else { + buf->SetLength(len); + } uidna_close(uidna); return len; } -static int32_t ToASCII(MaybeStackBuffer* buf, - const char* input, - size_t length) { +int32_t ToASCII(MaybeStackBuffer* buf, + const char* input, + size_t length, + enum idna_mode mode) { UErrorCode status = U_ZERO_ERROR; - uint32_t options = UIDNA_DEFAULT; - options |= UIDNA_NONTRANSITIONAL_TO_ASCII; + uint32_t options = // CheckHyphens = false; handled later + UIDNA_CHECK_BIDI | // CheckBidi = true + UIDNA_CHECK_CONTEXTJ | // CheckJoiners = true + UIDNA_NONTRANSITIONAL_TO_ASCII; // Nontransitional_Processing + if (mode == IDNA_STRICT) { + options |= UIDNA_USE_STD3_RULES; // UseSTD3ASCIIRules = beStrict + // VerifyDnsLength = beStrict; + // handled later + } + UIDNA* uidna = uidna_openUTS46(options, &status); if (U_FAILURE(status)) return -1; @@ -124,7 +142,7 @@ static int32_t ToASCII(MaybeStackBuffer* buf, int32_t len = uidna_nameToASCII_UTF8(uidna, input, length, - **buf, buf->length(), + **buf, buf->capacity(), &info, &status); @@ -133,13 +151,45 @@ static int32_t ToASCII(MaybeStackBuffer* buf, buf->AllocateSufficientStorage(len); len = uidna_nameToASCII_UTF8(uidna, input, length, - **buf, buf->length(), + **buf, buf->capacity(), &info, &status); } - if (U_FAILURE(status)) + // In UTS #46 which specifies ToASCII, certain error conditions are + // configurable through options, and the WHATWG URL Standard promptly elects + // to disable some of them to accommodate for real-world use cases. + // Unfortunately, ICU4C's IDNA module does not support disabling some of + // these options through `options` above, and thus continues throwing + // unnecessary errors. To counter this situation, we just filter out the + // errors that may have happened afterwards, before deciding whether to + // return an error from this function. + + // CheckHyphens = false + // (Specified in the current UTS #46 draft rev. 18.) + // Refs: + // - https://github.com/whatwg/url/issues/53 + // - https://github.com/whatwg/url/pull/309 + // - http://www.unicode.org/review/pri317/ + // - http://www.unicode.org/reports/tr46/tr46-18.html + // - https://www.icann.org/news/announcement-2000-01-07-en + info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; + info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; + info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; + + if (mode != IDNA_STRICT) { + // VerifyDnsLength = beStrict + info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; + info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; + info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; + } + + if (U_FAILURE(status) || (mode != IDNA_LENIENT && info.errors != 0)) { len = -1; + buf->SetLength(0); + } else { + buf->SetLength(len); + } uidna_close(uidna); return len; @@ -169,8 +219,12 @@ static void ToASCII(const FunctionCallbackInfo& args) { CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); + // optional arg + bool lenient = args[1]->BooleanValue(env->context()).FromJust(); + enum idna_mode mode = lenient ? IDNA_LENIENT : IDNA_DEFAULT; + MaybeStackBuffer buf; - int32_t len = ToASCII(&buf, *val, val.length()); + int32_t len = ToASCII(&buf, *val, val.length(), mode); if (len < 0) { return env->ThrowError("Cannot convert name to ASCII"); diff --git a/src/node_i18n.h b/src/node_i18n.h index ff9e87cea7fe83..30acefd1a8a996 100644 --- a/src/node_i18n.h +++ b/src/node_i18n.h @@ -16,6 +16,30 @@ namespace i18n { bool InitializeICUDirectory(const std::string& path); +enum idna_mode { + // Default mode for maximum compatibility. + IDNA_DEFAULT, + // Ignore all errors in IDNA conversion, if possible. + IDNA_LENIENT, + // Enforce STD3 rules (UseSTD3ASCIIRules) and DNS length restrictions + // (VerifyDnsLength). Corresponds to `beStrict` flag in the "domain to ASCII" + // algorithm. + IDNA_STRICT +}; + +// Implements the WHATWG URL Standard "domain to ASCII" algorithm. +// https://url.spec.whatwg.org/#concept-domain-to-ascii +int32_t ToASCII(MaybeStackBuffer* buf, + const char* input, + size_t length, + enum idna_mode mode = IDNA_DEFAULT); + +// Implements the WHATWG URL Standard "domain to Unicode" algorithm. +// https://url.spec.whatwg.org/#concept-domain-to-unicode +int32_t ToUnicode(MaybeStackBuffer* buf, + const char* input, + size_t length); + } // namespace i18n } // namespace node diff --git a/test/fixtures/url-idna.js b/test/fixtures/url-idna.js new file mode 100644 index 00000000000000..4b8f5a48cc9646 --- /dev/null +++ b/test/fixtures/url-idna.js @@ -0,0 +1,215 @@ +'use strict'; + +// Credit for list: http://www.i18nguy.com/markup/idna-examples.html +module.exports = [ + { ascii: 'xn--mgbaal8b0b9b2b.icom.museum', + unicode: 'افغانستا.icom.museum' + }, + { + ascii: 'xn--lgbbat1ad8j.icom.museum', + unicode: 'الجزائر.icom.museum' + }, + { + ascii: 'xn--sterreich-z7a.icom.museum', + unicode: 'österreich.icom.museum' + }, + { + ascii: 'xn--54b6eqazv8bc7e.icom.museum', + unicode: 'বাংলাদেশ.icom.museum' + }, + { + ascii: 'xn--80abmy0agn7e.icom.museum', + unicode: 'беларусь.icom.museum' + }, + { + ascii: 'xn--belgi-rsa.icom.museum', + unicode: 'belgië.icom.museum' + }, + { + ascii: 'xn--80abgvm6a7d2b.icom.museum', + unicode: 'българия.icom.museum' + }, + { + ascii: 'xn--mgbfqim.icom.museum', + unicode: 'تشادر.icom.museum' + }, + { + ascii: 'xn--fiqs8s.icom.museum', + unicode: '中国.icom.museum' + }, + { + ascii: 'xn--mgbu4chg.icom.museum', + unicode: 'القمر.icom.museum' + }, + { + ascii: 'xn--vxakcego.icom.museum', + unicode: 'κυπρος.icom.museum' + }, + { + ascii: 'xn--eskrepublika-ebb62d.icom.museum', + unicode: 'českárepublika.icom.museum' + }, + { + ascii: 'xn--wgbh1c.icom.museum', + unicode: 'مصر.icom.museum' + }, + { + ascii: 'xn--hxakic4aa.icom.museum', + unicode: 'ελλάδα.icom.museum' + }, + { + ascii: 'xn--magyarorszg-t7a.icom.museum', + unicode: 'magyarország.icom.museum' + }, + { + ascii: 'xn--sland-ysa.icom.museum', + unicode: 'ísland.icom.museum' + }, + { + ascii: 'xn--h2brj9c.icom.museum', + unicode: 'भारत.icom.museum' + }, + { + ascii: 'xn--mgba3a4fra.icom.museum', + unicode: 'ايران.icom.museum' + }, + { + ascii: 'xn--ire-9la.icom.museum', + unicode: 'éire.icom.museum' + }, + { + ascii: 'xn--4dbklr2c8d.xn--4dbrk0ce.museum', + unicode: 'איקו״ם.ישראל.museum' + }, + { + ascii: 'xn--wgv71a.icom.museum', + unicode: '日本.icom.museum' + }, + { + ascii: 'xn--igbhzh7gpa.icom.museum', + unicode: 'الأردن.icom.museum' + }, + { + ascii: 'xn--80aaa0a6awh12ed.icom.museum', + unicode: 'қазақстан.icom.museum' + }, + { + ascii: 'xn--3e0b707e.icom.museum', + unicode: '한국.icom.museum' + }, + { + ascii: 'xn--80afmksoji0fc.icom.museum', + unicode: 'кыргызстан.icom.museum' + }, + { + ascii: 'xn--q7ce6a.icom.museum', + unicode: 'ລາວ.icom.museum' + }, + { + ascii: 'xn--mgbb7fjb.icom.museum', + unicode: 'لبنان.icom.museum' + }, + { + ascii: 'xn--80aaldqjmmi6x.icom.museum', + unicode: 'македонија.icom.museum' + }, + { + ascii: 'xn--mgbah1a3hjkrd.icom.museum', + unicode: 'موريتانيا.icom.museum' + }, + { + ascii: 'xn--mxico-bsa.icom.museum', + unicode: 'méxico.icom.museum' + }, + { + ascii: 'xn--c1aqabffc0aq.icom.museum', + unicode: 'монголулс.icom.museum' + }, + { + ascii: 'xn--mgbc0a9azcg.icom.museum', + unicode: 'المغرب.icom.museum' + }, + { + ascii: 'xn--l2bey1c2b.icom.museum', + unicode: 'नेपाल.icom.museum' + }, + { + ascii: 'xn--mgb9awbf.icom.museum', + unicode: 'عمان.icom.museum' + }, + { + ascii: 'xn--wgbl6a.icom.museum', + unicode: 'قطر.icom.museum' + }, + { + ascii: 'xn--romnia-yta.icom.museum', + unicode: 'românia.icom.museum' + }, + { + ascii: 'xn--h1alffa9f.xn--h1aegh.museum', + unicode: 'россия.иком.museum' + }, + { + ascii: 'xn--80aaabm1ab4blmeec9e7n.xn--h1aegh.museum', + unicode: 'србијаицрнагора.иком.museum' + }, + { + ascii: 'xn--xkc2al3hye2a.icom.museum', + unicode: 'இலங்கை.icom.museum' + }, + { + ascii: 'xn--espaa-rta.icom.museum', + unicode: 'españa.icom.museum' + }, + { + ascii: 'xn--o3cw4h.icom.museum', + unicode: 'ไทย.icom.museum' + }, + { + ascii: 'xn--pgbs0dh.icom.museum', + unicode: 'تونس.icom.museum' + }, + { + ascii: 'xn--trkiye-3ya.icom.museum', + unicode: 'türkiye.icom.museum' + }, + { + ascii: 'xn--80aaxgrpt.icom.museum', + unicode: 'украина.icom.museum' + }, + { + ascii: 'xn--vitnam-jk8b.icom.museum', + unicode: 'việtnam.icom.museum' + }, + // long label + { + ascii: `${'a'.repeat(64)}.com`, + unicode: `${'a'.repeat(64)}.com`, + }, + // long URL + { + ascii: `${`${'a'.repeat(64)}.`.repeat(4)}com`, + unicode: `${`${'a'.repeat(64)}.`.repeat(4)}com` + }, + // URLs with hyphen + { + ascii: 'r4---sn-a5mlrn7s.gevideo.com', + unicode: 'r4---sn-a5mlrn7s.gevideo.com' + }, + { + ascii: '-sn-a5mlrn7s.gevideo.com', + unicode: '-sn-a5mlrn7s.gevideo.com' + }, + { + ascii: 'sn-a5mlrn7s-.gevideo.com', + unicode: 'sn-a5mlrn7s-.gevideo.com' + }, + { + ascii: '-sn-a5mlrn7s-.gevideo.com', + unicode: '-sn-a5mlrn7s-.gevideo.com' + }, + { + ascii: '-sn--a5mlrn7s-.gevideo.com', + unicode: '-sn--a5mlrn7s-.gevideo.com' + } +]; diff --git a/test/fixtures/url-toascii.js b/test/fixtures/url-toascii.js new file mode 100644 index 00000000000000..59b76330f867f2 --- /dev/null +++ b/test/fixtures/url-toascii.js @@ -0,0 +1,157 @@ +'use strict'; + +/* The following tests are copied from WPT. Modifications to them should be + upstreamed first. Refs: + https://github.com/w3c/web-platform-tests/blob/4839a0a804/url/toascii.json + License: http://www.w3.org/Consortium/Legal/2008/04-testsuite-copyright.html +*/ +module.exports = +[ + "This resource is focused on highlighting issues with UTS #46 ToASCII", + { + "comment": "Label with hyphens in 3rd and 4th position", + "input": "aa--", + "output": "aa--" + }, + { + "input": "a†--", + "output": "xn--a---kp0a" + }, + { + "input": "ab--c", + "output": "ab--c" + }, + { + "comment": "Label with leading hyphen", + "input": "-x", + "output": "-x" + }, + { + "input": "-†", + "output": "xn----xhn" + }, + { + "input": "-x.xn--nxa", + "output": "-x.xn--nxa" + }, + { + "input": "-x.β", + "output": "-x.xn--nxa" + }, + { + "comment": "Label with trailing hyphen", + "input": "x-.xn--nxa", + "output": "x-.xn--nxa" + }, + { + "input": "x-.β", + "output": "x-.xn--nxa" + }, + { + "comment": "Empty labels", + "input": "x..xn--nxa", + "output": "x..xn--nxa" + }, + { + "input": "x..β", + "output": "x..xn--nxa" + }, + { + "comment": "Invalid Punycode", + "input": "xn--a", + "output": null + }, + { + "input": "xn--a.xn--nxa", + "output": null + }, + { + "input": "xn--a.β", + "output": null + }, + { + "comment": "Valid Punycode", + "input": "xn--nxa.xn--nxa", + "output": "xn--nxa.xn--nxa" + }, + { + "comment": "Mixed", + "input": "xn--nxa.β", + "output": "xn--nxa.xn--nxa" + }, + { + "input": "ab--c.xn--nxa", + "output": "ab--c.xn--nxa" + }, + { + "input": "ab--c.β", + "output": "ab--c.xn--nxa" + }, + { + "comment": "CheckJoiners is true", + "input": "\u200D.example", + "output": null + }, + { + "input": "xn--1ug.example", + "output": null + }, + { + "comment": "CheckBidi is true", + "input": "يa", + "output": null + }, + { + "input": "xn--a-yoc", + "output": null + }, + { + "comment": "processing_option is Nontransitional_Processing", + "input": "ශ්‍රී", + "output": "xn--10cl1a0b660p" + }, + { + "input": "نامه‌ای", + "output": "xn--mgba3gch31f060k" + }, + { + "comment": "U+FFFD", + "input": "\uFFFD.com", + "output": null + }, + { + "comment": "U+FFFD character encoded in Punycode", + "input": "xn--zn7c.com", + "output": null + }, + { + "comment": "Label longer than 63 code points", + "input": "x01234567890123456789012345678901234567890123456789012345678901x", + "output": "x01234567890123456789012345678901234567890123456789012345678901x" + }, + { + "input": "x01234567890123456789012345678901234567890123456789012345678901†", + "output": "xn--x01234567890123456789012345678901234567890123456789012345678901-6963b" + }, + { + "input": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa", + "output": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa" + }, + { + "input": "x01234567890123456789012345678901234567890123456789012345678901x.β", + "output": "x01234567890123456789012345678901234567890123456789012345678901x.xn--nxa" + }, + { + "comment": "Domain excluding TLD longer than 253 code points", + "input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.x", + "output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.x" + }, + { + "input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa", + "output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa" + }, + { + "input": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.β", + "output": "01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.01234567890123456789012345678901234567890123456789.0123456789012345678901234567890123456789012345678.xn--nxa" + } +] diff --git a/test/parallel/test-icu-punycode.js b/test/parallel/test-icu-punycode.js index cf83ac66206806..7abcae461774c0 100644 --- a/test/parallel/test-icu-punycode.js +++ b/test/parallel/test-icu-punycode.js @@ -1,70 +1,46 @@ 'use strict'; - const common = require('../common'); -const icu = getPunycode(); + +if (!common.hasIntl) + common.skip('missing Intl'); + +const icu = process.binding('icu'); const assert = require('assert'); -function getPunycode() { - try { - return process.binding('icu'); - } catch (err) { - return undefined; +const tests = require('../fixtures/url-idna.js'); +const wptToASCIITests = require('../fixtures/url-toascii.js'); + +{ + for (const [i, { ascii, unicode }] of tests.entries()) { + assert.strictEqual(ascii, icu.toASCII(unicode), `toASCII(${i + 1})`); + assert.strictEqual(unicode, icu.toUnicode(ascii), `toUnicode(${i + 1})`); + assert.strictEqual(ascii, icu.toASCII(icu.toUnicode(ascii)), + `toASCII(toUnicode(${i + 1}))`); + assert.strictEqual(unicode, icu.toUnicode(icu.toASCII(unicode)), + `toUnicode(toASCII(${i + 1}))`); } } -if (!icu) - common.skip('icu punycode tests because ICU is not present.'); +{ + const errMessage = /^Error: Cannot convert name to ASCII$/; -// Credit for list: http://www.i18nguy.com/markup/idna-examples.html -const tests = [ - 'افغانستا.icom.museum', - 'الجزائر.icom.museum', - 'österreich.icom.museum', - 'বাংলাদেশ.icom.museum', - 'беларусь.icom.museum', - 'belgië.icom.museum', - 'българия.icom.museum', - 'تشادر.icom.museum', - '中国.icom.museum', - 'القمر.icom.museum', - 'κυπρος.icom.museum', - 'českárepublika.icom.museum', - 'مصر.icom.museum', - 'ελλάδα.icom.museum', - 'magyarország.icom.museum', - 'ísland.icom.museum', - 'भारत.icom.museum', - 'ايران.icom.museum', - 'éire.icom.museum', - 'איקו״ם.ישראל.museum', - '日本.icom.museum', - 'الأردن.icom.museum', - 'қазақстан.icom.museum', - '한국.icom.museum', - 'кыргызстан.icom.museum', - 'ລາວ.icom.museum', - 'لبنان.icom.museum', - 'македонија.icom.museum', - 'موريتانيا.icom.museum', - 'méxico.icom.museum', - 'монголулс.icom.museum', - 'المغرب.icom.museum', - 'नेपाल.icom.museum', - 'عمان.icom.museum', - 'قطر.icom.museum', - 'românia.icom.museum', - 'россия.иком.museum', - 'србијаицрнагора.иком.museum', - 'இலங்கை.icom.museum', - 'españa.icom.museum', - 'ไทย.icom.museum', - 'تونس.icom.museum', - 'türkiye.icom.museum', - 'украина.icom.museum', - 'việtnam.icom.museum' -]; - -// Testing the roundtrip -tests.forEach((i) => { - assert.strictEqual(i, icu.toUnicode(icu.toASCII(i))); -}); + for (const [i, test] of wptToASCIITests.entries()) { + if (typeof test === 'string') + continue; // skip comments + const { comment, input, output } = test; + let caseComment = `case ${i + 1}`; + if (comment) + caseComment += ` (${comment})`; + if (output === null) { + assert.throws(() => icu.toASCII(input), + errMessage, `ToASCII ${caseComment}`); + assert.doesNotThrow(() => icu.toASCII(input, true), + `ToASCII ${caseComment} in lenient mode`); + } else { + assert.strictEqual(icu.toASCII(input), output, `ToASCII ${caseComment}`); + assert.strictEqual(icu.toASCII(input, true), output, + `ToASCII ${caseComment} in lenient mode`); + } + assert.doesNotThrow(() => icu.toUnicode(input), `ToUnicode ${caseComment}`); + } +}