Skip to content

Commit

Permalink
url: update IDNA error conditions
Browse files Browse the repository at this point in the history
This commit contains three separate changes:

- Always return a string from ToUnicode no matter if an error occurred.
- Disable CheckHyphens boolean flag. This flag will soon be enabled in
  the URL Standard, but is implemented manually by selectively ignoring
  certain errors.
- Enable CheckBidi boolean flag per URL Standard update.

This allows domain names with hyphens at 3 and 4th position, as well as
those with leading and trailing hyphens. They are technically invalid,
but seen in the wild.

Tests are updated and simplified accordingly.

PR-URL: #12966
Fixes: #12965
Refs: whatwg/url#309
Reviewed-By: Daijiro Wachi <daijiro.wachi@gmail.com>
Reviewed-By: Timothy Gu <timothygu99@gmail.com>
  • Loading branch information
zimbabao authored and TimothyGu committed May 20, 2017
1 parent 841bb4c commit 06a617a
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 51 deletions.
38 changes: 22 additions & 16 deletions src/node_i18n.cc
Original file line number Diff line number Diff line change
Expand Up @@ -436,11 +436,9 @@ bool InitializeICUDirectory(const std::string& path) {

int32_t ToUnicode(MaybeStackBuffer<char>* buf,
const char* input,
size_t length,
bool lenient) {
size_t length) {
UErrorCode status = U_ZERO_ERROR;
uint32_t options = UIDNA_DEFAULT;
options |= UIDNA_NONTRANSITIONAL_TO_UNICODE;
uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status))
return -1;
Expand All @@ -462,14 +460,10 @@ int32_t ToUnicode(MaybeStackBuffer<char>* buf,
&status);
}

// UTS #46's ToUnicode operation applies no validation of domain name length
// (nor a flag requesting it to do so, like VerifyDnsLength for ToASCII). For
// that reason, unlike ToASCII below, ICU4C correctly accepts long domain
// names. However, ICU4C still sets the EMPTY_LABEL error in contrary to UTS
// #46. Therefore, explicitly filters out that error here.
info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
// info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
// string, regardless of whether an error occurred.

if (U_FAILURE(status) || (!lenient && info.errors != 0)) {
if (U_FAILURE(status)) {
len = -1;
buf->SetLength(0);
} else {
Expand All @@ -485,8 +479,7 @@ int32_t ToASCII(MaybeStackBuffer<char>* buf,
size_t length,
bool lenient) {
UErrorCode status = U_ZERO_ERROR;
uint32_t options = UIDNA_DEFAULT;
options |= UIDNA_NONTRANSITIONAL_TO_ASCII;
uint32_t options = UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI;
UIDNA* uidna = uidna_openUTS46(options, &status);
if (U_FAILURE(status))
return -1;
Expand Down Expand Up @@ -518,6 +511,21 @@ int32_t ToASCII(MaybeStackBuffer<char>* buf,
info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;

// These error conditions are mandated unconditionally by UTS #46 version
// 9.0.0 (rev. 17), but were found to be incompatible with actual domain
// names in the wild. As such, in the current UTS #46 draft (rev. 18) these
// checks are made optional depending on the CheckHyphens flag, which will be
// disabled in WHATWG URL's "domain to ASCII" algorithm soon.
// Refs:
// - https://github.com/whatwg/url/issues/53
// - https://github.com/whatwg/url/pull/309
// - http://www.unicode.org/review/pri317/
// - http://www.unicode.org/reports/tr46/tr46-18.html
// - https://www.icann.org/news/announcement-2000-01-07-en
info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;

if (U_FAILURE(status) || (!lenient && info.errors != 0)) {
len = -1;
buf->SetLength(0);
Expand All @@ -534,11 +542,9 @@ static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
CHECK_GE(args.Length(), 1);
CHECK(args[0]->IsString());
Utf8Value val(env->isolate(), args[0]);
// optional arg
bool lenient = args[1]->BooleanValue(env->context()).FromJust();

MaybeStackBuffer<char> buf;
int32_t len = ToUnicode(&buf, *val, val.length(), lenient);
int32_t len = ToUnicode(&buf, *val, val.length());

if (len < 0) {
return env->ThrowError("Cannot convert name to Unicode");
Expand Down
3 changes: 1 addition & 2 deletions src/node_i18n.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ int32_t ToASCII(MaybeStackBuffer<char>* buf,
bool lenient = false);
int32_t ToUnicode(MaybeStackBuffer<char>* buf,
const char* input,
size_t length,
bool lenient = false);
size_t length);

} // namespace i18n
} // namespace node
Expand Down
33 changes: 22 additions & 11 deletions test/fixtures/url-idna.js
Original file line number Diff line number Diff line change
Expand Up @@ -191,22 +191,33 @@ module.exports = {
{
ascii: `${`${'a'.repeat(64)}.`.repeat(4)}com`,
unicode: `${`${'a'.repeat(64)}.`.repeat(4)}com`
}
],
invalid: [
// invalid character
},
// URLs with hyphen
{
ascii: 'r4---sn-a5mlrn7s.gevideo.com',
unicode: 'r4---sn-a5mlrn7s.gevideo.com'
},
{
ascii: '-sn-a5mlrn7s.gevideo.com',
unicode: '-sn-a5mlrn7s.gevideo.com'
},
{
url: '\ufffd.com',
mode: 'ascii'
ascii: 'sn-a5mlrn7s-.gevideo.com',
unicode: 'sn-a5mlrn7s-.gevideo.com'
},
{
url: '\ufffd.com',
mode: 'unicode'
ascii: '-sn-a5mlrn7s-.gevideo.com',
unicode: '-sn-a5mlrn7s-.gevideo.com'
},
// invalid Punycode
{
url: 'xn---abc.com',
mode: 'unicode'
ascii: '-sn--a5mlrn7s-.gevideo.com',
unicode: '-sn--a5mlrn7s-.gevideo.com'
}
],
invalid: [
// invalid character
'\ufffd.com',
// invalid bi-directional character
'تشادرlatin.icom.museum'
]
}
22 changes: 8 additions & 14 deletions test/parallel/test-icu-punycode.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,13 @@ const tests = require('../fixtures/url-idna.js');
}

{
const errorRe = {
ascii: /^Error: Cannot convert name to ASCII$/,
unicode: /^Error: Cannot convert name to Unicode$/
};
const convertFunc = {
ascii: icu.toASCII,
unicode: icu.toUnicode
};

for (const [i, { url, mode }] of tests.invalid.entries()) {
assert.throws(() => convertFunc[mode](url), errorRe[mode],
`Invalid case ${i + 1}`);
assert.doesNotThrow(() => convertFunc[mode](url, true),
`Invalid case ${i + 1} in lenient mode`);
for (const [i, url] of tests.invalid.entries()) {
assert.throws(() => icu.toASCII(url),
/^Error: Cannot convert name to ASCII$/,
`ToASCII invalid case ${i + 1}`);
assert.doesNotThrow(() => icu.toASCII(url, true),
`ToASCII invalid case ${i + 1} in lenient mode`);
assert.doesNotThrow(() => icu.toUnicode(url),
`ToUnicode invalid case ${i + 1}`);
}
}
3 changes: 2 additions & 1 deletion test/parallel/test-url-domain-ascii-unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ const domainToASCII = url.domainToASCII;
const domainToUnicode = url.domainToUnicode;

const domainWithASCII = [
['ıídيٴ', 'xn--d-iga7ro0q9f'],
['ıíd', 'xn--d-iga7r'],
['يٴ', 'xn--mhb8f'],
['www.ϧƽəʐ.com', 'www.xn--cja62apfr6c.com'],
['новини.com', 'xn--b1amarcd.com'],
['名がドメイン.com', 'xn--v8jxj3d1dzdz08w.com'],
Expand Down
11 changes: 4 additions & 7 deletions test/parallel/test-whatwg-url-domainto.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,8 @@ const tests = require('../fixtures/url-idna.js');
}

{
const convertFunc = {
ascii: domainToASCII,
unicode: domainToUnicode
};

for (const [i, { url, mode }] of tests.invalid.entries())
assert.strictEqual(convertFunc[mode](url), '', `Invalid case ${i + 1}`);
for (const [i, url] of tests.invalid.entries()) {
assert.strictEqual(domainToASCII(url), '', `Invalid case ${i + 1}`);
assert.strictEqual(domainToUnicode(url), '', `Invalid case ${i + 1}`);
}
}

0 comments on commit 06a617a

Please sign in to comment.