Skip to content

Commit

Permalink
url: enforce valid UTF-8 in WHATWG parser
Browse files Browse the repository at this point in the history
This commit implements the Web IDL USVString conversion, which mandates
all unpaired Unicode surrogates be turned into U+FFFD REPLACEMENT
CHARACTER. It also disallows Symbols from being used as USVString, per spec.

Certain functions call into C++ methods in the binding that use the
Utf8Value class to access string arguments. Utf8Value already does the
normalization using V8's String::Write, so in those cases, instead of
doing the full USVString normalization, only a symbol check is done
(`'' + val`, which uses ES's ToString, versus `String()` which has
special provisions for symbols).

PR-URL: #11436
Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
Reviewed-By: James M Snell <jasnell@gmail.com>
TimothyGu committed Mar 1, 2017
1 parent a7f7724 commit b610a4d
Showing 13 changed files with 509 additions and 44 deletions.
98 changes: 65 additions & 33 deletions lib/internal/url.js
Original file line number Diff line number Diff line change
@@ -23,6 +23,18 @@ const IteratorPrototype = Object.getPrototypeOf(
Object.getPrototypeOf([][Symbol.iterator]())
);

const unpairedSurrogateRe =
/([^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/;
function toUSVString(val) {
const str = '' + val;

This comment has been minimized.

Copy link
@domenic

domenic Mar 2, 2017

Contributor

This will not perform ToString(), but instead ToPrimitive(). For example, if val is { toString() { return "5"; }, toPrimitive() { return "6"; } }, you will get "6".

This comment has been minimized.

Copy link
@TimothyGu

TimothyGu Mar 8, 2017

Author Member

@domenic Multiple things going on here. First I assume you meant [Symbol.toPrimitive] instead of toPrimitive. Second, ToString() in ES calls ToPrimitive() as well. See the table ToString Conversions in the spec:

Object
Apply the following steps:
  1. Let primValue be ? ToPrimitive(argument, hint String).
  2. Return ? ToString(primValue).

Indeed try running the following in Chrome Dev Tools:

const searchParams = new URLSearchParams();
searchParams.set('name', {
  toString() { return "str"; },
  [Symbol.toPrimitive]() { return "prim"; }
});
console.log(searchParams.get('name'));

prim will be printed.

This comment has been minimized.

Copy link
@domenic

domenic Mar 8, 2017

Contributor

Ah sorry, I meant valueOf().

This comment has been minimized.

Copy link
@domenic

domenic Mar 8, 2017

Contributor

In particular note how + passes the Number hint.

This comment has been minimized.

Copy link
@TimothyGu

TimothyGu Mar 8, 2017

Author Member

Ah I see now. #11737

// As of V8 5.5, `str.search()` (and `unpairedSurrogateRe[@@search]()`) are
// slower than `unpairedSurrogateRe.exec()`.
const match = unpairedSurrogateRe.exec(str);
if (!match)
return str;
return binding.toUSVString(str, match.index);
}

class OpaqueOrigin {
toString() {
return 'null';
@@ -104,7 +116,6 @@ function onParseComplete(flags, protocol, username, password,

// Reused by URL constructor and URL#href setter.
function parse(url, input, base) {
input = String(input);
const base_context = base ? base[context] : undefined;
url[context] = new StorageObject();
binding.parse(input.trim(), -1,
@@ -206,8 +217,10 @@ function onParseHashComplete(flags, protocol, username, password,

class URL {
constructor(input, base) {
// toUSVString is not needed.
input = '' + input;
if (base !== undefined && !(base instanceof URL))
base = new URL(String(base));
base = new URL(base);
parse(this, input, base);
}

@@ -315,6 +328,8 @@ Object.defineProperties(URL.prototype, {
return this[kFormat]({});
},
set(input) {
// toUSVString is not needed.
input = '' + input;
parse(this, input);
}
},
@@ -332,7 +347,8 @@ Object.defineProperties(URL.prototype, {
return this[context].scheme;
},
set(scheme) {
scheme = String(scheme);
// toUSVString is not needed.
scheme = '' + scheme;
if (scheme.length === 0)
return;
binding.parse(scheme, binding.kSchemeStart, null, this[context],
@@ -346,7 +362,8 @@ Object.defineProperties(URL.prototype, {
return this[context].username || '';
},
set(username) {
username = String(username);
// toUSVString is not needed.
username = '' + username;
if (!this.hostname)
return;
const ctx = this[context];
@@ -366,7 +383,8 @@ Object.defineProperties(URL.prototype, {
return this[context].password || '';
},
set(password) {
password = String(password);
// toUSVString is not needed.
password = '' + password;
if (!this.hostname)
return;
const ctx = this[context];
@@ -391,7 +409,8 @@ Object.defineProperties(URL.prototype, {
},
set(host) {
const ctx = this[context];
host = String(host);
// toUSVString is not needed.
host = '' + host;
if (this[cannotBeBase] ||
(this[special] && host.length === 0)) {
// Cannot set the host if cannot-be-base is set or
@@ -415,7 +434,8 @@ Object.defineProperties(URL.prototype, {
},
set(host) {
const ctx = this[context];
host = String(host);
// toUSVString is not needed.
host = '' + host;
if (this[cannotBeBase] ||
(this[special] && host.length === 0)) {
// Cannot set the host if cannot-be-base is set or
@@ -439,11 +459,12 @@ Object.defineProperties(URL.prototype, {
return port === undefined ? '' : String(port);
},
set(port) {
// toUSVString is not needed.
port = '' + port;
const ctx = this[context];
if (!ctx.host || this[cannotBeBase] ||
this.protocol === 'file:')
return;
port = String(port);
if (port === '') {
ctx.port = undefined;
return;
@@ -462,9 +483,11 @@ Object.defineProperties(URL.prototype, {
return ctx.path !== undefined ? `/${ctx.path.join('/')}` : '';
},
set(path) {
// toUSVString is not needed.
path = '' + path;
if (this[cannotBeBase])
return;
binding.parse(String(path), binding.kPathStart, null, this[context],
binding.parse(path, binding.kPathStart, null, this[context],
onParsePathComplete.bind(this));
}
},
@@ -477,7 +500,7 @@ Object.defineProperties(URL.prototype, {
},
set(search) {
const ctx = this[context];
search = String(search);
search = toUSVString(search);
if (!search) {
ctx.query = null;
ctx.flags &= ~binding.URL_FLAGS_HAS_QUERY;
@@ -509,7 +532,8 @@ Object.defineProperties(URL.prototype, {
},
set(hash) {
const ctx = this[context];
hash = String(hash);
// toUSVString is not needed.
hash = '' + hash;
if (this.protocol === 'javascript:')
return;
if (!hash) {
@@ -652,19 +676,22 @@ class URLSearchParams {
if (pair.length !== 2) {
throw new TypeError('Each query pair must be a name/value tuple');
}
this[searchParams].push(String(pair[0]), String(pair[1]));
const key = toUSVString(pair[0]);
const value = toUSVString(pair[1]);
this[searchParams].push(key, value);
}
} else {
// record<USVString, USVString>
this[searchParams] = [];
for (const key of Object.keys(init)) {
const value = String(init[key]);
for (var key of Object.keys(init)) {
key = toUSVString(key);
const value = toUSVString(init[key]);
this[searchParams].push(key, value);
}
}
} else {
// USVString
init = String(init);
init = toUSVString(init);
if (init[0] === '?') init = init.slice(1);
initSearchParams(this, init);
}
@@ -743,8 +770,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
throw new TypeError('"name" and "value" arguments must be specified');
}

name = String(name);
value = String(value);
name = toUSVString(name);
value = toUSVString(value);
this[searchParams].push(name, value);
update(this[context], this);
},
@@ -758,7 +785,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length;) {
const cur = list[i];
if (cur === name) {
@@ -779,7 +806,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
return list[i + 1];
@@ -798,7 +825,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {

const list = this[searchParams];
const values = [];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
values.push(list[i + 1]);
@@ -816,7 +843,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
name = toUSVString(name);
for (var i = 0; i < list.length; i += 2) {
if (list[i] === name) {
return true;
@@ -834,8 +861,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
}

const list = this[searchParams];
name = String(name);
value = String(value);
name = toUSVString(name);
value = toUSVString(value);

// If there are any name-value pairs whose name is `name`, in `list`, set
// the value of the first such name-value pair to `value` and remove the
@@ -1098,11 +1125,13 @@ function originFor(url, base) {
}

function domainToASCII(domain) {
return binding.domainToASCII(String(domain));
// toUSVString is not needed.
return binding.domainToASCII('' + domain);
}

function domainToUnicode(domain) {
return binding.domainToUnicode(String(domain));
// toUSVString is not needed.
return binding.domainToUnicode('' + domain);
}

// Utility function that converts a URL object into an ordinary
@@ -1188,11 +1217,14 @@ function getPathFromURL(path) {
return isWindows ? getPathFromURLWin32(path) : getPathFromURLPosix(path);
}

exports.getPathFromURL = getPathFromURL;
exports.URL = URL;
exports.URLSearchParams = URLSearchParams;
exports.domainToASCII = domainToASCII;
exports.domainToUnicode = domainToUnicode;
exports.urlToOptions = urlToOptions;
exports.formatSymbol = kFormat;
exports.searchParamsSymbol = searchParams;
module.exports = {
toUSVString,
getPathFromURL,
URL,
URLSearchParams,
domainToASCII,
domainToUnicode,
urlToOptions,
formatSymbol: kFormat,
searchParamsSymbol: searchParams
};
53 changes: 53 additions & 0 deletions src/node_url.cc
Original file line number Diff line number Diff line change
@@ -20,6 +20,8 @@
#include <unicode/utf.h>
#endif

// U+FFFD REPLACEMENT CHARACTER; ToUSVString writes it over unpaired surrogates.
#define UNICODE_REPLACEMENT_CHARACTER 0xFFFD

namespace node {

using v8::Array;
@@ -143,6 +145,21 @@ namespace url {
}
#endif

// Reports whether the UTF-16 code unit `c` falls in the low (trailing)
// surrogate range, i.e. 0xDC00 through 0xDFFF inclusive.
static inline bool IsUnicodeTrail(uint16_t c) {
  const unsigned unit = c;
  return unit >= 0xDC00u && unit <= 0xDFFFu;
}

// Reports whether the UTF-16 code unit `c` is any surrogate, leading or
// trailing, i.e. lies in 0xD800 through 0xDFFF inclusive.
static inline bool IsUnicodeSurrogate(uint16_t c) {
  const unsigned unit = c;
  return unit >= 0xD800u && unit <= 0xDFFFu;
}

// Given a code unit `c` already known to be a surrogate, reports whether it
// is the low (trailing) half. Only bit 10 is inspected, so the answer is
// meaningful for surrogate inputs only.
static inline bool IsUnicodeSurrogateTrail(uint16_t c) {
  const unsigned bit10 = (static_cast<unsigned>(c) >> 10) & 1u;
  return bit10 == 1u;
}

static url_host_type ParseIPv6Host(url_host* host,
const char* input,
size_t length) {
@@ -1351,6 +1368,41 @@ namespace url {
v8::NewStringType::kNormal).ToLocalChecked());
}

// Binding behind `binding.toUSVString(str, start)`.
//
// args[0] must be a string and args[1] a non-negative number. The string is
// copied into a two-byte buffer; every code unit from index `start` onward
// that is a surrogate without a matching partner is overwritten with
// UNICODE_REPLACEMENT_CHARACTER. Code units before `start` are not examined.
// The return value is a new string built from the buffer.
static void ToUSVString(const FunctionCallbackInfo<Value>& args) {
  Environment* env = Environment::GetCurrent(args);
  CHECK_GE(args.Length(), 2);
  CHECK(args[0]->IsString());
  CHECK(args[1]->IsNumber());

  TwoByteValue value(env->isolate(), args[0]);
  const size_t n = value.length();

  const int64_t start = args[1]->IntegerValue(env->context()).FromJust();
  CHECK_GE(start, 0);

  size_t pos = start;
  while (pos < n) {
    const uint16_t unit = value[pos];
    if (IsUnicodeSurrogate(unit)) {
      // A surrogate is well-formed only when it is a lead that is directly
      // followed by a trail; the bounds test runs before the look-ahead read.
      const bool paired = !IsUnicodeSurrogateTrail(unit) &&
                          pos + 1 < n &&
                          IsUnicodeTrail(value[pos + 1]);
      if (paired) {
        // Step over the trail half so it is not revisited as a lone trail.
        pos++;
      } else {
        value[pos] = UNICODE_REPLACEMENT_CHARACTER;
      }
    }
    pos++;
  }

  auto result = String::NewFromTwoByte(env->isolate(),
                                       *value,
                                       v8::NewStringType::kNormal,
                                       n);
  args.GetReturnValue().Set(result.ToLocalChecked());
}

static void DomainToASCII(const FunctionCallbackInfo<Value>& args) {
Environment* env = Environment::GetCurrent(args);
CHECK_GE(args.Length(), 1);
@@ -1398,6 +1450,7 @@ namespace url {
Environment* env = Environment::GetCurrent(context);
env->SetMethod(target, "parse", Parse);
env->SetMethod(target, "encodeAuth", EncodeAuthSet);
env->SetMethod(target, "toUSVString", ToUSVString);
env->SetMethod(target, "domainToASCII", DomainToASCII);
env->SetMethod(target, "domainToUnicode", DomainToUnicode);

Loading

0 comments on commit b610a4d

Please sign in to comment.