diff --git a/boa_engine/src/builtins/mod.rs b/boa_engine/src/builtins/mod.rs index 89d8f32fed1..d6aeda78608 100644 --- a/boa_engine/src/builtins/mod.rs +++ b/boa_engine/src/builtins/mod.rs @@ -32,6 +32,7 @@ pub mod string; pub mod symbol; pub mod typed_array; pub mod undefined; +pub mod uri; #[cfg(feature = "console")] pub mod console; @@ -81,7 +82,7 @@ use crate::{ builtins::{ array_buffer::ArrayBuffer, async_generator::AsyncGenerator, async_generator_function::AsyncGeneratorFunction, generator::Generator, - generator_function::GeneratorFunction, typed_array::TypedArray, + generator_function::GeneratorFunction, typed_array::TypedArray, uri::Uri, }, property::{Attribute, PropertyDescriptor}, Context, JsValue, @@ -193,7 +194,8 @@ pub fn init(context: &mut Context) { Promise, AsyncFunction, AsyncGenerator, - AsyncGeneratorFunction + AsyncGeneratorFunction, + Uri }; #[cfg(feature = "intl")] diff --git a/boa_engine/src/builtins/regexp/mod.rs b/boa_engine/src/builtins/regexp/mod.rs index 8ca3f71ea95..98064bda9d1 100644 --- a/boa_engine/src/builtins/regexp/mod.rs +++ b/boa_engine/src/builtins/regexp/mod.rs @@ -1745,7 +1745,8 @@ fn advance_string_index(s: &JsString, index: u64, unicode: bool) -> u64 { } // 5. Let cp be ! CodePointAt(S, index). - let (_, offset, _) = crate::builtins::string::code_point_at(s, index); + let cp = crate::builtins::string::code_point_at(s, index); - index + u64::from(offset) + // 6. Return index + cp.[[CodeUnitCount]]. + index + u64::from(cp.code_unit_count) } diff --git a/boa_engine/src/builtins/string/mod.rs b/boa_engine/src/builtins/string/mod.rs index f77315dd40f..900346fc3d3 100644 --- a/boa_engine/src/builtins/string/mod.rs +++ b/boa_engine/src/builtins/string/mod.rs @@ -40,29 +40,87 @@ pub(crate) enum Placement { End, } -pub(crate) fn code_point_at(string: &JsString, position: u64) -> (u32, u8, bool) { +/// Code point information for the `CodePointAt` abstract operation. 
+#[derive(Debug, Clone, Copy)] +pub(crate) struct CodePointInfo { + pub(crate) code_point: u32, + pub(crate) code_unit_count: u8, + pub(crate) is_unpaired_surrogate: bool, +} + +/// The `CodePointAt ( string, position )` abstract operation. +/// +/// The abstract operation `CodePointAt` takes arguments `string` (a String) and `position` (a +/// non-negative integer) and returns a Record with fields `[[CodePoint]]` (a code point), +/// `[[CodeUnitCount]]` (a positive integer), and `[[IsUnpairedSurrogate]]` (a Boolean). It +/// interprets string as a sequence of UTF-16 encoded code points, as described in 6.1.4, and reads +/// from it a single code point starting with the code unit at index `position`. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#sec-codepointat +pub(crate) fn code_point_at(string: &JsString, position: u64) -> CodePointInfo { let mut encoded = string.encode_utf16(); + + // 1. Let size be the length of string. let size = encoded.clone().count() as u64; + // 2. Assert: position ≥ 0 and position < size. + assert!(position < size); + + // 3. Let first be the code unit at index position within string. let first = encoded .nth(position as usize) .expect("The callers of this function must've already checked bounds."); + + // 4. Let cp be the code point whose numeric value is that of first. + let cp = u32::from(first); + + // 5. If first is not a leading surrogate or trailing surrogate, then if !is_leading_surrogate(first) && !is_trailing_surrogate(first) { - return (u32::from(first), 1, false); + // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: false }. + return CodePointInfo { + code_point: cp, + code_unit_count: 1, + is_unpaired_surrogate: false, + }; } + // 6. If first is a trailing surrogate or position + 1 = size, then if is_trailing_surrogate(first) || position + 1 == size { - return (u32::from(first), 1, true); + // a. 
Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. + return CodePointInfo { + code_point: cp, + code_unit_count: 1, + is_unpaired_surrogate: true, + }; } + // 7. Let second be the code unit at index position + 1 within string. let second = encoded .next() .expect("The callers of this function must've already checked bounds."); + + // 8. If second is not a trailing surrogate, then if !is_trailing_surrogate(second) { - return (u32::from(first), 1, true); + // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. + return CodePointInfo { + code_point: cp, + code_unit_count: 1, + is_unpaired_surrogate: true, + }; } + + // 9. Set cp to UTF16SurrogatePairToCodePoint(first, second). let cp = (u32::from(first) - 0xD800) * 0x400 + (u32::from(second) - 0xDC00) + 0x10000; - (cp, 2, false) + + // 10. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }. + CodePointInfo { + code_point: cp, + code_unit_count: 2, + is_unpaired_surrogate: false, + } } /// Helper function to check if a `char` is trimmable. @@ -86,10 +144,22 @@ pub(crate) fn is_trimmable_whitespace(c: char) -> bool { ) } +/// Checks if the given code unit is a leading surrogate. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#leading-surrogate pub(crate) fn is_leading_surrogate(value: u16) -> bool { (0xD800..=0xDBFF).contains(&value) } +/// Checks if the given code unit is a trailing surrogate. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#trailing-surrogate pub(crate) fn is_trailing_surrogate(value: u16) -> bool { (0xDC00..=0xDFFF).contains(&value) } @@ -369,7 +439,7 @@ impl String { } } - /// `String.fromCharCode(...codePoints)` + /// `String.fromCharCode(...codeUnits)` /// /// Construct a `String` from one or more code points (as numbers). 
/// More information: @@ -381,21 +451,22 @@ impl String { args: &[JsValue], context: &mut Context, ) -> JsResult { - // 1. Let length be the number of elements in codeUnits. - // 2. Let elements be a new empty List. - let mut elements = Vec::new(); - // 3. For each element next of codeUnits, do + // 1. Let result be the empty String. + let mut result = Vec::new(); + + // 2. For each element next of codeUnits, do for next in args { - // 3a. Let nextCU be ℝ(? ToUint16(next)). - // 3b. Append nextCU to the end of elements. - elements.push(next.to_uint16(context)?); - } + // a. Let nextCU be the code unit whose numeric value is ℝ(? ToUint16(next)). + let next_cu = next.to_uint16(context)?; - // 4. Return the String value whose code units are the elements in the List elements. - // If codeUnits is empty, the empty String is returned. + // b. Set result to the string-concatenation of result and nextCU. + result.push(next_cu); + } - let s = std::string::String::from_utf16_lossy(elements.as_slice()); - Ok(JsValue::String(JsString::new(s))) + // 3. Return result. + Ok(JsValue::String(JsString::new( + std::string::String::from_utf16_lossy(&result), + ))) } /// `String.prototype.toString ( )` @@ -544,7 +615,7 @@ impl String { IntegerOrInfinity::Integer(position) if (0..size).contains(&position) => { // 6. Let cp be ! CodePointAt(S, position). // 7. Return 𝔽(cp.[[CodePoint]]). - Ok(code_point_at(&string, position as u64).0.into()) + Ok(code_point_at(&string, position as u64).code_point.into()) } // 5. If position < 0 or position ≥ size, return undefined. 
_ => Ok(JsValue::undefined()), diff --git a/boa_engine/src/builtins/string/string_iterator.rs b/boa_engine/src/builtins/string/string_iterator.rs index 09983f532ad..21b1568ad6e 100644 --- a/boa_engine/src/builtins/string/string_iterator.rs +++ b/boa_engine/src/builtins/string/string_iterator.rs @@ -10,6 +10,8 @@ use crate::{ use boa_gc::{Finalize, Trace}; use boa_profiler::Profiler; +use super::CodePointInfo; + #[derive(Debug, Clone, Finalize, Trace)] pub struct StringIterator { string: JsValue, @@ -61,7 +63,11 @@ impl StringIterator { context, )); } - let (_, code_unit_count, _) = code_point_at(&native_string, position as u64); + let CodePointInfo { + code_point: _, + code_unit_count, + is_unpaired_surrogate: _, + } = code_point_at(&native_string, position as u64); string_iterator.next_index += i32::from(code_unit_count); let result_string = crate::builtins::string::String::substring( &string_iterator.string, diff --git a/boa_engine/src/builtins/string/tests.rs b/boa_engine/src/builtins/string/tests.rs index a8f3b2ea809..cefd29a8d87 100644 --- a/boa_engine/src/builtins/string/tests.rs +++ b/boa_engine/src/builtins/string/tests.rs @@ -1,3 +1,4 @@ +use super::{is_leading_surrogate, is_trailing_surrogate}; use crate::{forward, forward_val, Context}; #[test] @@ -1150,3 +1151,17 @@ fn search() { assert_eq!(forward(&mut context, "'aa'.search(/a/g)"), "0"); assert_eq!(forward(&mut context, "'ba'.search(/a/)"), "1"); } + +#[test] +fn ut_is_leading_surrogate() { + for cp in 0xD800..=0xDBFF { + assert!(is_leading_surrogate(cp), "failed: {cp:X}"); + } +} + +#[test] +fn ut_is_trailing_surrogate() { + for cp in 0xDC00..=0xDFFF { + assert!(is_trailing_surrogate(cp), "failed: {cp:X}"); + } +} diff --git a/boa_engine/src/builtins/uri/consts.rs b/boa_engine/src/builtins/uri/consts.rs new file mode 100644 index 00000000000..22ad56f5272 --- /dev/null +++ b/boa_engine/src/builtins/uri/consts.rs @@ -0,0 +1,110 @@ +//! URI handling function constants +//! +//! 
This module contains a few constants used to handle decoding and encoding for URI handling +//! functions. They make it easier and more performant to compare different ranges and code points. + +use std::ops::RangeInclusive; + +/// A range containing all the lowercase `uriAlpha` code points. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriAlpha +const URI_ALPHA_LOWER: RangeInclusive = b'a' as u16..=b'z' as u16; + +/// A range containing all the uppercase `uriAlpha` code points. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriAlpha +const URI_ALPHA_UPPER: RangeInclusive = b'A' as u16..=b'Z' as u16; + +/// A range containing all the `DecimalDigit` code points. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-DecimalDigit +const DECIMAL_DIGIT: RangeInclusive = b'0' as u16..=b'9' as u16; + +/// An array containing all the `uriMark` code points. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriMark +const URI_MARK: [u16; 9] = [ + b'-' as u16, + b'_' as u16, + b'.' as u16, + b'!' as u16, + b'~' as u16, + b'*' as u16, + b'\'' as u16, + b'(' as u16, + b')' as u16, +]; + +/// An array containing all the `uriReserved` code points. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriReserved +const URI_RESERVED: [u16; 10] = [ + b';' as u16, + b'/' as u16, + b'?' as u16, + b':' as u16, + b'@' as u16, + b'&' as u16, + b'=' as u16, + b'+' as u16, + b'$' as u16, + b',' as u16, +]; + +/// The number sign (`#`) symbol as a UTF-16 code point. +const NUMBER_SIGN: u16 = b'#' as u16; + +/// Checks whether the given code unit is a `uriUnescaped` character. +/// +/// Contains `uriAlpha`, `DecimalDigit` and `uriMark`. 
+/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped +#[inline] +pub(super) fn is_uri_unescaped(code_point: u16) -> bool { + URI_ALPHA_LOWER.contains(&code_point) + || URI_ALPHA_UPPER.contains(&code_point) + || DECIMAL_DIGIT.contains(&code_point) + || URI_MARK.contains(&code_point) +} + +/// Checks whether the given code unit is a `uriReserved` character or the number sign symbol (`#`). +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#prod-uriReserved +#[inline] +pub(super) fn is_uri_reserved_or_number_sign(code_point: u16) -> bool { + code_point == NUMBER_SIGN || URI_RESERVED.contains(&code_point) +} + +/// Checks whether the given code unit is a `uriReserved` or `uriUnescaped` character, or the number sign symbol (`#`). +/// +/// More information: +/// - [`uriReserved` in ECMAScript spec][uri_reserved] +/// - [`uriUnescaped` in ECMAScript spec][uri_unescaped] +/// +/// [uri_reserved]: https://tc39.es/ecma262/#prod-uriReserved +/// [uri_unescaped]: https://tc39.es/ecma262/#prod-uriUnescaped +#[inline] +pub(super) fn is_uri_reserved_or_uri_unescaped_or_number_sign(code_point: u16) -> bool { + code_point == NUMBER_SIGN || is_uri_unescaped(code_point) || URI_RESERVED.contains(&code_point) +} diff --git a/boa_engine/src/builtins/uri/mod.rs b/boa_engine/src/builtins/uri/mod.rs new file mode 100644 index 00000000000..19fb39d9486 --- /dev/null +++ b/boa_engine/src/builtins/uri/mod.rs @@ -0,0 +1,550 @@ +//! URI Handling Functions +//! +//! Uniform Resource Identifiers, or URIs, are Strings that identify resources (e.g. web pages or +//! files) and transport protocols by which to access them (e.g. HTTP or FTP) on the Internet. The +//! ECMAScript language itself does not provide any support for using URIs except for functions +//! that encode and decode URIs as described in 19.2.6.2, 19.2.6.3, 19.2.6.4 and 19.2.6.5 +//! +//! More information: +//! 
- [ECMAScript reference][spec] +//! +//! [spec]: https://tc39.es/ecma262/#sec-uri-handling-functions + +mod consts; + +use self::consts::{ + is_uri_reserved_or_number_sign, is_uri_reserved_or_uri_unescaped_or_number_sign, + is_uri_unescaped, +}; + +use super::{string::code_point_at, BuiltIn}; +use crate::{ + builtins::JsArgs, object::FunctionBuilder, property::Attribute, Context, JsResult, JsString, + JsValue, +}; + +/// URI Handling Functions +#[derive(Debug, Clone, Copy)] +pub(crate) struct Uri; + +impl BuiltIn for Uri { + const NAME: &'static str = "Uri"; + + fn init(context: &mut Context) -> Option { + let decode_uri = FunctionBuilder::native(context, Self::decode_uri) + .name("decodeURI") + .length(1) + .constructor(false) + .build(); + + context.register_global_property( + "decodeURI", + decode_uri, + Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, + ); + + let decode_uri_component = FunctionBuilder::native(context, Self::decode_uri_component) + .name("decodeURIComponent") + .length(1) + .constructor(false) + .build(); + + context.register_global_property( + "decodeURIComponent", + decode_uri_component, + Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, + ); + + let encode_uri = FunctionBuilder::native(context, Self::encode_uri) + .name("encodeURI") + .length(1) + .constructor(false) + .build(); + + context.register_global_property( + "encodeURI", + encode_uri, + Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, + ); + + let encode_uri_component = FunctionBuilder::native(context, Self::encode_uri_component) + .name("encodeURIComponent") + .length(1) + .constructor(false) + .build(); + + context.register_global_property( + "encodeURIComponent", + encode_uri_component, + Attribute::WRITABLE | Attribute::NON_ENUMERABLE | Attribute::CONFIGURABLE, + ); + + None + } +} + +impl Uri { + /// Builtin JavaScript `decodeURI ( encodedURI )` function. 
+ /// + /// This function computes a new version of a URI in which each escape sequence and UTF-8 + /// encoding of the sort that might be introduced by the `encodeURI` function is replaced with + /// the UTF-16 encoding of the code points that it represents. Escape sequences that could not + /// have been introduced by `encodeURI` are not replaced. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-decodeuri-encodeduri + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI + pub(crate) fn decode_uri( + _: &JsValue, + args: &[JsValue], + context: &mut Context, + ) -> JsResult { + let encoded_uri = args.get_or_undefined(0); + + // 1. Let uriString be ? ToString(encodedURI). + let uri_string = encoded_uri.to_string(context)?; + + // 2. Let reservedURISet be a String containing one instance of each code unit valid in uriReserved plus "#". + let reserved_uri_set = is_uri_reserved_or_number_sign; + + // 3. Return ? Decode(uriString, reservedURISet). + Ok(JsValue::from(decode( + context, + &uri_string, + reserved_uri_set, + )?)) + } + + /// Builtin JavaScript `decodeURIComponent ( encodedURIComponent )` function. + /// + /// This function computes a new version of a URI in which each escape sequence and UTF-8 + /// encoding of the sort that might be introduced by the `encodeURIComponent` function is + /// replaced with the UTF-16 encoding of the code points that it represents. 
+ /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-decodeuricomponent-encodeduricomponent + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent + pub(crate) fn decode_uri_component( + _: &JsValue, + args: &[JsValue], + context: &mut Context, + ) -> JsResult { + let encoded_uri_component = args.get_or_undefined(0); + + // 1. Let componentString be ? ToString(encodedURIComponent). + let component_string = encoded_uri_component.to_string(context)?; + + // 2. Let reservedURIComponentSet be the empty String. + let reserved_uri_component_set = |_: u16| false; + + // 3. Return ? Decode(componentString, reservedURIComponentSet). + Ok(JsValue::from(decode( + context, + &component_string, + reserved_uri_component_set, + )?)) + } + + /// Builtin JavaScript `encodeURI ( uri )` function. + /// + /// This function computes a new version of a UTF-16 encoded (6.1.4) URI in which each instance + /// of certain code points is replaced by one, two, three, or four escape sequences + /// representing the UTF-8 encoding of the code points. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-encodeuri-uri + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI + pub(crate) fn encode_uri( + _: &JsValue, + args: &[JsValue], + context: &mut Context, + ) -> JsResult { + let uri = args.get_or_undefined(0); + + // 1. Let uriString be ? ToString(uri). + let uri_string = uri.to_string(context)?; + + // 2. Let unescapedURISet be a String containing one instance of each code unit valid in uriReserved and uriUnescaped plus "#". + let unescaped_uri_set = is_uri_reserved_or_uri_unescaped_or_number_sign; + + // 3. Return ? Encode(uriString, unescapedURISet). 
+ Ok(JsValue::from(encode( + context, + &uri_string, + unescaped_uri_set, + )?)) + } + + /// Builtin JavaScript `encodeURIComponent ( uriComponent )` function. + /// + /// This function computes a new version of a UTF-16 encoded (6.1.4) URI in which each instance + /// of certain code points is replaced by one, two, three, or four escape sequences + /// representing the UTF-8 encoding of the code point. + /// + /// More information: + /// - [ECMAScript reference][spec] + /// - [MDN documentation][mdn] + /// + /// [spec]: https://tc39.es/ecma262/#sec-encodeuricomponent-uricomponent + /// [mdn]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent + pub(crate) fn encode_uri_component( + _: &JsValue, + args: &[JsValue], + context: &mut Context, + ) -> JsResult { + let uri_component = args.get_or_undefined(0); + + // 1. Let componentString be ? ToString(uriComponent). + let component_string = uri_component.to_string(context)?; + + // 2. Let unescapedURIComponentSet be a String containing one instance of each code unit valid in uriUnescaped. + let unescaped_uri_component_set = is_uri_unescaped; + + // 3. Return ? Encode(componentString, unescapedURIComponentSet). + Ok(JsValue::from(encode( + context, + &component_string, + unescaped_uri_component_set, + )?)) + } +} + +/// The `Encode ( string, unescapedSet )` abstract operation +/// +/// The abstract operation Encode takes arguments `string` (a String) and `unescapedSet` (a String) +/// and returns either a normal completion containing a String or a throw completion. It performs +/// URI encoding and escaping. +/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#sec-encode +fn encode(context: &mut Context, string: &JsString, unescaped_set: F) -> JsResult +where + F: Fn(u16) -> bool, +{ + let code_units = string.encode_utf16().collect::>(); + + // 1. Let strLen be the length of string. 
+ let str_len = code_units.len(); + + // 2. Let R be the empty String. + let mut r = String::new(); + + // 3. Let k be 0. + let mut k = 0; + // 4. Repeat, + loop { + // a. If k = strLen, return R. + if k == str_len { + return Ok(r); + } + + // b. Let C be the code unit at index k within string. + let c = code_units[k]; + + // c. If C is in unescapedSet, then + if unescaped_set(c) { + // i. Set k to k + 1. + k += 1; + + // ii. Set R to the string-concatenation of R and C. + r.push(char::from_u32(u32::from(c)).expect("char from code point cannot fail here")); + } else { + // d. Else, + // i. Let cp be CodePointAt(string, k). + let cp = code_point_at(string, k as u64); + + // ii. If cp.[[IsUnpairedSurrogate]] is true, throw a URIError exception. + if cp.is_unpaired_surrogate { + context.throw_uri_error("trying to encode an invalid string")?; + } + + // iii. Set k to k + cp.[[CodeUnitCount]]. + k += cp.code_unit_count as usize; + + // iv. Let Octets be the List of octets resulting by applying the UTF-8 transformation + // to cp.[[CodePoint]]. + let mut buff = [0_u8; 4]; // Will never be more than 4 bytes + + let octets = char::from_u32(cp.code_point) + .expect("valid unicode code point to char conversion failed") + .encode_utf8(&mut buff); + + // v. For each element octet of Octets, do + for octet in octets.bytes() { + // 1. Set R to the string-concatenation of: + // R + // "%" + // the String representation of octet, formatted as a two-digit uppercase + // hexadecimal number, padded to the left with a zero if necessary + // Append in place; rebuilding R through `format!` would copy the whole string per octet. + r.push_str(&format!("%{octet:0>2X}")); + } + } + } +} + +/// The `Decode ( string, reservedSet )` abstract operation. +/// +/// The abstract operation Decode takes arguments `string` (a String) and `reservedSet` (a String) +/// and returns either a normal completion containing a String or a throw completion. It performs +/// URI unescaping and decoding. 
+/// +/// More information: +/// - [ECMAScript reference][spec] +/// +/// [spec]: https://tc39.es/ecma262/#sec-decode +#[allow(clippy::many_single_char_names)] +fn decode(context: &mut Context, string: &JsString, reserved_set: F) -> JsResult +where + F: Fn(u16) -> bool, +{ + let code_units = string.encode_utf16().collect::>(); + + // 1. Let strLen be the length of string. + let str_len = code_units.len(); + // 2. Let R be the empty String. + let mut r = Vec::new(); + + // 3. Let k be 0. + let mut k = 0; + // 4. Repeat, + loop { + // a. If k = strLen, return R. + if k == str_len { + return Ok(String::from_utf16(&r).expect("invalid UTF-16 characters found")); + } + + // b. Let C be the code unit at index k within string. + let c = code_units[k]; + + // c. If C is not the code unit 0x0025 (PERCENT SIGN), then + #[allow(clippy::if_not_else)] + let s = if c != 0x0025_u16 { + // i. Let S be the String value containing only the code unit C. + Vec::from([c]) + } else { + // d. Else, + // i. Let start be k. + let start = k; + + // ii. If k + 2 ≥ strLen, throw a URIError exception. + if k + 2 >= str_len { + context.throw_uri_error("invalid escape character found")?; + } + + // iii. If the code units at index (k + 1) and (k + 2) within string do not represent + // hexadecimal digits, throw a URIError exception. + // iv. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). + let b = decode_hex_byte(code_units[k + 1], code_units[k + 2]) + .ok_or_else(|| context.construct_uri_error("invalid hexadecimal digit found"))?; + + // v. Set k to k + 2. + k += 2; + + // vi. Let n be the number of leading 1 bits in B. + let n = leading_one_bits(b); + + // vii. If n = 0, then + if n == 0 { + // 1. Let C be the code unit whose value is B. + let c = u16::from(b); + + // 2. If C is not in reservedSet, then + if !reserved_set(c) { + // a. Let S be the String value containing only the code unit C. + Vec::from([c]) + } else { + // 3. Else, + // a. 
Let S be the substring of string from start to k + 1. + Vec::from(&code_units[start..=k]) + } + } else { + // viii. Else, + // 1. If n = 1 or n > 4, throw a URIError exception. + if n == 1 || n > 4 { + context.throw_uri_error("invalid escaped character found")?; + } + + // 2. If k + (3 × (n - 1)) ≥ strLen, throw a URIError exception. + // The loop below reads up to index k + (3 × (n - 1)), so equality must be rejected too. + if k + (3 * (n - 1)) >= str_len { + context.throw_uri_error("non-terminated escape character found")?; + } + + // 3. Let Octets be « B ». + let mut octets = Vec::from([b]); + + // 4. Let j be 1. + // 5. Repeat, while j < n, + for _j in 1..n { + // a. Set k to k + 1. + k += 1; + + // b. If the code unit at index k within string is not the code unit 0x0025 (PERCENT SIGN), throw a URIError exception. + if code_units[k] != 0x0025 { + context + .throw_uri_error("escape characters must be preceded with a % sign")?; + } + + // c. If the code units at index (k + 1) and (k + 2) within string do not represent hexadecimal digits, throw a URIError exception. + // d. Let B be the 8-bit value represented by the two hexadecimal digits at index (k + 1) and (k + 2). + let b = + decode_hex_byte(code_units[k + 1], code_units[k + 2]).ok_or_else(|| { + context.construct_uri_error("invalid hexadecimal digit found") + })?; + + // e. Set k to k + 2. + k += 2; + + // f. Append B to Octets. + octets.push(b); + + // g. Set j to j + 1. + } + + // 6. Assert: The length of Octets is n. + assert_eq!(octets.len(), n); + + // 7. If Octets does not contain a valid UTF-8 encoding of a Unicode code point, throw a URIError exception. + match String::from_utf8(octets) { + Err(_) => { + return Err(context.construct_uri_error("invalid UTF-8 encoding found")) + } + Ok(v) => { + // 8. Let V be the code point obtained by applying the UTF-8 transformation to Octets, that is, from a List of octets into a 21-bit value. + + // 9. Let S be UTF16EncodeCodePoint(V). + // utf16_encode_codepoint(v) + v.encode_utf16().collect::>() + } + } + } + }; + + // e. 
Set R to the string-concatenation of R and S. + r.extend_from_slice(&s); + + // f. Set k to k + 1. + k += 1; + } +} + +/// Decodes a byte from two UTF-16 code units holding its hexadecimal digits (`high` first). +/// +/// Returns `None` if either code unit is not a hexadecimal digit. +fn decode_hex_byte(high: u16, low: u16) -> Option { + match ( + char::from_u32(u32::from(high)), + char::from_u32(u32::from(low)), + ) { + (Some(high), Some(low)) => match (high.to_digit(16), low.to_digit(16)) { + (Some(high), Some(low)) => Some(((high as u8) << 4) + low as u8), + _ => None, + }, + _ => None, + } +} + +/// Counts the number of leading 1 bits in a given byte. +#[inline] +fn leading_one_bits(byte: u8) -> usize { + // Masks are tested from the longest run of leading ones down to the shortest, so the first match wins + if byte == u8::MAX { + 8 + } else if byte == 0b1111_1110 { + 7 + } else if byte & 0b1111_1100 == 0b1111_1100 { + 6 + } else if byte & 0b1111_1000 == 0b1111_1000 { + 5 + } else if byte & 0b1111_0000 == 0b1111_0000 { + 4 + } else if byte & 0b1110_0000 == 0b1110_0000 { + 3 + } else if byte & 0b1100_0000 == 0b1100_0000 { + 2 + } else if byte & 0b1000_0000 == 0b1000_0000 { + 1 + } else { + 0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Checks if the `leading_one_bits()` function works as expected. 
+ #[test] + fn ut_leading_one_bits() { + assert_eq!(leading_one_bits(0b1111_1111), 8); + assert_eq!(leading_one_bits(0b1111_1110), 7); + + assert_eq!(leading_one_bits(0b1111_1100), 6); + assert_eq!(leading_one_bits(0b1111_1101), 6); + + assert_eq!(leading_one_bits(0b1111_1011), 5); + assert_eq!(leading_one_bits(0b1111_1000), 5); + + assert_eq!(leading_one_bits(0b1111_0000), 4); + assert_eq!(leading_one_bits(0b1111_0111), 4); + + assert_eq!(leading_one_bits(0b1110_0000), 3); + assert_eq!(leading_one_bits(0b1110_1111), 3); + + assert_eq!(leading_one_bits(0b1100_0000), 2); + assert_eq!(leading_one_bits(0b1101_1111), 2); + + assert_eq!(leading_one_bits(0b1000_0000), 1); + assert_eq!(leading_one_bits(0b1011_1111), 1); + + assert_eq!(leading_one_bits(0b0000_0000), 0); + assert_eq!(leading_one_bits(0b0111_1111), 0); + } + + /// Checks that the `decode_hex_byte()` function works as expected. + #[test] + fn ut_decode_byte() { + // Sunny day tests + assert_eq!( + decode_hex_byte(u16::from(b'2'), u16::from(b'0')).unwrap(), + 0x20 + ); + assert_eq!( + decode_hex_byte(u16::from(b'2'), u16::from(b'A')).unwrap(), + 0x2A + ); + assert_eq!( + decode_hex_byte(u16::from(b'3'), u16::from(b'C')).unwrap(), + 0x3C + ); + assert_eq!( + decode_hex_byte(u16::from(b'4'), u16::from(b'0')).unwrap(), + 0x40 + ); + assert_eq!( + decode_hex_byte(u16::from(b'7'), u16::from(b'E')).unwrap(), + 0x7E + ); + assert_eq!( + decode_hex_byte(u16::from(b'0'), u16::from(b'0')).unwrap(), + 0x00 + ); + + // Rainy day tests + assert!(decode_hex_byte(u16::from(b'-'), u16::from(b'0')).is_none()); + assert!(decode_hex_byte(u16::from(b'f'), u16::from(b'~')).is_none()); + assert!(decode_hex_byte(u16::from(b'A'), 0_u16).is_none()); + assert!(decode_hex_byte(u16::from(b'%'), u16::from(b'&')).is_none()); + + assert!(decode_hex_byte(0xFACD_u16, u16::from(b'-')).is_none()); + assert!(decode_hex_byte(u16::from(b'-'), 0xA0FD_u16).is_none()); + } +}