Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Add URI encoding and decoding functions #2267

Closed
wants to merge 9 commits into from
Closed
6 changes: 4 additions & 2 deletions boa_engine/src/builtins/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pub mod string;
pub mod symbol;
pub mod typed_array;
pub mod undefined;
pub mod uri;

#[cfg(feature = "console")]
pub mod console;
Expand Down Expand Up @@ -81,7 +82,7 @@ use crate::{
builtins::{
array_buffer::ArrayBuffer, async_generator::AsyncGenerator,
async_generator_function::AsyncGeneratorFunction, generator::Generator,
generator_function::GeneratorFunction, typed_array::TypedArray,
generator_function::GeneratorFunction, typed_array::TypedArray, uri::Uri,
},
property::{Attribute, PropertyDescriptor},
Context, JsValue,
Expand Down Expand Up @@ -193,7 +194,8 @@ pub fn init(context: &mut Context) {
Promise,
AsyncFunction,
AsyncGenerator,
AsyncGeneratorFunction
AsyncGeneratorFunction,
Uri
};

#[cfg(feature = "intl")]
Expand Down
5 changes: 3 additions & 2 deletions boa_engine/src/builtins/regexp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1745,7 +1745,8 @@ fn advance_string_index(s: &JsString, index: u64, unicode: bool) -> u64 {
}

// 5. Let cp be ! CodePointAt(S, index).
let (_, offset, _) = crate::builtins::string::code_point_at(s, index);
let cp = crate::builtins::string::code_point_at(s, index);

index + u64::from(offset)
// 6. Return index + cp.[[CodeUnitCount]].
index + u64::from(cp.code_unit_count)
}
109 changes: 90 additions & 19 deletions boa_engine/src/builtins/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,29 +40,87 @@ pub(crate) enum Placement {
End,
}

pub(crate) fn code_point_at(string: &JsString, position: u64) -> (u32, u8, bool) {
/// Code point information for the `CodePointAt` abstract operation.
#[derive(Debug, Clone, Copy)]
pub(crate) struct CodePointInfo {
pub(crate) code_point: u32,
pub(crate) code_unit_count: u8,
pub(crate) is_unpaired_surrogate: bool,
}

/// The `CodePointAt ( string, position )` abstract operation.
///
/// The abstract operation `CodePointAt` takes arguments `string` (a String) and `position` (a
/// non-negative integer) and returns a Record with fields `[[CodePoint]]` (a code point),
/// `[[CodeUnitCount]]` (a positive integer), and `[[IsUnpairedSurrogate]]` (a Boolean). It
/// interprets string as a sequence of UTF-16 encoded code points, as described in 6.1.4, and reads
/// from it a single code point starting with the code unit at index `position`.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-codepointat
pub(crate) fn code_point_at(string: &JsString, position: u64) -> CodePointInfo {
let mut encoded = string.encode_utf16();

// 1. Let size be the length of string.
let size = encoded.clone().count() as u64;

// 2. Assert: position ≥ 0 and position < size.
assert!(position < size);

// 3. Let first be the code unit at index position within string.
let first = encoded
.nth(position as usize)
.expect("The callers of this function must've already checked bounds.");

// 4. Let cp be the code point whose numeric value is that of first.
let cp = u32::from(first);

// 5. If first is not a leading surrogate or trailing surrogate, then
if !is_leading_surrogate(first) && !is_trailing_surrogate(first) {
return (u32::from(first), 1, false);
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: false }.
return CodePointInfo {
code_point: cp,
code_unit_count: 1,
is_unpaired_surrogate: false,
};
}

// 6. If first is a trailing surrogate or position + 1 = size, then
if is_trailing_surrogate(first) || position + 1 == size {
return (u32::from(first), 1, true);
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
return CodePointInfo {
code_point: cp,
code_unit_count: 1,
is_unpaired_surrogate: true,
};
}

// 7. Let second be the code unit at index position + 1 within string.
let second = encoded
.next()
.expect("The callers of this function must've already checked bounds.");

// 8. If second is not a trailing surrogate, then
if !is_trailing_surrogate(second) {
return (u32::from(first), 1, true);
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
return CodePointInfo {
code_point: cp,
code_unit_count: 1,
is_unpaired_surrogate: true,
};
}

// 9. Set cp to UTF16SurrogatePairToCodePoint(first, second).
let cp = (u32::from(first) - 0xD800) * 0x400 + (u32::from(second) - 0xDC00) + 0x10000;
(cp, 2, false)

// 10. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }.
CodePointInfo {
code_point: cp,
code_unit_count: 2,
is_unpaired_surrogate: false,
}
}

/// Helper function to check if a `char` is trimmable.
Expand All @@ -86,10 +144,22 @@ pub(crate) fn is_trimmable_whitespace(c: char) -> bool {
)
}

/// Tells whether a UTF-16 code unit is a leading (high) surrogate, i.e. whether it lies in the
/// inclusive range `0xD800..=0xDBFF`.
///
/// More information:
///  - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#leading-surrogate
pub(crate) fn is_leading_surrogate(value: u16) -> bool {
    matches!(value, 0xD800..=0xDBFF)
}

/// Tells whether a UTF-16 code unit is a trailing (low) surrogate, i.e. whether it lies in the
/// inclusive range `0xDC00..=0xDFFF`.
///
/// More information:
///  - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#trailing-surrogate
pub(crate) fn is_trailing_surrogate(value: u16) -> bool {
    matches!(value, 0xDC00..=0xDFFF)
}
Expand Down Expand Up @@ -369,7 +439,7 @@ impl String {
}
}

/// `String.fromCharCode(...codePoints)`
/// `String.fromCharCode(...codeUnits)`
///
/// Construct a `String` from one or more UTF-16 code units (as numbers).
/// More information:
Expand All @@ -381,21 +451,22 @@ impl String {
args: &[JsValue],
context: &mut Context,
) -> JsResult<JsValue> {
// 1. Let length be the number of elements in codeUnits.
// 2. Let elements be a new empty List.
let mut elements = Vec::new();
// 3. For each element next of codeUnits, do
// 1. Let result be the empty String.
let mut result = Vec::new();

// 2. For each element next of codeUnits, do
for next in args {
// 3a. Let nextCU be ℝ(? ToUint16(next)).
// 3b. Append nextCU to the end of elements.
elements.push(next.to_uint16(context)?);
}
// a. Let nextCU be the code unit whose numeric value is ℝ(? ToUint16(next)).
let next_cu = next.to_uint16(context)?;

// 4. Return the String value whose code units are the elements in the List elements.
// If codeUnits is empty, the empty String is returned.
// b. Set result to the string-concatenation of result and nextCU.
result.push(next_cu);
}

let s = std::string::String::from_utf16_lossy(elements.as_slice());
Ok(JsValue::String(JsString::new(s)))
// 3. Return result.
Ok(JsValue::String(JsString::new(
std::string::String::from_utf16_lossy(&result),
)))
}

/// `String.prototype.toString ( )`
Expand Down Expand Up @@ -544,7 +615,7 @@ impl String {
IntegerOrInfinity::Integer(position) if (0..size).contains(&position) => {
// 6. Let cp be ! CodePointAt(S, position).
// 7. Return 𝔽(cp.[[CodePoint]]).
Ok(code_point_at(&string, position as u64).0.into())
Ok(code_point_at(&string, position as u64).code_point.into())
}
// 5. If position < 0 or position ≥ size, return undefined.
_ => Ok(JsValue::undefined()),
Expand Down
8 changes: 7 additions & 1 deletion boa_engine/src/builtins/string/string_iterator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use crate::{
use boa_gc::{Finalize, Trace};
use boa_profiler::Profiler;

use super::CodePointInfo;

#[derive(Debug, Clone, Finalize, Trace)]
pub struct StringIterator {
string: JsValue,
Expand Down Expand Up @@ -61,7 +63,11 @@ impl StringIterator {
context,
));
}
let (_, code_unit_count, _) = code_point_at(&native_string, position as u64);
let CodePointInfo {
code_point: _,
code_unit_count,
is_unpaired_surrogate: _,
} = code_point_at(&native_string, position as u64);
string_iterator.next_index += i32::from(code_unit_count);
let result_string = crate::builtins::string::String::substring(
&string_iterator.string,
Expand Down
15 changes: 15 additions & 0 deletions boa_engine/src/builtins/string/tests.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use super::{is_leading_surrogate, is_trailing_surrogate};
use crate::{forward, forward_val, Context};

#[test]
Expand Down Expand Up @@ -1150,3 +1151,17 @@ fn search() {
assert_eq!(forward(&mut context, "'aa'.search(/a/g)"), "0");
assert_eq!(forward(&mut context, "'ba'.search(/a/)"), "1");
}

/// Exhaustively checks `is_leading_surrogate` over every possible UTF-16 code unit.
///
/// Checking only the accepted range would let an implementation that always returns `true`
/// pass, so values outside `0xD800..=0xDBFF` are verified to be rejected as well.
#[test]
fn ut_is_leading_surrogate() {
    for cp in 0..=u16::MAX {
        let expected = (0xD800..=0xDBFF).contains(&cp);
        assert_eq!(is_leading_surrogate(cp), expected, "failed: {cp:X}");
    }
}

/// Exhaustively checks `is_trailing_surrogate` over every possible UTF-16 code unit.
///
/// Checking only the accepted range would let an implementation that always returns `true`
/// pass, so values outside `0xDC00..=0xDFFF` are verified to be rejected as well.
#[test]
fn ut_is_trailing_surrogate() {
    for cp in 0..=u16::MAX {
        let expected = (0xDC00..=0xDFFF).contains(&cp);
        assert_eq!(is_trailing_surrogate(cp), expected, "failed: {cp:X}");
    }
}
110 changes: 110 additions & 0 deletions boa_engine/src/builtins/uri/consts.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
//! URI handling function constants
//!
//! This module contains a few constants used to handle decoding and encoding for URI handling
//! functions. They make it easier and more performant to compare different ranges and code points.

use std::ops::RangeInclusive;

/// A range containing all the lowercase `uriAlpha` code points.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-uriAlpha
const URI_ALPHA_LOWER: RangeInclusive<u16> = b'a' as u16..=b'z' as u16;

/// A range containing all the uppercase `uriAlpha` code points.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-uriAlpha
const URI_ALPHA_UPPER: RangeInclusive<u16> = b'A' as u16..=b'Z' as u16;

/// A range containing all the `DecimalDigit` code points.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-DecimalDigit
const DECIMAL_DIGIT: RangeInclusive<u16> = b'0' as u16..=b'9' as u16;

/// An array containing all the `uriMark` code points.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-uriMark
const URI_MARK: [u16; 9] = [
b'-' as u16,
b'_' as u16,
b'.' as u16,
b'!' as u16,
b'~' as u16,
b'*' as u16,
b'\'' as u16,
b'(' as u16,
b')' as u16,
];

/// An array containing all the `uriReserved` code points.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-uriReserved
const URI_RESERVED: [u16; 10] = [
b';' as u16,
b'/' as u16,
b'?' as u16,
b':' as u16,
b'@' as u16,
b'&' as u16,
b'=' as u16,
b'+' as u16,
b'$' as u16,
b',' as u16,
];

/// The number sign (`#`) symbol as a UTF-16 code unit.
const NUMBER_SIGN: u16 = b'#' as u16;

/// Constant with all the unescaped URI characters.
///
/// Contains `uriAlpha`, `DecimalDigit` and `uriMark`.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-uriUnescaped
#[inline]
pub(super) fn is_uri_unescaped(code_point: u16) -> bool {
jedel1043 marked this conversation as resolved.
Show resolved Hide resolved
URI_ALPHA_LOWER.contains(&code_point)
|| URI_ALPHA_UPPER.contains(&code_point)
|| DECIMAL_DIGIT.contains(&code_point)
|| URI_MARK.contains(&code_point)
}

/// Checks whether the given UTF-16 code unit is the number sign symbol (`#`) or one of the
/// reserved URI characters.
///
/// More information:
///  - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#prod-uriReserved
#[inline]
pub(super) fn is_uri_reserved_or_number_sign(code_point: u16) -> bool {
    match code_point {
        NUMBER_SIGN => true,
        other => URI_RESERVED.contains(&other),
    }
}

/// Constant with all the reserved and unescaped URI characters, plus the number sign symbol (`#`).
///
/// More information:
/// - [`uriReserved` in ECMAScript spec][uri_reserved]
/// - [`uriUnescaped` in ECMAScript spec][uri_unescaped]
///
/// [uri_reserved]: https://tc39.es/ecma262/#prod-uriReserved
/// [uri_unescaped]: https://tc39.es/ecma262/#prod-uriUnescaped
#[inline]
pub(super) fn is_uri_reserved_or_uri_unescaped_or_number_sign(code_point: u16) -> bool {
code_point == NUMBER_SIGN || is_uri_unescaped(code_point) || URI_RESERVED.contains(&code_point)
}
Loading