diff --git a/crates/js-sys/src/lib.rs b/crates/js-sys/src/lib.rs index 85fcab47ce7..c96b6ae0d74 100644 --- a/crates/js-sys/src/lib.rs +++ b/crates/js-sys/src/lib.rs @@ -3522,6 +3522,37 @@ impl JsString { None } } + + /// Returns whether this string is a valid UTF-16 string. + /// + /// This is useful for learning whether `String::from(..)` will return a + /// lossless representation of the JS string. If this string contains + /// unpaired surrogates then `String::from` will succeed but it will be a + /// lossy representation of the JS string because unpaired surrogates will + /// become replacement characters. + /// + /// If this function returns `false` then to get a lossless representation + /// of the string you'll need to manually use the `iter` method (or the + /// `char_code_at` accessor) to view the raw character codes. + /// + /// For more information, see the documentation on [JS strings vs Rust + /// strings][docs] + /// + /// [docs]: https://rustwasm.github.io/docs/wasm-bindgen/reference/types/str.html + pub fn is_valid_utf16(&self) -> bool { + std::char::decode_utf16(self.iter()).all(|i| i.is_ok()) + } + + /// Returns an iterator over the `u16` character codes that make up this JS + /// string. + /// + /// This method will call `char_code_at` for each code in this JS string, + /// returning an iterator of the codes in sequence. 
+ pub fn iter<'a>( + &'a self, + ) -> impl ExactSizeIterator + DoubleEndedIterator + 'a { + (0..self.length()).map(move |i| self.char_code_at(i) as u16) + } } impl PartialEq for JsString { diff --git a/crates/js-sys/tests/wasm/JsString.rs b/crates/js-sys/tests/wasm/JsString.rs index bb4a6ac0510..c7f229f1613 100644 --- a/crates/js-sys/tests/wasm/JsString.rs +++ b/crates/js-sys/tests/wasm/JsString.rs @@ -541,3 +541,15 @@ fn raw() { ); assert!(JsString::raw_0(&JsValue::null().unchecked_into()).is_err()); } + +#[wasm_bindgen_test] +fn is_valid_utf16() { + assert!(JsString::from("a").is_valid_utf16()); + assert!(JsString::from("").is_valid_utf16()); + assert!(JsString::from("🥑").is_valid_utf16()); + assert!(JsString::from("Why hello there this, 🥑, is 🥑 and is 🥑").is_valid_utf16()); + + assert!(JsString::from_char_code1(0x00).is_valid_utf16()); + assert!(!JsString::from_char_code1(0xd800).is_valid_utf16()); + assert!(!JsString::from_char_code1(0xdc00).is_valid_utf16()); +} diff --git a/examples/without-a-bundler/index.html b/examples/without-a-bundler/index.html index ee71e3d31a1..bb5352783b8 100644 --- a/examples/without-a-bundler/index.html +++ b/examples/without-a-bundler/index.html @@ -25,7 +25,12 @@ // Also note that the promise, when resolved, yields the wasm module's // exports which is the same as importing the `*_bg` module in other // modes - await init('./pkg/without_a_bundler_bg.wasm'); + // await init('./pkg/without_a_bundler_bg.wasm'); + + const url = await fetch('http://localhost:8001/pkg/without_a_bundler_bg.wasm'); + const body = await url.arrayBuffer(); + const module = await WebAssembly.compile(body); + await init(module); // And afterwards we can use all the functionality defined in wasm. 
const result = add(1, 2); diff --git a/guide/src/reference/types/str.md b/guide/src/reference/types/str.md index 999bbc183f9..5de5f166155 100644 --- a/guide/src/reference/types/str.md +++ b/guide/src/reference/types/str.md @@ -20,3 +20,30 @@ with handles to JavaScript string values, use the `js_sys::JsString` type. ```js {{#include ../../../../examples/guide-supported-types-examples/str.js}} ``` + +## UTF-16 vs UTF-8 + +Strings in JavaScript are encoded as UTF-16, but with one major exception: they +can contain unpaired surrogates. For some Unicode characters UTF-16 uses two +16-bit values. These are called "surrogate pairs" because they always come in +pairs. In JavaScript, it is possible for these surrogate pairs to be missing the +other half, creating an "unpaired surrogate". + +When passing a string from JavaScript to Rust, it uses the `TextEncoder` API to +convert from UTF-16 to UTF-8. This is normally perfectly fine... unless there +are unpaired surrogates. In that case it will replace the unpaired surrogates +with U+FFFD (�, the replacement character). That means the string in Rust is +now different from the string in JavaScript! + +If you want to guarantee that the Rust string is the same as the JavaScript +string, you should instead use `js_sys::JsString` (which keeps the string in +JavaScript and doesn't copy it into Rust). + +If you want to access the raw value of a JS string, you can use `JsString::iter`, +which returns an `Iterator<Item = u16>`. This perfectly preserves everything +(including unpaired surrogates), but it does not do any encoding (so you +have to do that yourself!). + +If you simply want to ignore strings which contain unpaired surrogates, you can +use `JsString::is_valid_utf16` to test whether the string contains unpaired +surrogates or not. 
diff --git a/guide/src/reference/types/string.md b/guide/src/reference/types/string.md index 568e20b63e1..3b846704abf 100644 --- a/guide/src/reference/types/string.md +++ b/guide/src/reference/types/string.md @@ -8,6 +8,9 @@ Copies the string's contents back and forth between the JavaScript garbage-collected heap and the Wasm linear memory with `TextDecoder` and `TextEncoder` +> **Note**: Be sure to check out the [documentation for `str`](str.html) to +> learn about some caveats when working with strings between JS and Rust. + ## Example Rust Usage ```rust diff --git a/src/lib.rs b/src/lib.rs index 0cd5034b812..1c1f78ff276 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -260,6 +260,16 @@ impl JsValue { /// /// If this JS value is not an instance of a string or if it's not valid /// utf-8 then this returns `None`. + /// + /// # UTF-16 vs UTF-8 + /// + /// JavaScript strings in general are encoded as UTF-16, but Rust strings + /// are encoded as UTF-8. This can cause the Rust string to look a bit + /// different than the JS string sometimes. For more details see the + /// [documentation about the `str` type][caveats] which contains a few + /// caveats about the encodings. + /// + /// [caveats]: https://rustwasm.github.io/docs/wasm-bindgen/reference/types/str.html #[cfg(feature = "std")] pub fn as_string(&self) -> Option { unsafe {