From 89be04c1ca504700463e282fa253c3fcc3ec2652 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 09:46:27 -0600 Subject: [PATCH 1/9] fast path for ascii strings --- src/py_string_cache.rs | 54 ++++++++++++++++++++++++++++++------------ src/python.rs | 6 ++--- src/string_decoder.rs | 13 ++++++++++ 3 files changed, 55 insertions(+), 18 deletions(-) diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index 92b70860..4791ae8e 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -2,10 +2,13 @@ use std::cell::RefCell; use ahash::random_state::RandomState; use pyo3::exceptions::{PyTypeError, PyValueError}; +use pyo3::ffi; use pyo3::prelude::*; use pyo3::sync::{GILOnceCell, GILProtected}; use pyo3::types::{PyBool, PyString}; +use crate::string_decoder::StrType; + #[derive(Debug, Clone, Copy)] pub enum StringCacheMode { All, @@ -45,9 +48,9 @@ impl From for StringCacheMode { } pub trait StringMaybeCache { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString>; + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString>; - fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { Self::get_key(py, json_str) } } @@ -55,7 +58,7 @@ pub trait StringMaybeCache { pub struct StringCacheAll; impl StringMaybeCache for StringCacheAll { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { cached_py_string(py, json_str) } } @@ -63,20 +66,20 @@ impl StringMaybeCache for StringCacheAll { pub struct StringCacheKeys; impl StringMaybeCache for StringCacheKeys { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { cached_py_string(py, json_str) } - fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { - PyString::new_bound(py, json_str) + fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + pystring_unicode_known(py, json_str) } } pub struct StringNoCache; impl StringMaybeCache for StringNoCache { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { - PyString::new_bound(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + pystring_unicode_known(py, json_str) } } @@ -98,12 +101,18 @@ pub fn cache_clear(py: Python) { get_string_cache!(py).borrow_mut().clear() } -pub fn cached_py_string<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { +static EMPTY_STRING: GILOnceCell> = GILOnceCell::new(); + +pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { // from tests, 0 and 1 character strings are faster not cached - if (2..64).contains(&raw_str.len()) { + let len = raw_str.s.len(); + if len == 0 { + let s = EMPTY_STRING.get_or_init(py, || unsafe { pystring_unicode(py, "") }.into_py(py)); + s.clone_ref(py).into_bound(py) + } else if (2..64).contains(&len) { get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str) } else { - PyString::new_bound(py, raw_str) + pystring_unicode_known(py, raw_str) } } @@ -135,13 +144,13 @@ impl Default for PyStringCache { impl PyStringCache { /// Lookup the cache for an entry with the given string. If it exists, return it. /// If it is not set or has a different string, insert it and return it. - fn get_or_insert<'py>(&mut self, py: Python<'py>, s: &str) -> Bound<'py, PyString> { - let hash = self.hash_builder.hash_one(s); + fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { + let hash = self.hash_builder.hash_one(raw_str.s); let hash_index = hash as usize % CAPACITY; let set_entry = |entry: &mut Entry| { - let py_str = PyString::new_bound(py, s); + let py_str = pystring_unicode_known(py, raw_str); *entry = Some((hash, py_str.to_owned().unbind())); py_str }; @@ -153,7 +162,7 @@ impl PyStringCache { // to avoid a string comparison, we first compare the hashes if *entry_hash == hash { // if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do - if py_str_ob.bind(py).to_str().ok() == Some(s) { + if py_str_ob.bind(py).to_str().ok() == Some(raw_str.s) { // the strings matched, return the cached string object return py_str_ob.bind(py).to_owned(); } @@ -183,3 +192,18 @@ impl PyStringCache { self.entries.fill(None); } } + +pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { + if raw_str.known_ascii { + unsafe { pystring_unicode(py, raw_str.s) } + } else { + PyString::new_bound(py, raw_str.s) + } +} +pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { + let ptr = ffi::PyUnicode_New(s.len() as isize, 127); + let data_ptr = ptr.cast::().offset(1) as *mut u8; + core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len()); + core::ptr::write(data_ptr.add(s.len()), 0); + Bound::from_owned_ptr(py, ptr).downcast_into_unchecked() +} diff --git a/src/python.rs b/src/python.rs index 4cd7421c..caea115e 100644 --- a/src/python.rs +++ b/src/python.rs @@ -85,7 +85,7 @@ impl<'j> PythonParser<'j> { } Peek::String => { let s = self.parser.consume_string::(&mut self.tape)?; - Ok(StringCache::get_value(py, s.as_str()).into_any()) + Ok(StringCache::get_value(py, s.as_str_type()).into_any()) } Peek::Array => { let peek_first = match self.parser.array_first() { @@ -162,12 +162,12 @@ impl<'j> PythonParser<'j> { } }; if let Some(first_key) = self.parser.object_first::(&mut self.tape)? { - let first_key = StringCache::get_key(py, first_key.as_str()); + let first_key = StringCache::get_key(py, first_key.as_str_type()); let peek = self.parser.peek()?; let first_value = self._check_take_value::(py, peek)?; set_item(first_key, first_value); while let Some(key) = self.parser.object_step::(&mut self.tape)? { - let key = StringCache::get_key(py, key.as_str()); + let key = StringCache::get_key(py, key.as_str_type()); let peek = self.parser.peek()?; let value = self._check_take_value::(py, peek)?; set_item(key, value); diff --git a/src/string_decoder.rs b/src/string_decoder.rs index bc102084..342a84d2 100644 --- a/src/string_decoder.rs +++ b/src/string_decoder.rs @@ -47,6 +47,12 @@ impl<'t, 'j> From> for Cow<'j, str> { } } +#[derive(Debug, Clone, Copy)] +pub struct StrType<'a> { + pub s: &'a str, + pub known_ascii: bool, +} + impl<'t, 'j> StringOutput<'t, 'j> { pub fn as_str(&self) -> &'t str { match self { @@ -54,6 +60,13 @@ impl<'t, 'j> StringOutput<'t, 'j> { Self::Data(s) => s, } } + + pub fn as_str_type(&self) -> StrType<'t> { + match self { + Self::Tape(s) => StrType { s, known_ascii: false }, + Self::Data(s) => StrType { s, known_ascii: true }, + } + } } // taken serde-rs/json but altered From 3c85f97cef5a398d864ca39b1ad788a8def755fc Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 11:40:41 -0600 Subject: [PATCH 2/9] use bytecount::num_chars --- Cargo.lock | 7 ++++++ Cargo.toml | 1 + src/lib.rs | 2 +- src/py_string_cache.rs | 49 ++++++++++++++++++------------------------ src/python.rs | 6 +++--- src/string_decoder.rs | 13 ----------- tests/python.rs | 20 +++++++++++++++++ 7 files changed, 53 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50526555..2ef27049 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -33,6 +33,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bytecount" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" + [[package]] name = "cfg-if" version = "1.0.0" @@ -115,6 +121,7 @@ version = "0.1.1" dependencies = [ "ahash", "bencher", + "bytecount", "codspeed-bencher-compat", "lexical-parse-float", "num-bigint", diff --git a/Cargo.toml b/Cargo.toml index 7abb6d5d..7db40d9b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ ahash = "0.8.0" smallvec = "1.11.0" pyo3 = { version = "0.21.0", default-features=false, features = ["num-bigint"], optional = true } lexical-parse-float = { version = "0.8.5", features = ["format"] } +bytecount = { version = "0.6.7", default_features = false, features = ["runtime-dispatch-simd"] } [features] python = ["dep:pyo3"] diff --git a/src/lib.rs b/src/lib.rs index dc3080f0..7f943ad6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,6 @@ pub use parse::Peek; pub use value::{JsonArray, JsonObject, JsonValue}; #[cfg(feature = "python")] -pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, StringCacheMode}; +pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, pystring_fast_new, StringCacheMode}; #[cfg(feature = "python")] pub use python::{map_json_error, python_parse}; diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index 4791ae8e..c9631104 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -7,8 +7,6 @@ use pyo3::prelude::*; use pyo3::sync::{GILOnceCell, GILProtected}; use pyo3::types::{PyBool, PyString}; -use crate::string_decoder::StrType; - #[derive(Debug, Clone, Copy)] pub enum StringCacheMode { All, @@ -48,9 +46,9 @@ impl From for StringCacheMode { } pub trait StringMaybeCache { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString>; + fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString>; - fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { Self::get_key(py, json_str) } } @@ -58,7 +56,7 @@ pub trait StringMaybeCache { pub struct StringCacheAll; impl StringMaybeCache for StringCacheAll { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { cached_py_string(py, json_str) } } @@ -66,20 +64,20 @@ impl StringMaybeCache for StringCacheAll { pub struct StringCacheKeys; impl StringMaybeCache for StringCacheKeys { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { cached_py_string(py, json_str) } - fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - pystring_unicode_known(py, json_str) + fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + pystring_fast_new(py, json_str) } } pub struct StringNoCache; impl StringMaybeCache for StringNoCache { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - pystring_unicode_known(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + pystring_fast_new(py, json_str) } } @@ -101,18 +99,12 @@ pub fn cache_clear(py: Python) { get_string_cache!(py).borrow_mut().clear() } -static EMPTY_STRING: GILOnceCell> = GILOnceCell::new(); - -pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { +pub fn cached_py_string<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { // from tests, 0 and 1 character strings are faster not cached - let len = raw_str.s.len(); - if len == 0 { - let s = EMPTY_STRING.get_or_init(py, || unsafe { pystring_unicode(py, "") }.into_py(py)); - s.clone_ref(py).into_bound(py) - } else if (2..64).contains(&len) { + if (2..64).contains(&raw_str.len()) { get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str) } else { - pystring_unicode_known(py, raw_str) + pystring_fast_new(py, raw_str) } } @@ -144,13 +136,13 @@ impl Default for PyStringCache { impl PyStringCache { /// Lookup the cache for an entry with the given string. If it exists, return it. /// If it is not set or has a different string, insert it and return it. - fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { - let hash = self.hash_builder.hash_one(raw_str.s); + fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { + let hash = self.hash_builder.hash_one(raw_str); let hash_index = hash as usize % CAPACITY; let set_entry = |entry: &mut Entry| { - let py_str = pystring_unicode_known(py, raw_str); + let py_str = pystring_fast_new(py, raw_str); *entry = Some((hash, py_str.to_owned().unbind())); py_str }; @@ -162,7 +154,7 @@ impl PyStringCache { // to avoid a string comparison, we first compare the hashes if *entry_hash == hash { // if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do - if py_str_ob.bind(py).to_str().ok() == Some(raw_str.s) { + if py_str_ob.bind(py).to_str().ok() == Some(raw_str) { // the strings matched, return the cached string object return py_str_ob.bind(py).to_owned(); } @@ -193,14 +185,15 @@ impl PyStringCache { } } -pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { - if raw_str.known_ascii { - unsafe { pystring_unicode(py, raw_str.s) } +pub fn pystring_fast_new<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { + if bytecount::num_chars(raw_str.as_bytes()) == raw_str.len() { + unsafe { pystring_unicode_ascii(py, raw_str) } } else { - PyString::new_bound(py, raw_str.s) + PyString::new_bound(py, raw_str) } } -pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { + +unsafe fn pystring_unicode_ascii<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { let ptr = ffi::PyUnicode_New(s.len() as isize, 127); let data_ptr = ptr.cast::().offset(1) as *mut u8; core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len()); diff --git a/src/python.rs b/src/python.rs index caea115e..4cd7421c 100644 --- a/src/python.rs +++ b/src/python.rs @@ -85,7 +85,7 @@ impl<'j> PythonParser<'j> { } Peek::String => { let s = self.parser.consume_string::(&mut self.tape)?; - Ok(StringCache::get_value(py, s.as_str_type()).into_any()) + Ok(StringCache::get_value(py, s.as_str()).into_any()) } Peek::Array => { let peek_first = match self.parser.array_first() { @@ -162,12 +162,12 @@ impl<'j> PythonParser<'j> { } }; if let Some(first_key) = self.parser.object_first::(&mut self.tape)? { - let first_key = StringCache::get_key(py, first_key.as_str_type()); + let first_key = StringCache::get_key(py, first_key.as_str()); let peek = self.parser.peek()?; let first_value = self._check_take_value::(py, peek)?; set_item(first_key, first_value); while let Some(key) = self.parser.object_step::(&mut self.tape)? { - let key = StringCache::get_key(py, key.as_str_type()); + let key = StringCache::get_key(py, key.as_str()); let peek = self.parser.peek()?; let value = self._check_take_value::(py, peek)?; set_item(key, value); diff --git a/src/string_decoder.rs b/src/string_decoder.rs index 342a84d2..bc102084 100644 --- a/src/string_decoder.rs +++ b/src/string_decoder.rs @@ -47,12 +47,6 @@ impl<'t, 'j> From> for Cow<'j, str> { } } -#[derive(Debug, Clone, Copy)] -pub struct StrType<'a> { - pub s: &'a str, - pub known_ascii: bool, -} - impl<'t, 'j> StringOutput<'t, 'j> { pub fn as_str(&self) -> &'t str { match self { @@ -60,13 +54,6 @@ impl<'t, 'j> StringOutput<'t, 'j> { Self::Data(s) => s, } } - - pub fn as_str_type(&self) -> StrType<'t> { - match self { - Self::Tape(s) => StrType { s, known_ascii: false }, - Self::Data(s) => StrType { s, known_ascii: true }, - } - } } // taken serde-rs/json but altered diff --git a/tests/python.rs b/tests/python.rs index 3920dc56..f981070c 100644 --- a/tests/python.rs +++ b/tests/python.rs @@ -269,3 +269,23 @@ fn test_cache_into() { ); }) } + +#[test] +fn test_unicode() { + let json = r#"["💩", "£"]"#; + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::None, false).unwrap(); + assert_eq!(obj.to_string(), "['💩', '£']"); + }) +} + +#[test] +fn test_unicode_cache() { + let json = r#"["💩", "£"]"#; + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::All, false).unwrap(); + assert_eq!(obj.to_string(), "['💩', '£']"); + }) +} From a5aea635cfb64370203c61a74f0cb29f9d5cb68a Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 11:45:39 -0600 Subject: [PATCH 3/9] Revert "use bytecount::num_chars" This reverts commit 569fe11f47df9d520df4d92b9751b5f339a23084. --- Cargo.lock | 7 ------ Cargo.toml | 1 - src/lib.rs | 2 +- src/py_string_cache.rs | 49 ++++++++++++++++++++++++------------------ src/python.rs | 6 +++--- src/string_decoder.rs | 13 +++++++++++ tests/python.rs | 20 ----------------- 7 files changed, 45 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2ef27049..50526555 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -33,12 +33,6 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" -[[package]] -name = "bytecount" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" - [[package]] name = "cfg-if" version = "1.0.0" @@ -121,7 +115,6 @@ version = "0.1.1" dependencies = [ "ahash", "bencher", - "bytecount", "codspeed-bencher-compat", "lexical-parse-float", "num-bigint", diff --git a/Cargo.toml b/Cargo.toml index 7db40d9b..7abb6d5d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,6 @@ ahash = "0.8.0" smallvec = "1.11.0" pyo3 = { version = "0.21.0", default-features=false, features = ["num-bigint"], optional = true } lexical-parse-float = { version = "0.8.5", features = ["format"] } -bytecount = { version = "0.6.7", default_features = false, features = ["runtime-dispatch-simd"] } [features] python = ["dep:pyo3"] diff --git a/src/lib.rs b/src/lib.rs index 7f943ad6..dc3080f0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,6 @@ pub use parse::Peek; pub use value::{JsonArray, JsonObject, JsonValue}; #[cfg(feature = "python")] -pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, pystring_fast_new, StringCacheMode}; +pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, StringCacheMode}; #[cfg(feature = "python")] pub use python::{map_json_error, python_parse}; diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index c9631104..4791ae8e 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -7,6 +7,8 @@ use pyo3::prelude::*; use pyo3::sync::{GILOnceCell, GILProtected}; use pyo3::types::{PyBool, PyString}; +use crate::string_decoder::StrType; + #[derive(Debug, Clone, Copy)] pub enum StringCacheMode { All, @@ -46,9 +48,9 @@ impl From for StringCacheMode { } pub trait StringMaybeCache { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString>; + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString>; - fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { Self::get_key(py, json_str) } } @@ -56,7 +58,7 @@ pub trait StringMaybeCache { pub struct StringCacheAll; impl StringMaybeCache for StringCacheAll { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { cached_py_string(py, json_str) } } @@ -64,20 +66,20 @@ impl StringMaybeCache for StringCacheAll { pub struct StringCacheKeys; impl StringMaybeCache for StringCacheKeys { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { cached_py_string(py, json_str) } - fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { - pystring_fast_new(py, json_str) + fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + pystring_unicode_known(py, json_str) } } pub struct StringNoCache; impl StringMaybeCache for StringNoCache { - fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { - pystring_fast_new(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { + pystring_unicode_known(py, json_str) } } @@ -99,12 +101,18 @@ pub fn cache_clear(py: Python) { get_string_cache!(py).borrow_mut().clear() } -pub fn cached_py_string<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { +static EMPTY_STRING: GILOnceCell> = GILOnceCell::new(); + +pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { // from tests, 0 and 1 character strings are faster not cached - if (2..64).contains(&raw_str.len()) { + let len = raw_str.s.len(); + if len == 0 { + let s = EMPTY_STRING.get_or_init(py, || unsafe { pystring_unicode(py, "") }.into_py(py)); + s.clone_ref(py).into_bound(py) + } else if (2..64).contains(&len) { get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str) } else { - pystring_fast_new(py, raw_str) + pystring_unicode_known(py, raw_str) } } @@ -136,13 +144,13 @@ impl Default for PyStringCache { impl PyStringCache { /// Lookup the cache for an entry with the given string. If it exists, return it. /// If it is not set or has a different string, insert it and return it. - fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { - let hash = self.hash_builder.hash_one(raw_str); + fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { + let hash = self.hash_builder.hash_one(raw_str.s); let hash_index = hash as usize % CAPACITY; let set_entry = |entry: &mut Entry| { - let py_str = pystring_fast_new(py, raw_str); + let py_str = pystring_unicode_known(py, raw_str); *entry = Some((hash, py_str.to_owned().unbind())); py_str }; @@ -154,7 +162,7 @@ impl PyStringCache { // to avoid a string comparison, we first compare the hashes if *entry_hash == hash { // if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do - if py_str_ob.bind(py).to_str().ok() == Some(raw_str) { + if py_str_ob.bind(py).to_str().ok() == Some(raw_str.s) { // the strings matched, return the cached string object return py_str_ob.bind(py).to_owned(); } @@ -185,15 +193,14 @@ impl PyStringCache { } } -pub fn pystring_fast_new<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { - if bytecount::num_chars(raw_str.as_bytes()) == raw_str.len() { - unsafe { pystring_unicode_ascii(py, raw_str) } +pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { + if raw_str.known_ascii { + unsafe { pystring_unicode(py, raw_str.s) } } else { - PyString::new_bound(py, raw_str) + PyString::new_bound(py, raw_str.s) } } - -unsafe fn pystring_unicode_ascii<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { +pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { let ptr = ffi::PyUnicode_New(s.len() as isize, 127); let data_ptr = ptr.cast::().offset(1) as *mut u8; core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len()); diff --git a/src/python.rs b/src/python.rs index 4cd7421c..caea115e 100644 --- a/src/python.rs +++ b/src/python.rs @@ -85,7 +85,7 @@ impl<'j> PythonParser<'j> { } Peek::String => { let s = self.parser.consume_string::(&mut self.tape)?; - Ok(StringCache::get_value(py, s.as_str()).into_any()) + Ok(StringCache::get_value(py, s.as_str_type()).into_any()) } Peek::Array => { let peek_first = match self.parser.array_first() { @@ -162,12 +162,12 @@ impl<'j> PythonParser<'j> { } }; if let Some(first_key) = self.parser.object_first::(&mut self.tape)? { - let first_key = StringCache::get_key(py, first_key.as_str()); + let first_key = StringCache::get_key(py, first_key.as_str_type()); let peek = self.parser.peek()?; let first_value = self._check_take_value::(py, peek)?; set_item(first_key, first_value); while let Some(key) = self.parser.object_step::(&mut self.tape)? { - let key = StringCache::get_key(py, key.as_str()); + let key = StringCache::get_key(py, key.as_str_type()); let peek = self.parser.peek()?; let value = self._check_take_value::(py, peek)?; set_item(key, value); diff --git a/src/string_decoder.rs b/src/string_decoder.rs index bc102084..342a84d2 100644 --- a/src/string_decoder.rs +++ b/src/string_decoder.rs @@ -47,6 +47,12 @@ impl<'t, 'j> From> for Cow<'j, str> { } } +#[derive(Debug, Clone, Copy)] +pub struct StrType<'a> { + pub s: &'a str, + pub known_ascii: bool, +} + impl<'t, 'j> StringOutput<'t, 'j> { pub fn as_str(&self) -> &'t str { match self { @@ -54,6 +60,13 @@ impl<'t, 'j> StringOutput<'t, 'j> { Self::Data(s) => s, } } + + pub fn as_str_type(&self) -> StrType<'t> { + match self { + Self::Tape(s) => StrType { s, known_ascii: false }, + Self::Data(s) => StrType { s, known_ascii: true }, + } + } } // taken serde-rs/json but altered diff --git a/tests/python.rs b/tests/python.rs index f981070c..3920dc56 100644 --- a/tests/python.rs +++ b/tests/python.rs @@ -269,23 +269,3 @@ fn test_cache_into() { ); }) } - -#[test] -fn test_unicode() { - let json = r#"["💩", "£"]"#; - Python::with_gil(|py| { - cache_clear(py); - let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::None, false).unwrap(); - assert_eq!(obj.to_string(), "['💩', '£']"); - }) -} - -#[test] -fn test_unicode_cache() { - let json = r#"["💩", "£"]"#; - Python::with_gil(|py| { - cache_clear(py); - let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::All, false).unwrap(); - assert_eq!(obj.to_string(), "['💩', '£']"); - }) -} From f31a6aaaa42104015088da9b8fb38763f9b18408 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 11:47:05 -0600 Subject: [PATCH 4/9] no empty string check --- src/py_string_cache.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index 4791ae8e..6e8fec40 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -101,15 +101,10 @@ pub fn cache_clear(py: Python) { get_string_cache!(py).borrow_mut().clear() } -static EMPTY_STRING: GILOnceCell> = GILOnceCell::new(); - pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { // from tests, 0 and 1 character strings are faster not cached let len = raw_str.s.len(); - if len == 0 { - let s = EMPTY_STRING.get_or_init(py, || unsafe { pystring_unicode(py, "") }.into_py(py)); - s.clone_ref(py).into_bound(py) - } else if (2..64).contains(&len) { + if (2..64).contains(&len) { get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str) } else { pystring_unicode_known(py, raw_str) @@ -200,6 +195,7 @@ pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<' PyString::new_bound(py, raw_str.s) } } + pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { let ptr = ffi::PyUnicode_New(s.len() as isize, 127); let data_ptr = ptr.cast::().offset(1) as *mut u8; From 0b71ca08d0e1c49e4c9c262f662b3b8860af7080 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 17:39:50 -0600 Subject: [PATCH 5/9] separate argument --- src/lib.rs | 2 +- src/py_string_cache.rs | 51 ++++++++++++++++++++---------------------- src/python.rs | 6 ++--- src/string_decoder.rs | 32 +++++++++++--------------- tests/python.rs | 40 ++++++++++++++++++++++++++++++++- 5 files changed, 80 insertions(+), 51 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index dc3080f0..7f943ad6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,6 @@ pub use parse::Peek; pub use value::{JsonArray, JsonObject, JsonValue}; #[cfg(feature = "python")] -pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, StringCacheMode}; +pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, pystring_fast_new, StringCacheMode}; #[cfg(feature = "python")] pub use python::{map_json_error, python_parse}; diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index 6e8fec40..4d1c90df 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -7,8 +7,6 @@ use pyo3::prelude::*; use pyo3::sync::{GILOnceCell, GILProtected}; use pyo3::types::{PyBool, PyString}; -use crate::string_decoder::StrType; - #[derive(Debug, Clone, Copy)] pub enum StringCacheMode { All, @@ -48,38 +46,38 @@ impl From for StringCacheMode { } pub trait StringMaybeCache { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString>; + fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString>; - fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - Self::get_key(py, json_str) + fn get_value<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + Self::get_key(py, json_str, ascii_only) } } pub struct StringCacheAll; impl StringMaybeCache for StringCacheAll { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - cached_py_string(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + cached_py_string(py, json_str, ascii_only) } } pub struct StringCacheKeys; impl StringMaybeCache for StringCacheKeys { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - cached_py_string(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + cached_py_string(py, json_str, ascii_only) } - fn get_value<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - pystring_unicode_known(py, json_str) + fn get_value<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + pystring_fast_new(py, json_str, ascii_only) } } pub struct StringNoCache; impl StringMaybeCache for StringNoCache { - fn get_key<'py>(py: Python<'py>, json_str: StrType) -> Bound<'py, PyString> { - pystring_unicode_known(py, json_str) + fn get_key<'py>(py: Python<'py>, json_str: &str, ascii_only: bool) -> Bound<'py, PyString> { + pystring_fast_new(py, json_str, ascii_only) } } @@ -101,13 +99,12 @@ pub fn cache_clear(py: Python) { get_string_cache!(py).borrow_mut().clear() } -pub fn cached_py_string<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { +pub fn cached_py_string<'py>(py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> { // from tests, 0 and 1 character strings are faster not cached - let len = raw_str.s.len(); - if (2..64).contains(&len) { - get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str) + if (2..64).contains(&s.len()) { + get_string_cache!(py).borrow_mut().get_or_insert(py, s, ascii_only) } else { - pystring_unicode_known(py, raw_str) + pystring_fast_new(py, s, ascii_only) } } @@ -139,13 +136,13 @@ impl Default for PyStringCache { impl PyStringCache { /// Lookup the cache for an entry with the given string. If it exists, return it. /// If it is not set or has a different string, insert it and return it. - fn get_or_insert<'py>(&mut self, py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { - let hash = self.hash_builder.hash_one(raw_str.s); + fn get_or_insert<'py>(&mut self, py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> { + let hash = self.hash_builder.hash_one(s); let hash_index = hash as usize % CAPACITY; let set_entry = |entry: &mut Entry| { - let py_str = pystring_unicode_known(py, raw_str); + let py_str = pystring_fast_new(py, s, ascii_only); *entry = Some((hash, py_str.to_owned().unbind())); py_str }; @@ -157,7 +154,7 @@ impl PyStringCache { // to avoid a string comparison, we first compare the hashes if *entry_hash == hash { // if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do - if py_str_ob.bind(py).to_str().ok() == Some(raw_str.s) { + if py_str_ob.bind(py).to_str().ok() == Some(s) { // the strings matched, return the cached string object return py_str_ob.bind(py).to_owned(); } @@ -188,15 +185,15 @@ impl PyStringCache { } } -pub fn pystring_unicode_known<'py>(py: Python<'py>, raw_str: StrType) -> Bound<'py, PyString> { - if raw_str.known_ascii { - unsafe { pystring_unicode(py, raw_str.s) } +pub fn pystring_fast_new<'py>(py: Python<'py>, s: &str, ascii_only: bool) -> Bound<'py, PyString> { + if ascii_only { + unsafe { pystring_ascii_new(py, s) } } else { - PyString::new_bound(py, raw_str.s) + PyString::new_bound(py, s) } } -pub unsafe fn pystring_unicode<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { +unsafe fn pystring_ascii_new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { let ptr = ffi::PyUnicode_New(s.len() as isize, 127); let data_ptr = ptr.cast::().offset(1) as *mut u8; core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len()); diff --git a/src/python.rs b/src/python.rs index caea115e..3cc88a38 100644 --- a/src/python.rs +++ b/src/python.rs @@ -85,7 +85,7 @@ impl<'j> PythonParser<'j> { } Peek::String => { let s = self.parser.consume_string::(&mut self.tape)?; - Ok(StringCache::get_value(py, s.as_str_type()).into_any()) + Ok(StringCache::get_value(py, s.as_str(), s.ascii_only()).into_any()) } Peek::Array => { let peek_first = match self.parser.array_first() { @@ -162,12 +162,12 @@ impl<'j> PythonParser<'j> { } }; if let Some(first_key) = self.parser.object_first::(&mut self.tape)? { - let first_key = StringCache::get_key(py, first_key.as_str_type()); + let first_key = StringCache::get_key(py, first_key.as_str(), first_key.ascii_only()); let peek = self.parser.peek()?; let first_value = self._check_take_value::(py, peek)?; set_item(first_key, first_value); while let Some(key) = self.parser.object_step::(&mut self.tape)? { - let key = StringCache::get_key(py, key.as_str_type()); + let key = StringCache::get_key(py, key.as_str(), key.ascii_only()); let peek = self.parser.peek()?; let value = self._check_take_value::(py, peek)?; set_item(key, value); diff --git a/src/string_decoder.rs b/src/string_decoder.rs index 342a84d2..cbf1623f 100644 --- a/src/string_decoder.rs +++ b/src/string_decoder.rs @@ -25,15 +25,15 @@ pub enum StringOutput<'t, 'j> where 'j: 't, { - Tape(&'t str), - Data(&'j str), + Tape(&'t str, bool), + Data(&'j str, bool), } impl From> for String { fn from(val: StringOutput) -> Self { match val { - StringOutput::Tape(s) => s.to_owned(), - StringOutput::Data(s) => s.to_owned(), + StringOutput::Tape(s, _) => s.to_owned(), + StringOutput::Data(s, _) => s.to_owned(), } } } @@ -41,30 +41,24 @@ impl From> for String { impl<'t, 'j> From> for Cow<'j, str> { fn from(val: StringOutput<'t, 'j>) -> Self { match val { - StringOutput::Tape(s) => s.to_owned().into(), - StringOutput::Data(s) => s.into(), + StringOutput::Tape(s, _) => s.to_owned().into(), + StringOutput::Data(s, _) => s.into(), } } } -#[derive(Debug, Clone, Copy)] -pub struct StrType<'a> { - pub s: &'a str, - pub known_ascii: bool, -} - impl<'t, 'j> StringOutput<'t, 'j> { pub fn as_str(&self) -> &'t str { match self { - Self::Tape(s) => s, - Self::Data(s) => s, + Self::Tape(s, _) => s, + Self::Data(s, _) => s, } } - pub fn as_str_type(&self) -> StrType<'t> { + pub fn ascii_only(&self) -> bool { match self { - Self::Tape(s) => StrType { s, known_ascii: false }, - Self::Data(s) => StrType { s, known_ascii: true }, + Self::Tape(_, ascii_only) => *ascii_only, + Self::Data(_, ascii_only) => *ascii_only, } } } @@ -156,7 +150,7 @@ where CharType::Quote => { let s = to_str(&data[start..index], ascii_only, start)?; index += 1; - return Ok((StringOutput::Data(s), index)); + return Ok((StringOutput::Data(s, ascii_only), index)); } CharType::Backslash => return decode_to_tape(data, index, tape, start, ascii_only), CharType::ControlChar => return json_err!(ControlCharacterWhileParsingString, index), @@ -217,7 +211,7 @@ fn decode_to_tape<'t, 'j>( tape.extend_from_slice(&data[last_escape..index]); index += 1; let s = to_str(tape, ascii_only, start)?; - return Ok((StringOutput::Tape(s), index)); + return Ok((StringOutput::Tape(s, ascii_only), index)); } CharType::Backslash => on_backslash!(), CharType::ControlChar => return json_err!(ControlCharacterWhileParsingString, index), diff --git a/tests/python.rs b/tests/python.rs index 3920dc56..97ba7540 100644 --- a/tests/python.rs +++ b/tests/python.rs @@ -2,7 +2,7 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyList, PyString}; use pyo3::ToPyObject; -use jiter::{cache_clear, cache_usage, map_json_error, python_parse, JsonValue, StringCacheMode}; +use jiter::{cache_clear, cache_usage, map_json_error, pystring_fast_new, python_parse, JsonValue, StringCacheMode}; #[test] fn test_to_py_object_numeric() { @@ -269,3 +269,41 @@ fn test_cache_into() { ); }) } + +#[test] +fn test_unicode() { + let json = r#"{"💩": "£"}"#; + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::None, false).unwrap(); + assert_eq!(obj.to_string(), "{'💩': '£'}"); + }) +} + +#[test] +fn test_unicode_cache() { + let json = r#"{"💩": "£"}"#; + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::All, false).unwrap(); + assert_eq!(obj.to_string(), "{'💩': '£'}"); + }) +} + +#[test] +fn test_pystring_fast_new_non_ascii() { + let json = "£100 💩"; + Python::with_gil(|py| { + let s = pystring_fast_new(py, json, false); + assert_eq!(s.to_string(), "£100 💩"); + }) +} + +#[test] +fn test_pystring_fast_new_ascii() { + let json = "100abc"; + Python::with_gil(|py| { + let s = pystring_fast_new(py, json, true); + assert_eq!(s.to_string(), "100abc"); + }) +} From c241c5db6ca3032c8677624d35027a85dfc5e840 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 18:21:48 -0600 Subject: [PATCH 6/9] add test --- tests/python.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/python.rs b/tests/python.rs index 97ba7540..2250bd9d 100644 --- a/tests/python.rs +++ b/tests/python.rs @@ -270,6 +270,16 @@ fn test_cache_into() { }) } +#[test] +fn test_use_tape() { + let json = r#" "foo\nbar" "#; + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, json.as_bytes(), false, StringCacheMode::None, false).unwrap(); + assert_eq!(obj.to_string(), "foo\nbar"); + }) +} + #[test] fn test_unicode() { let json = r#"{"💩": "£"}"#; From 71a7775ebde3eed7842ec2d4e32d5850c770f310 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 21:22:46 -0600 Subject: [PATCH 7/9] fix pyo3 version --- jiter-python/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jiter-python/Cargo.toml b/jiter-python/Cargo.toml index cb68896a..dd3f415d 100644 --- a/jiter-python/Cargo.toml +++ b/jiter-python/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -pyo3 = { version = "0.21.0-beta.0", features = ["num-bigint", "auto-initialize"] } +pyo3 = { version = "0.21.0", features = ["num-bigint", "auto-initialize"] } jiter = { path = "..", features = ["python"] } [features] From e31f7205972d40f74bf7e40c5dd8f1f6bdb58ede Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Mon, 1 Apr 2024 16:49:47 +0100 Subject: [PATCH 8/9] check PyUnicode_KIND, use PyUnicode_DATA --- src/py_string_cache.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index 4d1c90df..94488422 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -193,9 +193,13 @@ pub fn pystring_fast_new<'py>(py: Python<'py>, s: &str, ascii_only: bool) -> Bou } } +/// Faster creation of PyString from an ASCII string, inspired by +/// https://github.com/ijl/orjson/blob/3.10.0/src/str/create.rs#L41 unsafe fn pystring_ascii_new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> { let ptr = ffi::PyUnicode_New(s.len() as isize, 127); - let data_ptr = ptr.cast::().offset(1) as *mut u8; + // see https://github.com/pydantic/jiter/pull/72#discussion_r1545485907 + debug_assert_eq!(ffi::PyUnicode_KIND(ptr), ffi::PyUnicode_1BYTE_KIND); + let data_ptr = ffi::PyUnicode_DATA(ptr) as *mut u8; core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len()); core::ptr::write(data_ptr.add(s.len()), 0); Bound::from_owned_ptr(py, ptr).downcast_into_unchecked() From 95315f9343359861bdedc1916e93103f430383ea Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Tue, 2 Apr 2024 09:51:15 +0100 Subject: [PATCH 9/9] Cast not as Co-authored-by: David Hewitt --- src/py_string_cache.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs index 94488422..485c4951 100644 --- a/src/py_string_cache.rs +++ b/src/py_string_cache.rs @@ -199,7 +199,7 @@ unsafe fn pystring_ascii_new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyStri let ptr = ffi::PyUnicode_New(s.len() as isize, 127); // see https://github.com/pydantic/jiter/pull/72#discussion_r1545485907 debug_assert_eq!(ffi::PyUnicode_KIND(ptr), ffi::PyUnicode_1BYTE_KIND); - let data_ptr = ffi::PyUnicode_DATA(ptr) as *mut u8; + let data_ptr = ffi::PyUnicode_DATA(ptr).cast(); core::ptr::copy_nonoverlapping(s.as_ptr(), data_ptr, s.len()); core::ptr::write(data_ptr.add(s.len()), 0); Bound::from_owned_ptr(py, ptr).downcast_into_unchecked()