Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement wrapper for PyASCIIObject.state bitfield accesses #3015

Merged
merged 1 commit into from
Mar 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions newsfragments/3015.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Support `PyASCIIObject` / `PyUnicode` and associated methods on big-endian architectures.
273 changes: 251 additions & 22 deletions pyo3-ffi/src/cpython/unicodeobject.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,184 @@ use std::os::raw::{c_char, c_int, c_uint, c_void};
// skipped Py_UNICODE_HIGH_SURROGATE
// skipped Py_UNICODE_LOW_SURROGATE

// generated by bindgen v0.63.0 (with small adaptations)
/// Byte-backed storage for a C bitfield, addressed one bit at a time.
///
/// Bit `index` lives in byte `index / 8`. Inside that byte, bits are counted from the least
/// significant end on little-endian targets and from the most significant end on big-endian
/// targets.
#[repr(C)]
struct BitfieldUnit<Storage> {
    storage: Storage,
}

impl<Storage> BitfieldUnit<Storage> {
    /// Wraps `storage` as-is; no bits are changed.
    #[inline]
    pub const fn new(storage: Storage) -> Self {
        BitfieldUnit { storage }
    }
}

impl<Storage> BitfieldUnit<Storage>
where
    Storage: AsRef<[u8]> + AsMut<[u8]>,
{
    /// Shift amount that selects bit `index` inside its containing byte.
    #[inline]
    fn shift_within_byte(index: usize) -> usize {
        let position = index % 8;
        if cfg!(target_endian = "big") {
            7 - position
        } else {
            position
        }
    }

    /// Reads the single bit at `index`.
    ///
    /// Panics if `index / 8` is outside the storage.
    #[inline]
    fn get_bit(&self, index: usize) -> bool {
        let bytes = self.storage.as_ref();
        debug_assert!(index / 8 < bytes.len());
        (bytes[index / 8] >> Self::shift_within_byte(index)) & 1 != 0
    }

    /// Writes `val` to the single bit at `index`, leaving all other bits untouched.
    ///
    /// Panics if `index / 8` is outside the storage.
    #[inline]
    fn set_bit(&mut self, index: usize, val: bool) {
        debug_assert!(index / 8 < self.storage.as_ref().len());
        let slot = &mut self.storage.as_mut()[index / 8];
        let mask = 1u8 << Self::shift_within_byte(index);
        if val {
            *slot |= mask;
        } else {
            *slot &= !mask;
        }
    }

    /// Reads the `bit_width`-bit field that starts at `bit_offset` and returns it as an integer.
    ///
    /// On big-endian targets the first storage bit of the field is the most significant bit of
    /// the result; on little-endian targets it is the least significant bit.
    #[inline]
    fn get(&self, bit_offset: usize, bit_width: u8) -> u64 {
        let width = usize::from(bit_width);
        debug_assert!(bit_width <= 64);
        debug_assert!(bit_offset / 8 < self.storage.as_ref().len());
        debug_assert!((bit_offset + width) / 8 <= self.storage.as_ref().len());
        (0..width)
            .filter(|&i| self.get_bit(bit_offset + i))
            .fold(0u64, |acc, i| {
                let target = if cfg!(target_endian = "big") {
                    width - 1 - i
                } else {
                    i
                };
                acc | (1u64 << target)
            })
    }

    /// Stores the low `bit_width` bits of `val` into the field that starts at `bit_offset`.
    ///
    /// Bits of `val` above `bit_width` are ignored; storage bits outside the field are kept.
    #[inline]
    fn set(&mut self, bit_offset: usize, bit_width: u8, val: u64) {
        let width = usize::from(bit_width);
        debug_assert!(bit_width <= 64);
        debug_assert!(bit_offset / 8 < self.storage.as_ref().len());
        debug_assert!((bit_offset + width) / 8 <= self.storage.as_ref().len());
        for i in 0..width {
            let bit_is_set = (val >> i) & 1 == 1;
            // Mirror the value's bit order inside the field on big-endian targets.
            let target = if cfg!(target_endian = "big") {
                width - 1 - i
            } else {
                i
            };
            self.set_bit(bit_offset + target, bit_is_set);
        }
    }
}

// generated by bindgen v0.63.0 (with small adaptations)
// The same code is generated for Python 3.7, 3.8, 3.9, 3.10, and 3.11, but the "ready" field
// has been removed from Python 3.12.

/// Wrapper around the `PyASCIIObject.state` bitfield with getters and setters that work
/// on most little- and big-endian architectures.
///
/// Memory layout of C bitfields is implementation defined, so these functions are still
/// unsafe. Users must verify that they work as expected on the architectures they target.
///
/// Field positions used by the accessors (bit offset, width): `interned` (0, 2), `kind` (2, 3),
/// `compact` (5, 1), `ascii` (6, 1), `ready` (7, 1).
#[repr(C)]
#[repr(align(4))]
struct PyASCIIObjectState {
    _bitfield_align: [u8; 0],
    _bitfield: BitfieldUnit<[u8; 4usize]>,
}

// `c_uint` and `u32` are not necessarily the same type on all targets / architectures, so the
// accessors convert with `as` / `From` instead of `transmute`: `transmute` only compiles when
// both types have the same size, while these conversions are valid for any `c_uint` width.
impl PyASCIIObjectState {
    /// Reads the 2-bit `interned` field.
    #[inline]
    unsafe fn interned(&self) -> c_uint {
        // The value has at most 2 significant bits, so the narrowing cast is lossless.
        self._bitfield.get(0usize, 2u8) as c_uint
    }

    /// Writes the low 2 bits of `val` to the `interned` field.
    #[inline]
    unsafe fn set_interned(&mut self, val: c_uint) {
        self._bitfield.set(0usize, 2u8, u64::from(val))
    }

    /// Reads the 3-bit `kind` field.
    #[inline]
    unsafe fn kind(&self) -> c_uint {
        // The value has at most 3 significant bits, so the narrowing cast is lossless.
        self._bitfield.get(2usize, 3u8) as c_uint
    }

    /// Writes the low 3 bits of `val` to the `kind` field.
    #[inline]
    unsafe fn set_kind(&mut self, val: c_uint) {
        self._bitfield.set(2usize, 3u8, u64::from(val))
    }

    /// Reads the 1-bit `compact` flag.
    #[inline]
    unsafe fn compact(&self) -> c_uint {
        self._bitfield.get(5usize, 1u8) as c_uint
    }

    /// Writes the lowest bit of `val` to the `compact` flag.
    #[inline]
    unsafe fn set_compact(&mut self, val: c_uint) {
        self._bitfield.set(5usize, 1u8, u64::from(val))
    }

    /// Reads the 1-bit `ascii` flag.
    #[inline]
    unsafe fn ascii(&self) -> c_uint {
        self._bitfield.get(6usize, 1u8) as c_uint
    }

    /// Writes the lowest bit of `val` to the `ascii` flag.
    #[inline]
    unsafe fn set_ascii(&mut self, val: c_uint) {
        self._bitfield.set(6usize, 1u8, u64::from(val))
    }

    /// Reads the 1-bit `ready` flag.
    #[inline]
    unsafe fn ready(&self) -> c_uint {
        self._bitfield.get(7usize, 1u8) as c_uint
    }

    /// Writes the lowest bit of `val` to the `ready` flag.
    #[inline]
    unsafe fn set_ready(&mut self, val: c_uint) {
        self._bitfield.set(7usize, 1u8, u64::from(val))
    }
}

impl From<u32> for PyASCIIObjectState {
    /// Reinterprets the native-endian bytes of `value` as the bitfield storage.
    #[inline]
    fn from(value: u32) -> Self {
        PyASCIIObjectState {
            _bitfield_align: [],
            _bitfield: BitfieldUnit::new(value.to_ne_bytes()),
        }
    }
}

impl From<PyASCIIObjectState> for u32 {
    /// Reassembles the bitfield storage bytes into a native-endian `u32`.
    #[inline]
    fn from(value: PyASCIIObjectState) -> Self {
        u32::from_ne_bytes(value._bitfield.storage)
    }
}

#[repr(C)]
pub struct PyASCIIObject {
pub ob_base: PyObject,
Expand All @@ -52,34 +230,98 @@ pub struct PyASCIIObject {
}

/// Interacting with the bitfield is not actually well-defined, so we mark these APIs unsafe.
///
/// In addition, they are disabled on big-endian architectures to restrict this to most "common"
/// platforms, which are at least tested on CI and appear to be sound.
#[cfg(target_endian = "little")]
impl PyASCIIObject {
// NOTE(review): every getter below carries two tail expressions: a direct mask/shift of
// `self.state` followed by a call through `PyASCIIObjectState`. This looks like the removed and
// added lines of a diff rendered without +/- markers; confirm which one is live before editing.
// The same applies to the big-endian paragraph and `#[cfg(target_endian = "little")]` above.

/// Get the `interned` field of the [`PyASCIIObject`] state bitfield.
///
/// Returns one of: [`SSTATE_NOT_INTERNED`], [`SSTATE_INTERNED_MORTAL`], [`SSTATE_INTERNED_IMMORTAL`]
#[inline]
pub unsafe fn interned(&self) -> c_uint {
// Direct form: keeps the two lowest bits of `state`.
self.state & 3
// Wrapper form: reads the field through `PyASCIIObjectState`.
PyASCIIObjectState::from(self.state).interned()
}

/// Set the `interned` field of the [`PyASCIIObject`] state bitfield.
///
/// Calling this function with an argument that is not [`SSTATE_NOT_INTERNED`],
/// [`SSTATE_INTERNED_MORTAL`], or [`SSTATE_INTERNED_IMMORTAL`] is invalid.
#[inline]
pub unsafe fn set_interned(&mut self, val: c_uint) {
// Read-modify-write: convert `state` to the wrapper, update the field, store it back.
let mut state = PyASCIIObjectState::from(self.state);
state.set_interned(val);
self.state = u32::from(state);
}

/// Get the `kind` field of the [`PyASCIIObject`] state bitfield.
///
/// Returns one of: [`PyUnicode_WCHAR_KIND`], [`PyUnicode_1BYTE_KIND`], [`PyUnicode_2BYTE_KIND`],
/// [`PyUnicode_4BYTE_KIND`]
#[inline]
pub unsafe fn kind(&self) -> c_uint {
// Direct form: three bits starting at bit 2 of `state`.
(self.state >> 2) & 7
// Wrapper form: reads the field through `PyASCIIObjectState`.
PyASCIIObjectState::from(self.state).kind()
}

/// Set the `kind` field of the [`PyASCIIObject`] state bitfield.
///
/// Calling this function with an argument that is not [`PyUnicode_WCHAR_KIND`], [`PyUnicode_1BYTE_KIND`],
/// [`PyUnicode_2BYTE_KIND`], or [`PyUnicode_4BYTE_KIND`] is invalid.
#[inline]
pub unsafe fn set_kind(&mut self, val: c_uint) {
// Read-modify-write: convert `state` to the wrapper, update the field, store it back.
let mut state = PyASCIIObjectState::from(self.state);
state.set_kind(val);
self.state = u32::from(state);
}

/// Get the `compact` field of the [`PyASCIIObject`] state bitfield.
///
/// Returns either `0` or `1`.
#[inline]
pub unsafe fn compact(&self) -> c_uint {
// Direct form: bit 5 of `state`.
(self.state >> 5) & 1
// Wrapper form: reads the flag through `PyASCIIObjectState`.
PyASCIIObjectState::from(self.state).compact()
}

/// Set the `compact` flag of the [`PyASCIIObject`] state bitfield.
///
/// Calling this function with an argument that is neither `0` nor `1` is invalid.
#[inline]
pub unsafe fn set_compact(&mut self, val: c_uint) {
// Read-modify-write: convert `state` to the wrapper, update the flag, store it back.
let mut state = PyASCIIObjectState::from(self.state);
state.set_compact(val);
self.state = u32::from(state);
}

/// Get the `ascii` field of the [`PyASCIIObject`] state bitfield.
///
/// Returns either `0` or `1`.
#[inline]
pub unsafe fn ascii(&self) -> c_uint {
// Direct form: bit 6 of `state`.
(self.state >> 6) & 1
// Wrapper form: reads the flag through `PyASCIIObjectState`.
PyASCIIObjectState::from(self.state).ascii()
}

/// Set the `ascii` flag of the [`PyASCIIObject`] state bitfield.
///
/// Calling this function with an argument that is neither `0` nor `1` is invalid.
#[inline]
pub unsafe fn set_ascii(&mut self, val: c_uint) {
// Read-modify-write: convert `state` to the wrapper, update the flag, store it back.
let mut state = PyASCIIObjectState::from(self.state);
state.set_ascii(val);
self.state = u32::from(state);
}

/// Get the `ready` field of the [`PyASCIIObject`] state bitfield.
///
/// Returns either `0` or `1`.
#[inline]
pub unsafe fn ready(&self) -> c_uint {
// Direct form: bit 7 of `state`.
(self.state >> 7) & 1
// Wrapper form: reads the flag through `PyASCIIObjectState`.
PyASCIIObjectState::from(self.state).ready()
}

/// Set the `ready` flag of the [`PyASCIIObject`] state bitfield.
///
/// Calling this function with an argument that is neither `0` nor `1` is invalid.
#[inline]
pub unsafe fn set_ready(&mut self, val: c_uint) {
// Read-modify-write: convert `state` to the wrapper, update the flag, store it back.
let mut state = PyASCIIObjectState::from(self.state);
state.set_ready(val);
self.state = u32::from(state);
}
}

Expand Down Expand Up @@ -120,7 +362,6 @@ pub const SSTATE_INTERNED_MORTAL: c_uint = 1;
pub const SSTATE_INTERNED_IMMORTAL: c_uint = 2;

#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
debug_assert!(crate::PyUnicode_Check(op) != 0);
debug_assert!(PyUnicode_IS_READY(op) != 0);
Expand All @@ -129,13 +370,11 @@ pub unsafe fn PyUnicode_IS_ASCII(op: *mut PyObject) -> c_uint {
}

/// Returns the `compact` flag of the object's state bitfield.
///
/// # Safety
///
/// `op` is cast to `*mut PyASCIIObject` and dereferenced, so it must be valid for that read.
#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_IS_COMPACT(op: *mut PyObject) -> c_uint {
    let ascii_obj = op as *mut PyASCIIObject;
    (*ascii_obj).compact()
}

/// Returns `1` when both the `ascii` and the `compact` flags of the object are set, else `0`.
///
/// The `compact` flag is only consulted when the `ascii` flag is non-zero.
///
/// # Safety
///
/// `op` is cast to `*mut PyASCIIObject` and dereferenced, so it must be valid for that read.
#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_IS_COMPACT_ASCII(op: *mut PyObject) -> c_uint {
    let is_ascii = (*(op as *mut PyASCIIObject)).ascii() != 0;
    c_uint::from(is_ascii && PyUnicode_IS_COMPACT(op) != 0)
}
Expand All @@ -149,25 +388,21 @@ pub const PyUnicode_2BYTE_KIND: c_uint = 2;
pub const PyUnicode_4BYTE_KIND: c_uint = 4;

/// Returns the pointer produced by [`PyUnicode_DATA`], typed as `*mut Py_UCS1`.
///
/// # Safety
///
/// Same requirements as [`PyUnicode_DATA`], which this function calls with `op`.
#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_1BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS1 {
    let data = PyUnicode_DATA(op);
    data.cast::<Py_UCS1>()
}

/// Returns the pointer produced by [`PyUnicode_DATA`], typed as `*mut Py_UCS2`.
///
/// # Safety
///
/// Same requirements as [`PyUnicode_DATA`], which this function calls with `op`.
#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_2BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS2 {
    let data = PyUnicode_DATA(op);
    data.cast::<Py_UCS2>()
}

/// Returns the pointer produced by [`PyUnicode_DATA`], typed as `*mut Py_UCS4`.
///
/// # Safety
///
/// Same requirements as [`PyUnicode_DATA`], which this function calls with `op`.
#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_4BYTE_DATA(op: *mut PyObject) -> *mut Py_UCS4 {
    let data = PyUnicode_DATA(op);
    data.cast::<Py_UCS4>()
}

#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
debug_assert!(crate::PyUnicode_Check(op) != 0);
debug_assert!(PyUnicode_IS_READY(op) != 0);
Expand All @@ -176,7 +411,6 @@ pub unsafe fn PyUnicode_KIND(op: *mut PyObject) -> c_uint {
}

#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
if PyUnicode_IS_ASCII(op) != 0 {
(op as *mut PyASCIIObject).offset(1) as *mut c_void
Expand All @@ -186,15 +420,13 @@ pub unsafe fn _PyUnicode_COMPACT_DATA(op: *mut PyObject) -> *mut c_void {
}

/// Returns the `data.any` pointer of `op` viewed as a `PyUnicodeObject`.
///
/// In debug builds, asserts that the pointer is non-null before returning it.
///
/// # Safety
///
/// `op` is cast to `*mut PyUnicodeObject` and dereferenced, so it must be valid for that read.
#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn _PyUnicode_NONCOMPACT_DATA(op: *mut PyObject) -> *mut c_void {
    let unicode = op as *mut PyUnicodeObject;
    debug_assert!(!(*unicode).data.any.is_null());

    (*unicode).data.any
}

#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
debug_assert!(crate::PyUnicode_Check(op) != 0);

Expand All @@ -210,7 +442,6 @@ pub unsafe fn PyUnicode_DATA(op: *mut PyObject) -> *mut c_void {
// skipped PyUnicode_READ_CHAR

#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
debug_assert!(crate::PyUnicode_Check(op) != 0);
debug_assert!(PyUnicode_IS_READY(op) != 0);
Expand All @@ -219,15 +450,13 @@ pub unsafe fn PyUnicode_GET_LENGTH(op: *mut PyObject) -> Py_ssize_t {
}

/// Returns the `ready` flag of the object's state bitfield.
///
/// # Safety
///
/// `op` is cast to `*mut PyASCIIObject` and dereferenced, so it must be valid for that read.
#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_IS_READY(op: *mut PyObject) -> c_uint {
    let ascii_obj = op as *mut PyASCIIObject;
    (*ascii_obj).ready()
}

#[cfg(not(Py_3_12))]
#[cfg_attr(Py_3_10, deprecated(note = "Python 3.10"))]
#[inline]
#[cfg(target_endian = "little")]
pub unsafe fn PyUnicode_READY(op: *mut PyObject) -> c_int {
debug_assert!(crate::PyUnicode_Check(op) != 0);

Expand Down
Loading