Skip to content

Commit

Permalink
Allow limited access to OsString bytes
Browse files Browse the repository at this point in the history
This extends rust-lang#109698 to allow no-cost conversion between `Vec<u8>` and `OsString`
as suggested in feedback from `os_str_bytes` crate in rust-lang#111544.
  • Loading branch information
epage committed Jul 7, 2023
1 parent 85bf079 commit ee604fc
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 0 deletions.
65 changes: 65 additions & 0 deletions library/std/src/ffi/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,51 @@ impl OsString {
OsString { inner: Buf::from_string(String::new()) }
}

/// Converts bytes to an `OsString` without checking that the bytes contains
/// valid [`OsStr`]-encoded data.
///
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
/// ASCII.
///
/// See the [module's toplevel documentation about conversions][conversions] for safe,
/// cross-platform [conversions] from/to native representations.
///
/// # Safety
///
/// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of
/// validated UTF-8 and bytes from [`OsStr::as_os_str_bytes`] from within the same rust version
/// built for the same target platform. For example, reconstructing an `OsString` from bytes sent
/// over the network or stored in a file will likely violate these safety rules.
///
/// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_os_str_bytes`] can be
/// split either immediately before or immediately after any valid non-empty UTF-8 substring.
///
/// # Example
///
/// ```
/// #![feature(os_str_bytes)]
///
/// use std::ffi::OsStr;
///
/// let os_str = OsStr::new("Mary had a little lamb");
/// let bytes = os_str.as_os_str_bytes();
/// let words = bytes.split(|b| *b == b' ');
/// let words: Vec<&OsStr> = words.map(|word| {
/// // SAFETY:
/// // - Each `word` only contains content that originated from `OsStr::as_os_str_bytes`
/// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring
/// unsafe { OsStr::from_os_str_bytes_unchecked(word) }
/// }).collect();
/// ```
///
/// [conversions]: super#conversions
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub unsafe fn from_os_str_bytes_unchecked(bytes: Vec<u8>) -> Self {
OsString { inner: Buf::from_os_str_bytes_unchecked(bytes) }
}

/// Converts to an [`OsStr`] slice.
///
/// # Examples
Expand All @@ -159,6 +204,26 @@ impl OsString {
self
}

/// Converts the `OsString` into a byte slice. To convert the byte slice back into an
/// `OsString`, use the [`OsStr::from_os_str_bytes_unchecked`] function.
///
/// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8.
/// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit
/// ASCII.
///
/// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should
/// be treated as opaque and only comparable within the same rust version built for the same
/// target platform. For example, sending the bytes over the network or storing it in a file
/// will likely result in incompatible data. See [`OsString`] for more encoding details
/// and [`std::ffi`] for platform-specific, specified conversions.
///
/// [`std::ffi`]: crate::ffi
#[inline]
#[unstable(feature = "os_str_bytes", issue = "111544")]
pub fn into_os_str_bytes(self) -> Vec<u8> {
self.inner.into_os_str_bytes()
}

/// Converts the `OsString` into a [`String`] if it contains valid Unicode data.
///
/// On failure, ownership of the original `OsString` is returned.
Expand Down
10 changes: 10 additions & 0 deletions library/std/src/sys/unix/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,16 @@ impl AsInner<[u8]> for Buf {
}

impl Buf {
#[inline]
pub fn into_os_str_bytes(self) -> Vec<u8> {
self.inner
}

#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
Self { inner: s }
}

pub fn from_string(s: String) -> Buf {
Buf { inner: s.into_bytes() }
}
Expand Down
10 changes: 10 additions & 0 deletions library/std/src/sys/windows/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ impl fmt::Display for Slice {
}

impl Buf {
#[inline]
pub fn into_os_str_bytes(self) -> Vec<u8> {
self.inner.into_bytes()
}

#[inline]
pub unsafe fn from_os_str_bytes_unchecked(s: Vec<u8>) -> Self {
Self { inner: Wtf8Buf::from_bytes_unchecked(s) }
}

pub fn with_capacity(capacity: usize) -> Buf {
Buf { inner: Wtf8Buf::with_capacity(capacity) }
}
Expand Down
15 changes: 15 additions & 0 deletions library/std/src/sys_common/wtf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,15 @@ impl Wtf8Buf {
Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true }
}

/// Creates a WTF-8 string from a WTF-8 byte vec.
///
/// Since the byte vec is not checked for valid WTF-8, this functions is
/// marked unsafe.
#[inline]
pub unsafe fn from_bytes_unchecked(value: Vec<u8>) -> Wtf8Buf {
Wtf8Buf { bytes: value, is_known_utf8: false }
}

/// Creates a WTF-8 string from a UTF-8 `String`.
///
/// This takes ownership of the `String` and does not copy.
Expand Down Expand Up @@ -402,6 +411,12 @@ impl Wtf8Buf {
self.bytes.truncate(new_len)
}

/// Consumes the WTF-8 string and tries to convert it to a vec of bytes.
#[inline]
pub fn into_bytes(self) -> Vec<u8> {
self.bytes
}

/// Consumes the WTF-8 string and tries to convert it to UTF-8.
///
/// This does not copy the data.
Expand Down

0 comments on commit ee604fc

Please sign in to comment.