Skip to content

Commit

Permalink
Rollup merge of rust-lang#90607 - WaffleLapkin:const_str_from_utf8, r…
Browse files Browse the repository at this point in the history
…=oli-obk

Make slice->str conversion and related functions `const`

This PR marks the following APIs as `const`:
```rust
// core::str
pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error>;
pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error>;
pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str;

impl Utf8Error {
    pub const fn valid_up_to(&self) -> usize;
    pub const fn error_len(&self) -> Option<usize>;
}
```

Everything but `from_utf8_unchecked_mut` uses `const_str_from_utf8` feature gate, `from_utf8_unchecked_mut` uses `const_str_from_utf8_unchecked_mut` feature gate.

---

I'm not sure why `from_utf8_unchecked_mut` was left out being  non-`const`, considering that `from_utf8_unchecked` is not only `const`, but **`const` stable**.

---

r? `@oli-obk` (performance-only `const_eval_select` use)
  • Loading branch information
JohnTitor authored Nov 18, 2021
2 parents b79d0e3 + 573a00e commit b8ef644
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 24 deletions.
1 change: 1 addition & 0 deletions library/alloc/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#![feature(const_btree_new)]
#![feature(const_default_impls)]
#![feature(const_trait_impl)]
#![feature(const_str_from_utf8)]

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
Expand Down
64 changes: 61 additions & 3 deletions library/alloc/tests/str.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::assert_matches::assert_matches;
use std::borrow::Cow;
use std::cmp::Ordering::{Equal, Greater, Less};
use std::str::{from_utf8, from_utf8_unchecked};
Expand Down Expand Up @@ -883,6 +884,33 @@ fn test_is_utf8() {
assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
}

#[test]
fn test_const_is_utf8() {
const _: () = {
// deny overlong encodings
assert!(from_utf8(&[0xc0, 0x80]).is_err());
assert!(from_utf8(&[0xc0, 0xae]).is_err());
assert!(from_utf8(&[0xe0, 0x80, 0x80]).is_err());
assert!(from_utf8(&[0xe0, 0x80, 0xaf]).is_err());
assert!(from_utf8(&[0xe0, 0x81, 0x81]).is_err());
assert!(from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err());
assert!(from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err());

// deny surrogates
assert!(from_utf8(&[0xED, 0xA0, 0x80]).is_err());
assert!(from_utf8(&[0xED, 0xBF, 0xBF]).is_err());

assert!(from_utf8(&[0xC2, 0x80]).is_ok());
assert!(from_utf8(&[0xDF, 0xBF]).is_ok());
assert!(from_utf8(&[0xE0, 0xA0, 0x80]).is_ok());
assert!(from_utf8(&[0xED, 0x9F, 0xBF]).is_ok());
assert!(from_utf8(&[0xEE, 0x80, 0x80]).is_ok());
assert!(from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok());
assert!(from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok());
assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
};
}

#[test]
fn from_utf8_mostly_ascii() {
// deny invalid bytes embedded in long stretches of ascii
Expand All @@ -895,13 +923,43 @@ fn from_utf8_mostly_ascii() {
}
}

#[test]
fn const_from_utf8_mostly_ascii() {
const _: () = {
// deny invalid bytes embedded in long stretches of ascii
let mut i = 32;
while i < 64 {
let mut data = [0; 128];
data[i] = 0xC0;
assert!(from_utf8(&data).is_err());
data[i] = 0xC2;
assert!(from_utf8(&data).is_err());

i = i + 1;
}
};
}

#[test]
fn from_utf8_error() {
macro_rules! test {
($input: expr, $expected_valid_up_to: expr, $expected_error_len: expr) => {
($input: expr, $expected_valid_up_to:pat, $expected_error_len:pat) => {
let error = from_utf8($input).unwrap_err();
assert_eq!(error.valid_up_to(), $expected_valid_up_to);
assert_eq!(error.error_len(), $expected_error_len);
assert_matches!(error.valid_up_to(), $expected_valid_up_to);
assert_matches!(error.error_len(), $expected_error_len);

const _: () = {
match from_utf8($input) {
Err(error) => {
let valid_up_to = error.valid_up_to();
let error_len = error.error_len();

assert!(matches!(valid_up_to, $expected_valid_up_to));
assert!(matches!(error_len, $expected_error_len));
}
Ok(_) => unreachable!(),
}
};
};
}
test!(b"A\xC3\xA9 \xFF ", 4, Some(1));
Expand Down
3 changes: 3 additions & 0 deletions library/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
#![allow(explicit_outlives_requirements)]
//
// Library features for const fns:
#![feature(const_align_offset)]
#![feature(const_align_of_val)]
#![feature(const_alloc_layout)]
#![feature(const_arguments_as_str)]
Expand Down Expand Up @@ -130,6 +131,7 @@
#![feature(const_size_of_val)]
#![feature(const_slice_from_raw_parts)]
#![feature(const_slice_ptr_len)]
#![feature(const_str_from_utf8_unchecked_mut)]
#![feature(const_swap)]
#![feature(const_trait_impl)]
#![feature(const_type_id)]
Expand All @@ -138,6 +140,7 @@
#![feature(duration_consts_2)]
#![feature(ptr_metadata)]
#![feature(slice_ptr_get)]
#![feature(str_internals)]
#![feature(variant_count)]
#![feature(const_array_from_ref)]
#![feature(const_slice_from_ref)]
Expand Down
31 changes: 22 additions & 9 deletions library/core/src/str/converts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,16 @@ use super::Utf8Error;
/// assert_eq!("💖", sparkle_heart);
/// ```
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
run_utf8_validation(v)?;
// SAFETY: Just ran validation.
Ok(unsafe { from_utf8_unchecked(v) })
#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
// This should use `?` again, once it's `const`
match run_utf8_validation(v) {
Ok(_) => {
// SAFETY: validation succeeded.
Ok(unsafe { from_utf8_unchecked(v) })
}
Err(err) => Err(err),
}
}

/// Converts a mutable slice of bytes to a mutable string slice.
Expand Down Expand Up @@ -119,10 +125,16 @@ pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
/// See the docs for [`Utf8Error`] for more details on the kinds of
/// errors that can be returned.
#[stable(feature = "str_mut_extras", since = "1.20.0")]
pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
run_utf8_validation(v)?;
// SAFETY: Just ran validation.
Ok(unsafe { from_utf8_unchecked_mut(v) })
#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> {
// This should use `?` again, once it's `const`
match run_utf8_validation(v) {
Ok(_) => {
// SAFETY: validation succeeded.
Ok(unsafe { from_utf8_unchecked_mut(v) })
}
Err(err) => Err(err),
}
}

/// Converts a slice of bytes to a string slice without checking
Expand Down Expand Up @@ -184,7 +196,8 @@ pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
#[inline]
#[must_use]
#[stable(feature = "str_mut_extras", since = "1.20.0")]
pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked_mut", issue = "91005")]
pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str {
// SAFETY: the caller must guarantee that the bytes `v`
// are valid UTF-8, thus the cast to `*mut str` is safe.
// Also, the pointer dereference is safe because that pointer
Expand Down
12 changes: 9 additions & 3 deletions library/core/src/str/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,10 @@ impl Utf8Error {
/// assert_eq!(1, error.valid_up_to());
/// ```
#[stable(feature = "utf8_error", since = "1.5.0")]
#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
#[must_use]
#[inline]
pub fn valid_up_to(&self) -> usize {
pub const fn valid_up_to(&self) -> usize {
self.valid_up_to
}

Expand All @@ -94,10 +95,15 @@ impl Utf8Error {
///
/// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html
#[stable(feature = "utf8_error_error_len", since = "1.20.0")]
#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")]
#[must_use]
#[inline]
pub fn error_len(&self) -> Option<usize> {
self.error_len.map(|len| len as usize)
pub const fn error_len(&self) -> Option<usize> {
// This should become `map` again, once it's `const`
match self.error_len {
Some(len) => Some(len as usize),
None => None,
}
}
}

Expand Down
19 changes: 10 additions & 9 deletions library/core/src/str/validations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,25 @@ use super::Utf8Error;
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
/// for width 3, and 3 bits for width 4.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
(byte & (0x7F >> width)) as u32
}

/// Returns the value of `ch` updated with continuation byte `byte`.
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
(ch << 6) | (byte & CONT_MASK) as u32
}

/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
/// bits `10`).
#[inline]
pub(super) fn utf8_is_cont_byte(byte: u8) -> bool {
pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
(byte as i8) < -64
}

#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
const fn unwrap_or_0(opt: Option<&u8>) -> u8 {
match opt {
Some(&byte) => byte,
None => 0,
Expand Down Expand Up @@ -105,14 +105,15 @@ const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize;

/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
#[inline]
fn contains_nonascii(x: usize) -> bool {
const fn contains_nonascii(x: usize) -> bool {
(x & NONASCII_MASK) != 0
}

/// Walks through `v` checking that it's a valid UTF-8 sequence,
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
#[inline(always)]
pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
#[rustc_const_unstable(feature = "str_internals", issue = "none")]
pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
let mut index = 0;
let len = v.len();

Expand Down Expand Up @@ -142,7 +143,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {

let first = v[index];
if first >= 128 {
let w = UTF8_CHAR_WIDTH[first as usize];
let w = utf8_char_width(first);
// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
Expand Down Expand Up @@ -230,7 +231,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
}

// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, // 0x1F
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
Expand All @@ -253,7 +254,7 @@ static UTF8_CHAR_WIDTH: [u8; 256] = [
#[unstable(feature = "str_internals", issue = "none")]
#[must_use]
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
pub const fn utf8_char_width(b: u8) -> usize {
UTF8_CHAR_WIDTH[b as usize] as usize
}

Expand Down

0 comments on commit b8ef644

Please sign in to comment.