Skip to content

Commit

Permalink
Encode core::str::CharSearcher::utf8_size as enum
Browse files Browse the repository at this point in the history
  • Loading branch information
GnomedDev committed Jan 10, 2024
1 parent e927184 commit adddd38
Showing 1 changed file with 50 additions and 13 deletions.
63 changes: 50 additions & 13 deletions library/core/src/str/pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,40 @@ pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
// Impl for char
/////////////////////////////////////////////////////////////////////////////

/// Length in bytes of a `char`'s UTF-8 encoding, restricted to the four
/// lengths accepted by [`Utf8Size::new`].
///
/// Each discriminant is the encoded length *minus one*, i.e. the zero-based
/// index of the final byte of the encoding inside a `[u8; 4]` buffer.
/// Consequently `self as usize` yields that index, **not** the byte count;
/// add `1` to recover the length.
#[derive(Clone, Copy, Debug)]
enum Utf8Size {
    /// One-byte encoding; the last byte sits at index `0`.
    One = 0,
    /// Two-byte encoding; the last byte sits at index `1`.
    Two = 1,
    /// Three-byte encoding; the last byte sits at index `2`.
    Three = 2,
    /// Four-byte encoding; the last byte sits at index `3`.
    Four = 3,
}

impl Utf8Size {
    /// Converts an encoded byte length into a `Utf8Size`.
    ///
    /// Returns `None` unless `size` is in `1..=4`.
    fn new(size: usize) -> Option<Self> {
        match size {
            1 => Some(Self::One),
            2 => Some(Self::Two),
            3 => Some(Self::Three),
            4 => Some(Self::Four),
            _ => None,
        }
    }

    /// Converts an encoded byte length into a `Utf8Size` without validating it.
    ///
    /// # Safety
    ///
    /// `size` must be in `1..=4` (more than `0` and less than `5`). Passing
    /// any other value is undefined behavior.
    unsafe fn new_unchecked(size: usize) -> Self {
        // SAFETY: the caller guarantees `size` is in `1..=4`, which are exactly
        // the inputs for which `new` returns `Some`.
        unsafe { Self::new(size).unwrap_unchecked() }
    }

    /// Returns a reference to the element of `arr` at this value's
    /// discriminant (`0` for `One` through `3` for `Four`).
    ///
    /// When `arr` holds a UTF-8 encoding of this length starting at index
    /// `0`, that element is the last byte of the encoding.
    fn index(self, arr: &[u8; 4]) -> &u8 {
        // Every index below is a constant smaller than the array length, so
        // the access can never go out of bounds and needs no `unsafe`. The
        // indices must stay equal to the discriminants declared on the enum.
        match self {
            Self::One => &arr[0],
            Self::Two => &arr[1],
            Self::Three => &arr[2],
            Self::Four => &arr[3],
        }
    }
}

/// Associated type for `<char as Pattern<'a>>::Searcher`.
#[derive(Clone, Debug)]
pub struct CharSearcher<'a> {
Expand All @@ -368,9 +402,8 @@ pub struct CharSearcher<'a> {
/// The character being searched for
needle: char,

// safety invariant: `utf8_size` must be less than 5
/// The number of bytes `needle` takes up when encoded in utf8.
utf8_size: usize,
utf8_size: Utf8Size,
/// A utf8 encoded copy of the `needle`
utf8_encoded: [u8; 4],
}
Expand Down Expand Up @@ -413,8 +446,7 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
// get the haystack after the last character found
let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
// the last byte of the utf8 encoded needle
// SAFETY: we have an invariant that `utf8_size < 5`
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
let last_byte = *self.utf8_size.index(&self.utf8_encoded);
if let Some(index) = memchr::memchr(last_byte, bytes) {
// The new finger is the index of the byte we found,
// plus one, since we memchr'd for the last byte of the character.
Expand All @@ -434,10 +466,12 @@ unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
// find something. When we find something the `finger` will be set
// to a UTF8 boundary.
self.finger += index + 1;
if self.finger >= self.utf8_size {
let found_char = self.finger - self.utf8_size;

let utf8_size = self.utf8_size as usize;
if self.finger >= utf8_size {
let found_char = self.finger - utf8_size;
if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
if slice == &self.utf8_encoded[0..self.utf8_size] {
if slice == &self.utf8_encoded[0..utf8_size] {
return Some((found_char, self.finger));
}
}
Expand Down Expand Up @@ -481,8 +515,7 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
// get the haystack up to but not including the last character searched
let bytes = haystack.get(self.finger..self.finger_back)?;
// the last byte of the utf8 encoded needle
// SAFETY: we have an invariant that `utf8_size < 5`
let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
let last_byte = *self.utf8_size.index(&self.utf8_encoded);
if let Some(index) = memchr::memrchr(last_byte, bytes) {
// we searched a slice that was offset by self.finger,
// add self.finger to recoup the original index
Expand All @@ -493,14 +526,15 @@ unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
// char in the paradigm of reverse iteration). For
// multibyte chars we need to skip down by the number of more
// bytes they have than ASCII
let shift = self.utf8_size - 1;
let utf8_size = self.utf8_size as usize;
let shift = utf8_size - 1;
if index >= shift {
let found_char = index - shift;
if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
if slice == &self.utf8_encoded[0..self.utf8_size] {
if let Some(slice) = haystack.get(found_char..(found_char + utf8_size)) {
if slice == &self.utf8_encoded[0..utf8_size] {
// move finger to before the character found (i.e., at its start index)
self.finger_back = found_char;
return Some((self.finger_back, self.finger_back + self.utf8_size));
return Some((self.finger_back, self.finger_back + utf8_size));
}
}
}
Expand Down Expand Up @@ -543,6 +577,9 @@ impl<'a> Pattern<'a> for char {
fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
let mut utf8_encoded = [0; 4];
let utf8_size = self.encode_utf8(&mut utf8_encoded).len();

// SAFETY: utf8_size is below 5
let utf8_size = unsafe { Utf8Size::new_unchecked(utf8_size) };
CharSearcher {
haystack,
finger: 0,
Expand Down

0 comments on commit adddd38

Please sign in to comment.