Skip to content

Commit

Permalink
memmem: add new HeuristicFrequencyRank
Browse files Browse the repository at this point in the history
This makes it possible for the caller to provide their own ranking
function for individual bytes. This can potentially speed up searches if
one has a better guess than the default for the frequency distribution
of bytes in a particular haystack.

There is a lot of ceremony here, and it basically boils down to
supporting this in no-std no-alloc configurations. I was tempted to
just require alloc for this sort of thing and ask for something like
`Arc<dyn Fn(u8) -> u8>`, but that would require some ceremony of its
own internally to deal with in the no-alloc case. And forcing an
allocation for every searcher construction that uses a custom ranker
feels like bad juju to me.

Another choice would be to just ask for a `fn(u8) -> u8`, but this makes
the case of "I analyzed a haystack at runtime to build my ranker" more
difficult. Not impossible. But annoying.

Yet another choice was to add the trait as in this commit, and then add
it as a new type parameter to `FinderBuilder`. I believe this would
work, but it requires complicating the public API even more and imposes
constraints on the trait (for example, it would want to be `Clone` at
least in order to avoid backwards incompatible changes in the
`FinderBuilder` API). There's also just generally more ceremony with
having to add a type parameter everywhere. Since we only need the
ranking function at searcher construction time, we can ask for it at the
time of construction and then get rid of it, thus avoiding it infecting
everything else.

Fixes #117, Closes #118, Closes #119
  • Loading branch information
sentrip authored and BurntSushi committed Jul 11, 2023
1 parent be564d4 commit a1b3233
Show file tree
Hide file tree
Showing 8 changed files with 261 additions and 30 deletions.
116 changes: 116 additions & 0 deletions bench/src/memmem/byterank.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
use criterion::Criterion;
use memchr::memmem::HeuristicFrequencyRank;

use crate::define;

/// Registers every byte-rank benchmark defined in this module with the given
/// Criterion instance.
pub(crate) fn all(c: &mut Criterion) {
    // Construction cost first, then the end-to-end search comparison.
    finder_construction(c);
    byte_frequencies(c);
}

/// Registers benchmarks that measure only the time taken to build a forward
/// `Finder`, once through the default constructor and once with a custom
/// `HeuristicFrequencyRank`.
///
/// These exist to catch construction-time regressions when new features
/// (such as custom rankers) are added to the `Finder`.
fn finder_construction(c: &mut Criterion) {
    const NEEDLES: [&str; 3] = ["a", "abcd", "abcdefgh12345678"];

    for needle in NEEDLES {
        let bytes = needle.as_bytes();
        let len = needle.len();

        // Baseline: the constructor that uses the built-in ranking.
        let default_name = format!(
            "memmem/krate/bytefreq/construct-finder/default(len={})",
            len
        );
        define(
            c,
            &default_name,
            bytes,
            Box::new(move |b| {
                b.iter(|| {
                    memchr::memmem::FinderBuilder::new().build_forward(bytes)
                });
            }),
        );

        // Same needle, but constructed with the `Binary` ranker.
        let custom_name = format!(
            "memmem/krate/bytefreq/construct-finder/custom(len={})",
            len
        );
        define(
            c,
            &custom_name,
            bytes,
            Box::new(move |b| {
                b.iter(|| {
                    memchr::memmem::FinderBuilder::new()
                        .build_forward_with_ranker(Binary, bytes)
                });
            }),
        );
    }
}

/// Registers benchmarks that search the benchmark executable itself for a
/// fixed needle, once with the default byte ranking and once with the
/// `Binary` ranking.
///
/// This demonstrates a common use case for customizing the byte frequency
/// table used by a `Finder`, and the relative performance gain from using a
/// table suited to the haystack. This is essentially why
/// `HeuristicFrequencyRank` was added.
///
/// # Panics
///
/// Panics if the path of the running executable cannot be determined or the
/// executable cannot be read. The benchmark closures themselves assert that
/// the needle occurs exactly once in the executable.
fn byte_frequencies(c: &mut Criterion) {
    // Bytes we want to scan for that are rare in strings but common in
    // executables.
    // NOTE(review): the needle ends with a literal `'` byte, unlike the
    // needle used in the `HeuristicFrequencyRank` doc example. It is kept
    // as-is because the assertions below depend on the exact match count;
    // confirm whether the trailing quote is intended.
    const NEEDLE: &[u8] = b"\x00\x00\xdd\xdd'";

    // The input for the benchmark is the benchmark binary itself. Ask the OS
    // for its path instead of trusting argv[0], which may be relative,
    // arbitrary text, or absent.
    let exe = std::env::current_exe()
        .expect("failed to determine the path of the benchmark executable");
    let corpus = std::fs::read(&exe)
        .expect("failed to read the benchmark executable");

    // Each benchmark closure must own its haystack, hence the clones.
    let bin = corpus.clone();
    define(
        c,
        "memmem/krate/bytefreq/default",
        &corpus,
        Box::new(move |b| {
            let finder =
                memchr::memmem::FinderBuilder::new().build_forward(NEEDLE);
            b.iter(|| {
                assert_eq!(1, finder.find_iter(&bin).count());
            });
        }),
    );

    let bin = corpus.clone();
    define(
        c,
        "memmem/krate/bytefreq/custom",
        &corpus,
        Box::new(move |b| {
            let finder = memchr::memmem::FinderBuilder::new()
                .build_forward_with_ranker(Binary, NEEDLE);
            b.iter(|| {
                assert_eq!(1, finder.find_iter(&bin).count());
            });
        }),
    );
}

/// A byte-frequency table that is good for scanning binary executables.
struct Binary;

impl HeuristicFrequencyRank for Binary {
    /// Returns the rank stored for `byte` in a fixed 256-entry table, one
    /// entry per possible byte value.
    fn rank(&self, byte: u8) -> u8 {
        // Indexed directly by the byte value, so the lookup can never go out
        // of bounds.
        const RANKS: [u8; 256] = [
            255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17, 89,
            58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16, 68, 11, 9,
            6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11, 9, 24, 11, 5, 5,
            5, 19, 11, 6, 17, 9, 9, 6, 8, 48, 58, 11, 14, 53, 40, 9, 9, 254,
            35, 3, 6, 52, 23, 6, 6, 27, 4, 7, 11, 14, 13, 10, 11, 11, 5, 2,
            10, 16, 12, 6, 19, 19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19,
            8, 18, 20, 24, 1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5,
            51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15, 0,
            0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 12, 2, 1, 1, 3, 1, 1, 1,
            6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0, 2, 2, 4, 4, 11, 6, 7, 3, 6, 9,
            4, 5, 46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13,
            3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2, 16, 3, 5, 1, 6, 1,
            1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5, 8, 3, 1, 8, 6, 17, 6,
            5, 3, 8, 6, 13, 175,
        ];
        let index = usize::from(byte);
        RANKS[index]
    }
}
2 changes: 2 additions & 0 deletions bench/src/memmem/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ use criterion::Criterion;

use crate::{define, memmem::inputs::INPUTS};

mod byterank;
mod imp;
mod inputs;
mod sliceslice;
Expand All @@ -106,6 +107,7 @@ pub fn all(c: &mut Criterion) {
oneshot_iter(c);
prebuilt_iter(c);
sliceslice::all(c);
byterank::all(c);
}

fn oneshot(c: &mut Criterion) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pub const BYTE_FREQUENCIES: [u8; 256] = [
pub const RANK: [u8; 256] = [
55, // '\x00'
52, // '\x01'
51, // '\x02'
Expand Down
89 changes: 89 additions & 0 deletions src/memmem/byterank/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
mod default;

/// This trait allows the user to customize the heuristic used to determine the
/// relative frequency of a given byte in the dataset being searched.
///
/// The use of this trait can have a dramatic impact on performance depending
/// on the type of data being searched. The details of why are explained in the
/// docs of [`crate::memmem::Prefilter`]. To summarize, the core algorithm uses
/// a prefilter to quickly identify candidate matches that are later verified
/// more slowly. This prefilter is implemented in terms of trying to find
/// `rare` bytes at specific offsets that will occur less frequently in the
/// dataset. While the concept of a `rare` byte is similar for most datasets,
/// there are some specific datasets (like binary executables) that have
/// dramatically different byte distributions. For these datasets customizing
/// the byte frequency heuristic can have a massive impact on performance, and
/// might even need to be done at runtime.
///
/// The default implementation of `HeuristicFrequencyRank` reads from the
/// static frequency table defined in the `default` submodule of
/// `src/memmem/byterank`. This is optimal for most inputs, so if you are
/// unsure of the impact of using a custom `HeuristicFrequencyRank` you should
/// probably just use the default.
///
/// # Example
///
/// ```
/// use memchr::memmem::{FinderBuilder, HeuristicFrequencyRank};
///
/// /// A byte-frequency table that is good for scanning binary executables.
/// struct Binary;
///
/// impl HeuristicFrequencyRank for Binary {
///     fn rank(&self, byte: u8) -> u8 {
///         const TABLE: [u8; 256] = [
///             255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17,
///             89, 58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16,
///             68, 11, 9, 6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11,
///             9, 24, 11, 5, 5, 5, 19, 11, 6, 17, 9, 9, 6, 8,
///             48, 58, 11, 14, 53, 40, 9, 9, 254, 35, 3, 6, 52, 23, 6, 6, 27,
///             4, 7, 11, 14, 13, 10, 11, 11, 5, 2, 10, 16, 12, 6, 19,
///             19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19, 8, 18, 20, 24,
///             1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5,
///             51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15,
///             0, 0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0,
///             12, 2, 1, 1, 3, 1, 1, 1, 6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0,
///             2, 2, 4, 4, 11, 6, 7, 3, 6, 9, 4, 5,
///             46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13,
///             3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2,
///             16, 3, 5, 1, 6, 1, 1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5,
///             8, 3, 1, 8, 6, 17, 6, 5, 3, 8, 6, 13, 175,
///         ];
///         TABLE[byte as usize]
///     }
/// }
/// // Create a new finder with the custom heuristic.
/// let finder = FinderBuilder::new()
///     .build_forward_with_ranker(Binary, b"\x00\x00\xdd\xdd");
/// // Find needle with custom heuristic.
/// assert!(finder.find(b"\x00\x00\x00\xdd\xdd").is_some());
/// ```
pub trait HeuristicFrequencyRank {
/// Return the heuristic frequency rank of the given byte. A lower rank
/// means the byte is believed to occur less frequently in the haystack.
///
/// Some uses of this heuristic may treat arbitrary absolute rank values as
/// significant. For example, an implementation detail in this crate may
/// determine that heuristic prefilters are inappropriate if every byte in
/// the needle has a "high" rank.
fn rank(&self, byte: u8) -> u8;
}

/// The default byte frequency heuristic that is good for most haystacks.
///
/// Ranks are read straight from the static `RANK` table in the `default`
/// submodule.
pub(crate) struct DefaultFrequencyRank;

impl HeuristicFrequencyRank for DefaultFrequencyRank {
    fn rank(&self, byte: u8) -> u8 {
        // The table has one entry per byte value, so a `u8` index is always
        // in bounds.
        let index = usize::from(byte);
        self::default::RANK[index]
    }
}

/// Forwarding implementation so that a reference to any
/// `HeuristicFrequencyRank` can be passed wherever a ranker is expected,
/// without giving up ownership of the ranker itself.
impl<R: HeuristicFrequencyRank> HeuristicFrequencyRank for &R {
    fn rank(&self, byte: u8) -> u8 {
        // `self` is `&&R`; peel one reference and delegate to the underlying
        // implementation.
        R::rank(*self, byte)
    }
}
40 changes: 33 additions & 7 deletions src/memmem/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,12 @@ assert_eq!(None, finder.find(b"quux baz bar"));
```
*/

pub use self::prefilter::Prefilter;
pub use self::{byterank::HeuristicFrequencyRank, prefilter::Prefilter};

use crate::{
cow::CowBytes,
memmem::{
byterank::DefaultFrequencyRank,
prefilter::{Pre, PrefilterFn, PrefilterState},
rabinkarp::NeedleHash,
rarebytes::RareNeedleBytes,
Expand Down Expand Up @@ -145,7 +146,7 @@ macro_rules! define_memmem_simple_tests {
};
}

mod byte_frequencies;
mod byterank;
#[cfg(memchr_runtime_simd)]
mod genericsimd;
mod prefilter;
Expand Down Expand Up @@ -712,7 +713,24 @@ impl FinderBuilder {
&self,
needle: &'n B,
) -> Finder<'n> {
Finder { searcher: Searcher::new(self.config, needle.as_ref()) }
self.build_forward_with_ranker(DefaultFrequencyRank, needle)
}

/// Build a forward finder for the given needle, ranking its bytes with the
/// supplied frequency heuristic.
///
/// The ranker is taken by value and handed to the searcher constructor; it
/// does not appear in the returned `Finder`'s type.
/// See [`HeuristicFrequencyRank`] for more details.
pub fn build_forward_with_ranker<'n, R, B>(
    &self,
    ranker: R,
    needle: &'n B,
) -> Finder<'n>
where
    R: HeuristicFrequencyRank,
    B: ?Sized + AsRef<[u8]>,
{
    let searcher = Searcher::new(self.config, ranker, needle.as_ref());
    Finder { searcher }
}

/// Build a reverse finder using the given needle from the current
Expand Down Expand Up @@ -817,14 +835,19 @@ enum SearcherKind {
}

impl<'n> Searcher<'n> {
fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> {
fn new<R: HeuristicFrequencyRank>(
config: SearcherConfig,
ranker: R,
needle: &'n [u8],
) -> Searcher<'n> {
use self::SearcherKind::*;

let ninfo = NeedleInfo::new(needle);
let ninfo = NeedleInfo::new(&ranker, needle);
let mk = |kind: SearcherKind| {
let prefn = prefilter::forward(
&config.prefilter,
&ninfo.rarebytes,
ranker,
needle,
);
Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind }
Expand Down Expand Up @@ -1010,9 +1033,12 @@ impl<'n> Searcher<'n> {
}

impl NeedleInfo {
pub(crate) fn new(needle: &[u8]) -> NeedleInfo {
pub(crate) fn new<R: HeuristicFrequencyRank>(
ranker: &R,
needle: &[u8],
) -> NeedleInfo {
NeedleInfo {
rarebytes: RareNeedleBytes::forward(needle),
rarebytes: RareNeedleBytes::forward(ranker, needle),
nhash: NeedleHash::forward(needle),
}
}
Expand Down
5 changes: 4 additions & 1 deletion src/memmem/prefilter/fallback.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,10 @@ mod tests {
use super::*;

fn freqy_find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
let ninfo = NeedleInfo::new(needle);
let ninfo = NeedleInfo::new(
&crate::memmem::byterank::DefaultFrequencyRank,
needle,
);
let mut prestate = PrefilterState::new();
find(&mut prestate, &ninfo, haystack, needle)
}
Expand Down
10 changes: 7 additions & 3 deletions src/memmem/prefilter/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use crate::memmem::{rarebytes::RareNeedleBytes, NeedleInfo};
use crate::memmem::{
rarebytes::RareNeedleBytes, HeuristicFrequencyRank, NeedleInfo,
};

mod fallback;
#[cfg(memchr_runtime_simd)]
Expand Down Expand Up @@ -287,9 +289,10 @@ impl PrefilterState {
/// is the default). In general, we try to use an AVX prefilter, followed by
/// SSE and then followed by a generic one based on memchr.
#[inline(always)]
pub(crate) fn forward(
pub(crate) fn forward<R: HeuristicFrequencyRank>(
config: &Prefilter,
rare: &RareNeedleBytes,
ranker: R,
needle: &[u8],
) -> Option<PrefilterFn> {
if config.is_none() || needle.len() <= 1 {
Expand Down Expand Up @@ -327,7 +330,8 @@ pub(crate) fn forward(
// Check that our rarest byte has a reasonably low rank. The main issue
// here is that the fallback prefilter can perform pretty poorly if it's
// given common bytes. So we try to avoid the worst cases here.
let (rare1_rank, _) = rare.as_ranks(needle);
let (rare1, _) = rare.as_rare_bytes(needle);
let rare1_rank = usize::from(ranker.rank(rare1));
if rare1_rank <= MAX_FALLBACK_RANK {
// SAFETY: fallback::find is safe to call in all environments.
return unsafe { Some(PrefilterFn::new(fallback::find)) };
Expand Down
Loading

0 comments on commit a1b3233

Please sign in to comment.