-
-
Notifications
You must be signed in to change notification settings - Fork 99
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
memmem: add new HeuristicFrequencyRank
This makes it possible for the caller to provide their own ranking function for individual bytes. This can potentially speed up searches if one has a better guess than the default for the frequency distribution of bytes in a particular haystack. There is a lot of ceremony here, and it basically boils down to supporting this in no-std no-alloc configurations. I was tempted to just require alloc for this sort of thing and ask for something like `Arc<dyn Fn(u8) -> u8>`, but that would require some ceremony of its own internally to deal with in the no-alloc case. And forcing an allocation for every searcher construction that uses a custom ranker feels like bad juju to me. Another choice would be to just ask for a `fn(u8) -> u8`, but this makes the case of "I analyzed a haystack at runtime to build my ranker" more difficult. Not impossible. But annoying. Yet another choice was to add the trait as in this commit, and then add it as a new type parameter to `FinderBuilder`. I believe this would work, but it requires complicating the public API even more and imposes constraints on the trait (for example, it would want to be `Clone` at least in order to avoid backwards incompatible changes in the `FinderBuilder` API). There's also just generally more ceremony with having to add a type parameter everywhere. Since we only need the ranking function at searcher construction time, we can ask for it at the time of construction and then get rid of it, thus avoiding it infecting everything else. Fixes #117, Closes #118, Closes #119
- Loading branch information
1 parent
be564d4
commit a1b3233
Showing
8 changed files
with
261 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
use criterion::Criterion; | ||
use memchr::memmem::HeuristicFrequencyRank; | ||
|
||
use crate::define; | ||
|
||
pub(crate) fn all(c: &mut Criterion) { | ||
finder_construction(c); | ||
byte_frequencies(c); | ||
} | ||
|
||
fn finder_construction(c: &mut Criterion) { | ||
// This benchmark is purely for measuring the time taken to create a | ||
// `Finder`. It is here to prevent regressions when adding new features | ||
// to the `Finder`, such as the ability to construct with a custom | ||
// `HeuristicFrequencyRank`. | ||
const NEEDLES: [&str; 3] = ["a", "abcd", "abcdefgh12345678"]; | ||
|
||
for needle in NEEDLES { | ||
define( | ||
c, | ||
&format!( | ||
"memmem/krate/bytefreq/construct-finder/default(len={})", | ||
needle.len() | ||
), | ||
needle.as_bytes(), | ||
Box::new(move |b| { | ||
b.iter(|| { | ||
memchr::memmem::FinderBuilder::new() | ||
.build_forward(needle.as_bytes()) | ||
}); | ||
}), | ||
); | ||
define( | ||
c, | ||
&format!( | ||
"memmem/krate/bytefreq/construct-finder/custom(len={})", | ||
needle.len() | ||
), | ||
needle.as_bytes(), | ||
Box::new(move |b| { | ||
b.iter(|| { | ||
memchr::memmem::FinderBuilder::new() | ||
.build_forward_with_ranker(Binary, needle.as_bytes()) | ||
}); | ||
}), | ||
); | ||
} | ||
} | ||
|
||
fn byte_frequencies(c: &mut Criterion) { | ||
// This benchmark exists to demonstrate a common use case for | ||
// customizing the byte frequency table used by a `Finder` | ||
// and the relative performance gain from using an optimal table. | ||
// This is essentially why `HeuristicFrequencyRank` was added. | ||
|
||
// Bytes we want to scan for that are rare in strings but common in | ||
// executables. | ||
const NEEDLE: &[u8] = b"\x00\x00\xdd\xdd'"; | ||
|
||
// The input for the benchmark is the benchmark binary itself | ||
let exe = std::env::args().next().unwrap(); | ||
let corpus = std::fs::read(exe).unwrap(); | ||
|
||
let bin = corpus.clone(); | ||
define( | ||
c, | ||
&format!("memmem/krate/bytefreq/default"), | ||
&corpus, | ||
Box::new(move |b| { | ||
let finder = | ||
memchr::memmem::FinderBuilder::new().build_forward(NEEDLE); | ||
b.iter(|| { | ||
assert_eq!(1, finder.find_iter(&bin).count()); | ||
}); | ||
}), | ||
); | ||
|
||
let bin = corpus.clone(); | ||
define( | ||
c, | ||
&format!("memmem/krate/bytefreq/custom"), | ||
&corpus, | ||
Box::new(move |b| { | ||
let finder = memchr::memmem::FinderBuilder::new() | ||
.build_forward_with_ranker(Binary, NEEDLE); | ||
b.iter(|| { | ||
assert_eq!(1, finder.find_iter(&bin).count()); | ||
}); | ||
}), | ||
); | ||
} | ||
|
||
/// A byte-frequency table that is good for scanning binary executables. | ||
struct Binary; | ||
|
||
impl HeuristicFrequencyRank for Binary { | ||
fn rank(&self, byte: u8) -> u8 { | ||
const TABLE: [u8; 256] = [ | ||
255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17, 89, | ||
58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16, 68, 11, 9, | ||
6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11, 9, 24, 11, 5, 5, | ||
5, 19, 11, 6, 17, 9, 9, 6, 8, 48, 58, 11, 14, 53, 40, 9, 9, 254, | ||
35, 3, 6, 52, 23, 6, 6, 27, 4, 7, 11, 14, 13, 10, 11, 11, 5, 2, | ||
10, 16, 12, 6, 19, 19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19, | ||
8, 18, 20, 24, 1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5, | ||
51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15, 0, | ||
0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 12, 2, 1, 1, 3, 1, 1, 1, | ||
6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0, 2, 2, 4, 4, 11, 6, 7, 3, 6, 9, | ||
4, 5, 46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13, | ||
3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2, 16, 3, 5, 1, 6, 1, | ||
1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5, 8, 3, 1, 8, 6, 17, 6, | ||
5, 3, 8, 6, 13, 175, | ||
]; | ||
TABLE[byte as usize] | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
src/memmem/byte_frequencies.rs → src/memmem/byterank/default.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
mod default; | ||
|
||
/// This trait allows the user to customize the heuristic used to determine the
/// relative frequency of a given byte in the dataset being searched.
///
/// The use of this trait can have a dramatic impact on performance depending
/// on the type of data being searched. The details of why are explained in the
/// docs of [`prefilter::Prefilter`]. To summarize, the core algorithm uses a
/// prefilter to quickly identify candidate matches that are later verified
/// more slowly. This prefilter is implemented in terms of trying to find
/// `rare` bytes at specific offsets that will occur less frequently in the
/// dataset. While the concept of a `rare` byte is similar for most datasets,
/// there are some specific datasets (like binary executables) that have
/// dramatically different byte distributions. For these datasets customizing
/// the byte frequency heuristic can have a massive impact on performance, and
/// might even need to be done at runtime.
///
/// The default implementation of `HeuristicFrequencyRank` reads from the
/// static frequency table defined in `src/memmem/byterank/default.rs`. This
/// is optimal for most inputs, so if you are unsure of the impact of using a
/// custom `HeuristicFrequencyRank` you should probably just use the default.
///
/// # Example
///
/// ```
/// use memchr::memmem::{FinderBuilder, HeuristicFrequencyRank};
///
/// /// A byte-frequency table that is good for scanning binary executables.
/// struct Binary;
///
/// impl HeuristicFrequencyRank for Binary {
///     fn rank(&self, byte: u8) -> u8 {
///         const TABLE: [u8; 256] = [
///             255, 128, 61, 43, 50, 41, 27, 28, 57, 15, 21, 13, 24, 17, 17,
///             89, 58, 16, 11, 7, 14, 23, 7, 6, 24, 9, 6, 5, 9, 4, 7, 16,
///             68, 11, 9, 6, 88, 7, 4, 4, 23, 9, 4, 8, 8, 5, 10, 4, 30, 11,
///             9, 24, 11, 5, 5, 5, 19, 11, 6, 17, 9, 9, 6, 8,
///             48, 58, 11, 14, 53, 40, 9, 9, 254, 35, 3, 6, 52, 23, 6, 6, 27,
///             4, 7, 11, 14, 13, 10, 11, 11, 5, 2, 10, 16, 12, 6, 19,
///             19, 20, 5, 14, 16, 31, 19, 7, 14, 20, 4, 4, 19, 8, 18, 20, 24,
///             1, 25, 19, 58, 29, 10, 5, 15, 20, 2, 2, 9, 4, 3, 5,
///             51, 11, 4, 53, 23, 39, 6, 4, 13, 81, 4, 186, 5, 67, 3, 2, 15,
///             0, 0, 1, 3, 2, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0,
///             12, 2, 1, 1, 3, 1, 1, 1, 6, 1, 2, 1, 3, 1, 1, 2, 9, 1, 1, 0,
///             2, 2, 4, 4, 11, 6, 7, 3, 6, 9, 4, 5,
///             46, 18, 8, 18, 17, 3, 8, 20, 16, 10, 3, 7, 175, 4, 6, 7, 13,
///             3, 7, 3, 3, 1, 3, 3, 10, 3, 1, 5, 2, 0, 1, 2,
///             16, 3, 5, 1, 6, 1, 1, 2, 58, 20, 3, 14, 12, 2, 1, 3, 16, 3, 5,
///             8, 3, 1, 8, 6, 17, 6, 5, 3, 8, 6, 13, 175,
///         ];
///         TABLE[byte as usize]
///     }
/// }
/// // Create a new finder with the custom heuristic.
/// let finder = FinderBuilder::new()
///     .build_forward_with_ranker(Binary, b"\x00\x00\xdd\xdd");
/// // Find needle with custom heuristic.
/// assert!(finder.find(b"\x00\x00\x00\xdd\xdd").is_some());
/// ```
pub trait HeuristicFrequencyRank {
    /// Return the heuristic frequency rank of the given byte. A lower rank
    /// means the byte is believed to occur less frequently in the haystack.
    ///
    /// Some uses of this heuristic may treat arbitrary absolute rank values as
    /// significant. For example, an implementation detail in this crate may
    /// determine that heuristic prefilters are inappropriate if every byte in
    /// the needle has a "high" rank.
    fn rank(&self, byte: u8) -> u8;
}
|
||
/// The default byte frequency heuristic that is good for most haystacks. | ||
pub(crate) struct DefaultFrequencyRank; | ||
|
||
impl HeuristicFrequencyRank for DefaultFrequencyRank { | ||
fn rank(&self, byte: u8) -> u8 { | ||
self::default::RANK[usize::from(byte)] | ||
} | ||
} | ||
|
||
/// This permits passing any implementation of `HeuristicFrequencyRank` as a | ||
/// borrowed version of itself. | ||
impl<'a, R> HeuristicFrequencyRank for &'a R | ||
where | ||
R: HeuristicFrequencyRank, | ||
{ | ||
fn rank(&self, byte: u8) -> u8 { | ||
(**self).rank(byte) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.