From 392b3d63a25cceb356b95924e10f01b6ab2050b1 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 30 Dec 2017 14:02:19 -0500
Subject: [PATCH] literals: tweak the TBM heuristic

This commit tweaks the heuristic employed to determine whether to use TBM
or not. For the most part, the heuristic was tweaked by combining the
actual benchmark results with a bit of hand waving. In particular, the
primary change here is that the frequency rank cutoff is no longer a
constant, but rather, a function of the pattern length. That is, we guess
that TBM will do well with longer patterns, even if they contain somewhat
infrequent bytes. We do put a constant cap on this heuristic. That is,
regardless of the length of the pattern, if a "very rare" byte is found
in the pattern, then we won't use TBM.
---
 src/literals.rs | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/src/literals.rs b/src/literals.rs
index 29cb7d3d60..d1902fa89a 100644
--- a/src/literals.rs
+++ b/src/literals.rs
@@ -8,6 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use std::cmp;
 use std::mem;
 
 use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton};
@@ -695,14 +696,33 @@ impl BoyerMooreSearch {
     /// I had trouble proving a useful turnover point. Hopefully,
     /// we can find one in the future.
     fn should_use(pattern: &[u8]) -> bool {
-        const CUTOFF_FREQ: usize = 242;
-
-        // all the bytes must be more common than the cutoff.
-        pattern.iter().all(|c| freq_rank(*c) >= CUTOFF_FREQ)
-            // and the pattern must be long enough to be worthwhile.
-            // memchr will be faster on `e` because it is short
-            // even though e is quite common.
-            && pattern.len() > 7
+        // Patterns must be strictly longer than this to use TBM.
+        const MIN_LEN: usize = 9;
+        // The minimum frequency rank (lower is rarer) that every byte in the
+        // pattern must have in order to use TBM. That is, if the pattern
+        // contains _any_ byte with a lower rank, then TBM won't be used.
+        const MIN_CUTOFF: usize = 150;
+        // The maximum frequency rank for any byte.
+        const MAX_CUTOFF: usize = 255;
+        // The scaling factor used to determine the actual cutoff frequency
+        // to use (keeping in mind that the minimum frequency rank is bounded
+        // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
+        // likely to be used as the pattern grows longer. That is, longer
+        // patterns permit somewhat less frequent bytes than shorter patterns,
+        // under the assumption that TBM gets better as the pattern gets
+        // longer.
+        const LEN_CUTOFF_PROPORTION: usize = 4;
+
+        let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
+        let cutoff = cmp::max(
+            MIN_CUTOFF,
+            MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
+        );
+        // The pattern must be long enough to be worthwhile. e.g., memchr will
+        // be faster on `e` because it is short even though e is quite common.
+        pattern.len() > MIN_LEN
+            // all the bytes must be at least as common as the cutoff.
+            && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
     }
 
     /// Check to see if there is a match at the given position