Skip to content

Commit

Permalink
search: skip dfa for anchored pats with captures
Browse files Browse the repository at this point in the history
The DFA can't produce captures, but is still faster than the Pike VM
NFA, so the normal approach to finding capture groups is to look for
the entire match with the DFA and then run the NFA on the substring
of the input that matched. In cases where the regex is anchored, the
match always starts at the beginning of the input, so there is never
any point to trying the DFA first.

The DFA can still be useful for rejecting inputs which are not in the
language of the regular expression, but anchored regexes with capture
groups are most commonly used in a parsing context, so it seems like a
fair trade-off.

Fixes #348
  • Loading branch information
Ethan Pailes authored and BurntSushi committed Dec 30, 2017
1 parent ba3cf03 commit 8138366
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 5 deletions.
35 changes: 35 additions & 0 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,41 @@ macro_rules! bench_find {
}
}

// USAGE: bench_captures!(name, pattern, groups, haystack);
//
// CONTRACT:
//   Given:
//     name     : ident, the desired benchmarking function name
//     pattern  : ::Regex, the regular expression to be executed
//     groups   : usize, the number of capture groups in the pattern
//     haystack : String, the string to search
//   bench_captures will benchmark how fast re.captures() produces
//   the capture groups in question.
macro_rules! bench_captures {
    ($name:ident, $pattern:expr, $count:expr, $haystack:expr) => {

        // Only compiled when benchmarking this crate's own regex engine.
        #[cfg(feature = "re-rust")]
        #[bench]
        fn $name(b: &mut Bencher) {
            use std::sync::Mutex;

            // Build the regex and the haystack once, outside the timed
            // closure, so the iteration measures only `captures`, not setup.
            // Mutex-wrapped because lazy_static requires Sync statics.
            lazy_static! {
                static ref RE: Mutex<Regex> = Mutex::new($pattern);
                static ref TEXT: Mutex<Text> = Mutex::new(text!($haystack));
            };
            let re = RE.lock().unwrap();
            let text = TEXT.lock().unwrap();
            // Report throughput as bytes of haystack processed per second.
            b.bytes = text.len() as u64;
            b.iter(|| {
                match re.captures(&text) {
                    None => assert!(false, "no captures"),
                    // captures() reports the overall match as group 0, so a
                    // pattern with $count explicit groups yields $count + 1.
                    Some(caps) => assert_eq!($count + 1, caps.len()),
                }
            });
        }
    }
}

mod ffi;
mod misc;
mod regexdna;
Expand Down
82 changes: 82 additions & 0 deletions bench/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,85 @@ macro_rules! reallyhard2 { () => (r"\w+\s+Holmes") }

bench_match!(reallyhard2_1K, reallyhard2!(),
get_text(TXT_1K, reallyhard2_suffix()));


//
// Benchmarks to justify the short-haystack NFA fallthrough optimization
// implemented by `read_captures_at` in regex/src/exec.rs. See github issue
// #348.
//
// The procedure used to try to determine the right hardcoded cutoff
// for the short-haystack optimization in issue #348 is as follows.
//
// ```
// > cd bench
// > cargo bench --features re-rust short_hay | tee dfa-nfa.res
// > # modify the `MatchType::Dfa` branch in exec.rs:read_captures_at
// > # to just execute the nfa
// > cargo bench --features re-rust short_hay | tee nfa-only.res
// > cargo benchcmp dfa-nfa.res nfa-only.res
// ```
//
// The expected result is that short inputs will go faster under
// the nfa-only mode, but at some turnover point the dfa-nfa mode
// will start to win again. Unfortunately, that is not what happened.
// Instead there was no noticeable change in the bench results, so
// I've opted to just do the more conservative anchor optimization.
//
// Each benchmark below uses the same pattern with two capture groups and
// a haystack of the shape "aaaa...bbbbccccbbb...dddd"; the numeric suffix
// on the name is the repetition count of the padding on each side.
// NOTE(review): `repeat` here is presumably std::iter::repeat, imported at
// the top of this file — confirm against the full source.
bench_captures!(short_haystack_1x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    String::from("aaaabbbbccccbbbdddd"));
bench_captures!(short_haystack_2x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(2).collect::<String>(),
            repeat("dddd").take(2).collect::<String>(),
    ));
bench_captures!(short_haystack_3x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(3).collect::<String>(),
            repeat("dddd").take(3).collect::<String>(),
    ));
bench_captures!(short_haystack_4x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(4).collect::<String>(),
            repeat("dddd").take(4).collect::<String>(),
    ));
bench_captures!(short_haystack_10x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(10).collect::<String>(),
            repeat("dddd").take(10).collect::<String>(),
    ));
bench_captures!(short_haystack_100x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(100).collect::<String>(),
            repeat("dddd").take(100).collect::<String>(),
    ));
bench_captures!(short_haystack_1000x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(1000).collect::<String>(),
            repeat("dddd").take(1000).collect::<String>(),
    ));
bench_captures!(short_haystack_10000x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(10000).collect::<String>(),
            repeat("dddd").take(10000).collect::<String>(),
    ));
bench_captures!(short_haystack_100000x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(100000).collect::<String>(),
            repeat("dddd").take(100000).collect::<String>(),
    ));
bench_captures!(short_haystack_1000000x,
    Regex::new(r"(bbbb)cccc(bbb)").unwrap(), 2,
    format!("{}bbbbccccbbb{}",
            repeat("aaaa").take(1000000).collect::<String>(),
            repeat("dddd").take(1000000).collect::<String>(),
    ));
14 changes: 9 additions & 5 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -554,12 +554,16 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
})
}
MatchType::Dfa => {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
if self.ro.nfa.is_anchored_start {
self.captures_nfa(slots, text, start)
} else {
match self.find_dfa_forward(text, start) {
dfa::Result::Match((s, e)) => {
self.captures_nfa_with_match(slots, text, s, e)
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
}
dfa::Result::NoMatch(_) => None,
dfa::Result::Quit => self.captures_nfa(slots, text, start),
}
}
MatchType::DfaAnchoredReverse => {
Expand Down

0 comments on commit 8138366

Please sign in to comment.