Skip to content

Commit

Permalink
literal: fix anchor performance problem
Browse files Browse the repository at this point in the history
The Match literal iterator would repeatedly look for matches
in the remainder of the input after it found its first
match regardless of whether or not the regex was anchored
at the start. This commit adds logic to make sure that we
don't keep looking for matches after the first match is
returned for a start-anchored literal regex.
  • Loading branch information
ethanpailes authored and BurntSushi committed Apr 28, 2018
1 parent 2f6f88e commit cf8acc7
Showing 1 changed file with 66 additions and 2 deletions.
68 changes: 66 additions & 2 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -622,8 +622,13 @@ impl<'c> ExecNoSync<'c> {
}
AnchoredStart => {
let lits = &self.ro.nfa.prefixes;
lits.find_start(&text[start..])
.map(|(s, e)| (start + s, start + e))
if !self.ro.nfa.is_anchored_start
|| (self.ro.nfa.is_anchored_start && start == 0) {
lits.find_start(&text[start..])
.map(|(s, e)| (start + s, start + e))
} else {
None
}
}
AnchoredEnd => {
let lits = &self.ro.suffixes;
Expand Down Expand Up @@ -1286,3 +1291,62 @@ impl ProgramCacheInner {
}
}
}

#[cfg(test)]
mod test {
    /// Splitting with `^S` over bytes must give the same chunks whether the
    /// regex is built with bounded backtracking forced or with the default
    /// builder settings.
    #[test]
    fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
        use internal::ExecBuilder;

        let backtrack_bytes_re = ExecBuilder::new("^S")
            .bounded_backtracking()
            .only_utf8(false)
            .build()
            .map(|exec| exec.into_byte_regex())
            .map_err(|err| err.to_string())
            .unwrap();

        let default_bytes_re = ExecBuilder::new("^S")
            .only_utf8(false)
            .build()
            .map(|exec| exec.into_byte_regex())
            .map_err(|err| err.to_string())
            .unwrap();

        // 83 is ASCII 'S', so the haystack is b"SS".
        let input = vec![83, 83];

        // Compare the complete chunk sequences. Zipping the two iterators
        // would stop at the shorter one and hide a difference in the number
        // of chunks produced by the two regexes.
        let backtrack_chunks: Vec<_> =
            backtrack_bytes_re.split(&input).collect();
        let default_chunks: Vec<_> =
            default_bytes_re.split(&input).collect();
        assert_eq!(backtrack_chunks, default_chunks);
    }

    /// Splitting with `^(?u:\*)` must give the same chunks whether the regex
    /// is built with bounded backtracking forced or with the default builder
    /// settings (both built with `bytes(true)`).
    #[test]
    fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
        use internal::ExecBuilder;

        let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
            .bounded_backtracking()
            .bytes(true)
            .build()
            .map(|exec| exec.into_regex())
            .map_err(|err| err.to_string())
            .unwrap();

        let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
            .bytes(true)
            .build()
            .map(|exec| exec.into_regex())
            .map_err(|err| err.to_string())
            .unwrap();

        let input = "**";

        // Compare the complete chunk sequences; see the note in the test
        // above about why the iterators are not zipped.
        let backtrack_chunks: Vec<_> =
            backtrack_bytes_re.split(input).collect();
        let default_chunks: Vec<_> = default_bytes_re.split(input).collect();
        assert_eq!(backtrack_chunks, default_chunks);
    }
}

0 comments on commit cf8acc7

Please sign in to comment.