Skip to content

Commit

Permalink
Disable Unicode mode for literal regex.
Browse files Browse the repository at this point in the history
When ripgrep detects literals, it emits them as raw hex-escaped byte
sequences to Regex::new. This permits literal optimizations for arbitrary
byte sequences (i.e., possibly invalid UTF-8). The problem is that
Regex::new interprets hex-escaped byte sequences as *Unicode codepoints*
by default, but we want them to actually stand for their raw byte values.
Therefore, disable Unicode mode when building the regex for these literals.

This is OK, since the regex is composed entirely of literals and literal
extraction does Unicode case folding.

Fixes #251
  • Loading branch information
BurntSushi committed Nov 28, 2016
1 parent 301a3fd commit 0473df1
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 4 deletions.
4 changes: 2 additions & 2 deletions grep/src/literals.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,12 @@ impl LiteralSets {
debug!("required literals found: {:?}", req_lits);
let alts: Vec<String> =
req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
Some(RegexBuilder::new(&alts.join("|")))
Some(RegexBuilder::new(&alts.join("|")).unicode(false))
} else if lit.is_empty() {
None
} else {
debug!("required literal found: {:?}", show(lit));
Some(RegexBuilder::new(&bytes_to_regex(lit)))
Some(RegexBuilder::new(&bytes_to_regex(&lit)).unicode(false))
}
}
}
Expand Down
3 changes: 1 addition & 2 deletions grep/src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,13 @@ impl GrepBuilder {
/// Creates a new regex from the given expression with the current
/// configuration.
fn regex(&self, expr: &Expr) -> Result<Regex> {
self.regex_build(RegexBuilder::new(&expr.to_string()))
self.regex_build(RegexBuilder::new(&expr.to_string()).unicode(true))
}

/// Builds a new regex from the given builder using the caller's settings.
fn regex_build(&self, builder: RegexBuilder) -> Result<Regex> {
builder
.multi_line(true)
.unicode(true)
.size_limit(self.opts.size_limit)
.dfa_size_limit(self.opts.dfa_size_limit)
.compile()
Expand Down
9 changes: 9 additions & 0 deletions tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,15 @@ clean!(regression_229, "[E]conomie", ".", |wd: WorkDir, mut cmd: Command| {
wd.assert_err(&mut cmd);
});

// See: https://github.com/BurntSushi/ripgrep/issues/251
//
// Regression test: searching for a lowercase Cyrillic word with `-i` must
// report every case variant of that word present in the file.
clean!(regression_251, "привет", ".", |wd: WorkDir, mut cmd: Command| {
// Write a file whose three lines are the same word in different letter cases.
wd.create("foo", "привет\nПривет\nПрИвЕт");
cmd.arg("-i");

// Every line must appear in the output, in file order, prefixed by "foo:".
let lines: String = wd.stdout(&mut cmd);
assert_eq!(lines, "foo:привет\nfoo:Привет\nfoo:ПрИвЕт\n");
});

// See: https://github.com/BurntSushi/ripgrep/issues/7
sherlock!(feature_7, "-fpat", "sherlock", |wd: WorkDir, mut cmd: Command| {
wd.create("pat", "Sherlock\nHolmes");
Expand Down

0 comments on commit 0473df1

Please sign in to comment.