grep: upgrade to regex-syntax 0.5

This update brings with it many bug fixes:

* Better error messages are printed overall. We also include explicit call-outs for unsupported features like backreferences and look-around.
* Regexes like `\s*{` no longer emit incomprehensible errors.
* Unicode escape sequences, such as `\u{..}`, are now supported.

For the most part, this upgrade was done in a straightforward way. We resist the urge to refactor the `grep` crate, in anticipation of it being rewritten anyway.

Note that we removed the `--fixed-strings` suggestion whenever a regex syntax error occurs. In practice, I've found that it results in a lot of false positives, and I believe that its use is not as paramount now that regex parse errors are much more readable.

Closes #268, Closes #395, Closes #702, Closes #853
BurntSushi · Mar 14, 2018 · a12868c · a12868c
1 parent c2e97cd
commit a12868c
Show file tree

Hide file tree

Showing 9 changed files with 149 additions and 156 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,11 +14,16 @@ CPUs (such as AVX2) for additional optimizations.
   per line. Previously, the behavior of ripgrep was to report the total number
   of matching lines. (Note that this behavior diverges from the behavior of
   GNU grep.)
+* Octal syntax is no longer supported. ripgrep previously accepted expressions
+  like `\1` as syntax for matching `U+0001`, but ripgrep will now report an
+  error instead.
 
 Feature enhancements:
 
 * [FEATURE #411](https://github.com/BurntSushi/ripgrep/issues/411):
   Add a `--stats` flag, which emits aggregate statistics after search results.
+* [FEATURE #702](https://github.com/BurntSushi/ripgrep/issues/702):
+  Support `\u{..}` Unicode escape sequences.
 * [FEATURE #812](https://github.com/BurntSushi/ripgrep/issues/812):
   Add `-b/--byte-offset` flag that reports byte offset of each matching line.
 * [FEATURE #814](https://github.com/BurntSushi/ripgrep/issues/814):
@@ -29,12 +34,19 @@ Bug fixes:
 * [BUG #135](https://github.com/BurntSushi/ripgrep/issues/135):
   Release portable binaries that conditionally use SSSE3, AVX2, etc., at
   runtime.
+* [BUG #268](https://github.com/BurntSushi/ripgrep/issues/268):
+  Print descriptive error message when trying to use look-around or
+  backreferences.
+* [BUG #395](https://github.com/BurntSushi/ripgrep/issues/395):
+  Show comprehensible error messages for regexes like `\s*{`.
 * [BUG #526](https://github.com/BurntSushi/ripgrep/issues/526):
   Support backslash escapes in globs.
 * [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832):
   Clarify usage instructions for `-f/--file` flag.
 * [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852):
   Be robust with respect to `ENOMEM` errors returned by `mmap`.
+* [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853):
+  Upgrade `grep` crate to `regex-syntax 0.5.0`.
 
 
 0.8.1 (2018-02-20)

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/grep/Cargo.toml b/grep/Cargo.toml
@@ -16,4 +16,4 @@ license = "Unlicense/MIT"
 log = "0.4"
 memchr = "2"
 regex = "0.2.9"
-regex-syntax = "0.4.0"
+regex-syntax = "0.5.3"
diff --git a/grep/src/literals.rs b/grep/src/literals.rs
@@ -10,10 +10,8 @@ principled.
 use std::cmp;
 
 use regex::bytes::RegexBuilder;
-use syntax::{
-    Expr, Literals, Lit,
-    ByteClass, ByteRange, CharClass, ClassRange, Repeater,
-};
+use syntax::hir::{self, Hir, HirKind};
+use syntax::hir::literal::{Literal, Literals};
 
 #[derive(Clone, Debug)]
 pub struct LiteralSets {
@@ -23,12 +21,12 @@ pub struct LiteralSets {
 }
 
 impl LiteralSets {
-    pub fn create(expr: &Expr) -> Self {
+    pub fn create(expr: &Hir) -> Self {
         let mut required = Literals::empty();
         union_required(expr, &mut required);
         LiteralSets {
-            prefixes: expr.prefixes(),
-            suffixes: expr.suffixes(),
+            prefixes: Literals::prefixes(expr),
+            suffixes: Literals::suffixes(expr),
             required: required,
         }
     }
@@ -93,60 +91,52 @@ impl LiteralSets {
     }
 }
 
-fn union_required(expr: &Expr, lits: &mut Literals) {
-    use syntax::Expr::*;
-    match *expr {
-        Literal { ref chars, casei: false } => {
-            let s: String = chars.iter().cloned().collect();
-            lits.cross_add(s.as_bytes());
+fn union_required(expr: &Hir, lits: &mut Literals) {
+    match *expr.kind() {
+        HirKind::Literal(hir::Literal::Unicode(c)) => {
+            let mut buf = [0u8; 4];
+            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
         }
-        Literal { ref chars, casei: true } => {
-            for &c in chars {
-                let cls = CharClass::new(vec![
-                    ClassRange { start: c, end: c },
-                ]).case_fold();
-                if !lits.add_char_class(&cls) {
-                    lits.cut();
-                    return;
-                }
+        HirKind::Literal(hir::Literal::Byte(b)) => {
+            lits.cross_add(&[b]);
+        }
+        HirKind::Class(hir::Class::Unicode(ref cls)) => {
+            if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
+                lits.cut();
+            }
+        }
+        HirKind::Class(hir::Class::Bytes(ref cls)) => {
+            if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
+                lits.cut();
             }
         }
-        LiteralBytes { ref bytes, casei: false } => {
-            lits.cross_add(bytes);
+        HirKind::Group(hir::Group { ref hir, .. }) => {
+            union_required(&**hir, lits);
         }
-        LiteralBytes { ref bytes, casei: true } => {
-            for &b in bytes {
-                let cls = ByteClass::new(vec![
-                    ByteRange { start: b, end: b },
-                ]).case_fold();
-                if !lits.add_byte_class(&cls) {
+        HirKind::Repetition(ref x) => {
+            match x.kind {
+                hir::RepetitionKind::ZeroOrOne => lits.cut(),
+                hir::RepetitionKind::ZeroOrMore => lits.cut(),
+                hir::RepetitionKind::OneOrMore => {
+                    union_required(&x.hir, lits);
                     lits.cut();
-                    return;
+                }
+                hir::RepetitionKind::Range(ref rng) => {
+                    let (min, max) = match *rng {
+                        hir::RepetitionRange::Exactly(m) => (m, Some(m)),
+                        hir::RepetitionRange::AtLeast(m) => (m, None),
+                        hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
+                    };
+                    repeat_range_literals(
+                        &x.hir, min, max, x.greedy, lits, union_required);
                 }
             }
         }
-        Class(_) => {
-            lits.cut();
-        }
-        ClassBytes(_) => {
-            lits.cut();
+        HirKind::Concat(ref es) if es.is_empty() => {}
+        HirKind::Concat(ref es) if es.len() == 1 => {
+            union_required(&es[0], lits)
         }
-        Group { ref e, .. } => {
-            union_required(&**e, lits);
-        }
-        Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(),
-        Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(),
-        Repeat { ref e, r: Repeater::OneOrMore, .. } => {
-            union_required(&**e, lits);
-            lits.cut();
-        }
-        Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
-            repeat_range_literals(
-                &**e, min, max, greedy, lits, union_required);
-        }
-        Concat(ref es) if es.is_empty() => {}
-        Concat(ref es) if es.len() == 1 => union_required(&es[0], lits),
-        Concat(ref es) => {
+        HirKind::Concat(ref es) => {
             for e in es {
                 let mut lits2 = lits.to_empty();
                 union_required(e, &mut lits2);
@@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
                 if lits2.contains_empty() {
                     lits.cut();
                 }
-                // if !lits.union(lits2) {
                 if !lits.cross_product(&lits2) {
                     // If this expression couldn't yield any literal that
                     // could be extended, then we need to quit. Since we're
@@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
                 }
             }
         }
-        Alternate(ref es) => {
+        HirKind::Alternation(ref es) => {
             alternate_literals(es, lits, union_required);
         }
         _ => lits.cut(),
     }
 }
 
-fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
-    e: &Expr,
+fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
+    e: &Hir,
     min: u32,
     max: Option<u32>,
     _greedy: bool,
@@ -204,8 +193,8 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
     }
 }
 
-fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
-    es: &[Expr],
+fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
+    es: &[Hir],
     lits: &mut Literals,
     mut f: F,
 ) {
@@ -234,11 +223,21 @@ fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
     }
     lits.cut();
     if !lcs.is_empty() {
-        lits.add(Lit::empty());
-        lits.add(Lit::new(lcs.to_vec()));
+        lits.add(Literal::empty());
+        lits.add(Literal::new(lcs.to_vec()));
     }
 }
 
+/// Return the number of characters in the given class.
+fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
+    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
+/// Return the number of bytes in the given class.
+fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
+    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
 /// Converts an arbitrary sequence of bytes to a literal suitable for building
 /// a regular expression.
 fn bytes_to_regex(bs: &[u8]) -> String {

diff --git a/grep/src/nonl.rs b/grep/src/nonl.rs
@@ -1,4 +1,4 @@
-use syntax::Expr;
+use syntax::hir::{self, Hir, HirKind};
 
 use {Error, Result};
 
@@ -9,59 +9,66 @@ use {Error, Result};
 ///
 /// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
 /// function panics.
-pub fn remove(expr: Expr, byte: u8) -> Result<Expr> {
-    // TODO(burntsushi): There is a bug in this routine where only `\n` is
-    // handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated
-    // to proper character classes instead of the special `AnyCharNoNL` and
-    // `AnyByteNoNL` classes.
-    use syntax::Expr::*;
+pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
     assert!(byte <= 0x7F);
     let chr = byte as char;
     assert!(chr.len_utf8() == 1);
 
-    Ok(match expr {
-        Literal { chars, casei } => {
-            if chars.iter().position(|&c| c == chr).is_some() {
+    Ok(match expr.into_kind() {
+        HirKind::Empty => Hir::empty(),
+        HirKind::Literal(hir::Literal::Unicode(c)) => {
+            if c == chr {
                 return Err(Error::LiteralNotAllowed(chr));
             }
-            Literal { chars: chars, casei: casei }
+            Hir::literal(hir::Literal::Unicode(c))
         }
-        LiteralBytes { bytes, casei } => {
-            if bytes.iter().position(|&b| b == byte).is_some() {
+        HirKind::Literal(hir::Literal::Byte(b)) => {
+            if b as char == chr {
                 return Err(Error::LiteralNotAllowed(chr));
             }
-            LiteralBytes { bytes: bytes, casei: casei }
+            Hir::literal(hir::Literal::Byte(b))
         }
-        AnyChar => AnyCharNoNL,
-        AnyByte => AnyByteNoNL,
-        Class(mut cls) => {
-            cls.remove(chr);
-            Class(cls)
-        }
-        ClassBytes(mut cls) => {
-            cls.remove(byte);
-            ClassBytes(cls)
-        }
-        Group { e, i, name } => {
-            Group {
-                e: Box::new(remove(*e, byte)?),
-                i: i,
-                name: name,
+        HirKind::Class(hir::Class::Unicode(mut cls)) => {
+            let remove = hir::ClassUnicode::new(Some(
+                hir::ClassUnicodeRange::new(chr, chr),
+            ));
+            cls.difference(&remove);
+            if cls.iter().next().is_none() {
+                return Err(Error::LiteralNotAllowed(chr));
             }
+            Hir::class(hir::Class::Unicode(cls))
         }
-        Repeat { e, r, greedy } => {
-            Repeat {
-                e: Box::new(remove(*e, byte)?),
-                r: r,
-                greedy: greedy,
+        HirKind::Class(hir::Class::Bytes(mut cls)) => {
+            let remove = hir::ClassBytes::new(Some(
+                hir::ClassBytesRange::new(byte, byte),
+            ));
+            cls.difference(&remove);
+            if cls.iter().next().is_none() {
+                return Err(Error::LiteralNotAllowed(chr));
             }
+            Hir::class(hir::Class::Bytes(cls))
+        }
+        HirKind::Anchor(x) => Hir::anchor(x),
+        HirKind::WordBoundary(x) => Hir::word_boundary(x),
+        HirKind::Repetition(mut x) => {
+            x.hir = Box::new(remove(*x.hir, byte)?);
+            Hir::repetition(x)
+        }
+        HirKind::Group(mut x) => {
+            x.hir = Box::new(remove(*x.hir, byte)?);
+            Hir::group(x)
         }
-        Concat(exprs) => {
-            Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
+        HirKind::Concat(xs) => {
+            let xs = xs.into_iter()
+                .map(|e| remove(e, byte))
+                .collect::<Result<Vec<Hir>>>()?;
+            Hir::concat(xs)
         }
-        Alternate(exprs) => {
-            Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
+        HirKind::Alternation(xs) => {
+            let xs = xs.into_iter()
+                .map(|e| remove(e, byte))
+                .collect::<Result<Vec<Hir>>>()?;
+            Hir::alternation(xs)
         }
-        e => e,
     })
 }