From 5a770dc819eceb7b415e27075bce2435bde39f5f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 26 Aug 2022 19:11:04 -0400 Subject: [PATCH] syntax: permit empty character classes An empty character class is effectively a way to write something that can never match anything. The regex crate has pretty much always returned an error for such things because it was never taught how to handle "always fail" states. Partly because I just didn't think about it when initially writing the regex engines and partly because it isn't often useful. With that said, it should be supported for completeness and because there is no real reason to not support it. Moreover, it can be useful in certain contexts where regexes are generated and you want to insert an expression that can never match. It's somewhat contrived, but it happens when the interface is a regex pattern. Previously, the ban on empty character classes was implemented in the regex-syntax crate. But with the rewrite in #656 getting closer and closer to landing, it's now time to relax this restriction. However, we do keep the overall restriction in the 'regex' API by returning an error in the NFA compiler. Once #656 is done, the new regex engines will permit this case. --- regex-syntax/src/hir/mod.rs | 14 ------ regex-syntax/src/hir/translate.rs | 73 +++---------------------------- src/compile.rs | 12 ++++- 3 files changed, 15 insertions(+), 84 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6a91c2588..b16df20c8 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -78,21 +78,8 @@ pub enum ErrorKind { /// available, and the regular expression required Unicode aware case /// insensitivity. UnicodeCaseUnavailable, - /// This occurs when the translator attempts to construct a character class - /// that is empty. - /// - /// Note that this restriction in the translator may be removed in the - /// future. - EmptyClassNotAllowed, } -// BREADCRUMBS: -// -// Remove EmptyClassNotAllowed -// Make errors non_exhaustive -// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc) -// Get rid of deprecated things - impl std::error::Error for Error {} impl fmt::Display for Error { @@ -118,7 +105,6 @@ impl fmt::Display for ErrorKind { "Unicode-aware case insensitivity matching is not available \ (make sure the unicode-case feature is enabled)" } - EmptyClassNotAllowed => "empty character classes are not allowed", }; f.write_str(msg) } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 04409cf95..d7686988a 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -322,12 +322,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast.negated, &mut cls, )?; - if cls.ranges().is_empty() { - return Err(self.error( - ast.span, - ErrorKind::EmptyClassNotAllowed, - )); - } let expr = Hir::class(hir::Class::Unicode(cls)); self.push(HirFrame::Expr(expr)); } else { @@ -337,13 +331,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast.negated, &mut cls, )?; - if cls.ranges().is_empty() { - return Err(self.error( - ast.span, - ErrorKind::EmptyClassNotAllowed, - )); - } - let expr = Hir::class(hir::Class::Bytes(cls)); self.push(HirFrame::Expr(expr)); } @@ -839,11 +826,6 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ast_class.negated, class, )?; - if class.ranges().is_empty() { - let err = self - .error(ast_class.span, ErrorKind::EmptyClassNotAllowed); - return Err(err); - } } result } @@ -2357,16 +2339,7 @@ mod tests { #[test] #[cfg(feature = "unicode-gencat")] fn class_unicode_any_empty() { - assert_eq!( - t_err(r"\P{any}"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"\P{any}"), hir_uclass(&[]),); } #[test] @@ -2518,27 +2491,9 @@ mod tests { } ); #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] - assert_eq!( - t_err(r"[^\s\S]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),); #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] - assert_eq!( - t_err(r"(?-u)[^\s\S]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(12, 1, 13) - ), - } - ); + assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),); } #[test] @@ -2686,27 +2641,9 @@ mod tests { hir_uclass(&[('C', 'C'), ('c', 'c')]) ); - assert_eq!( - t_err(r"[^a-c[^c]]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(10, 1, 11) - ), - } - ); + assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),); #[cfg(feature = "unicode-case")] - assert_eq!( - t_err(r"(?i)[^a-c[^c]]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(4, 1, 5), - Position::new(14, 1, 15) - ), - } - ); + assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),); } #[test] diff --git a/src/compile.rs b/src/compile.rs index 90ca25015..361ea4cb7 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -457,7 +457,11 @@ impl Compiler { fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { use std::mem::size_of; - assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } if self.compiled.uses_bytes() { Ok(Some(CompileClass { c: self, ranges }.compile()?)) } else { @@ -482,7 +486,11 @@ impl Compiler { &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty { - debug_assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } let first_split_entry = self.insts.len(); let mut holes = vec![];