Skip to content

Commit

Permalink
syntax: permit empty character classes
Browse files Browse the repository at this point in the history
An empty character class is effectively a way to write something that
can never match anything. The regex crate has pretty much always
returned an error for such things because it was never taught how to
handle "always fail" states, partly because I just didn't think about it
when initially writing the regex engines and partly because it isn't
often useful.

With that said, it should be supported for completeness and because
there is no real reason not to support it. Moreover, it can be useful in
certain contexts where regexes are generated and you want to insert an
expression that can never match. That's somewhat contrived, but it does
come up when the only interface available is a regex pattern.

Previously, the ban on empty character classes was implemented in the
regex-syntax crate. But with the rewrite in #656 getting closer and
closer to landing, it's now time to relax this restriction. However, we
do keep the overall restriction in the 'regex' API by returning an error
in the NFA compiler. Once #656 is done, the new regex engines will
permit this case.
  • Loading branch information
BurntSushi committed Nov 5, 2022
1 parent d6e936b commit 8e98623
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 84 deletions.
14 changes: 0 additions & 14 deletions regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,21 +78,8 @@ pub enum ErrorKind {
/// available, and the regular expression required Unicode aware case
/// insensitivity.
UnicodeCaseUnavailable,
/// This occurs when the translator attempts to construct a character class
/// that is empty.
///
/// Note that this restriction in the translator may be removed in the
/// future.
EmptyClassNotAllowed,
}

// BREADCRUMBS:
//
// Remove EmptyClassNotAllowed
// Make errors non_exhaustive
// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc)
// Get rid of deprecated things

impl std::error::Error for Error {}

impl fmt::Display for Error {
Expand All @@ -118,7 +105,6 @@ impl fmt::Display for ErrorKind {
"Unicode-aware case insensitivity matching is not available \
(make sure the unicode-case feature is enabled)"
}
EmptyClassNotAllowed => "empty character classes are not allowed",
};
f.write_str(msg)
}
Expand Down
73 changes: 5 additions & 68 deletions regex-syntax/src/hir/translate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,12 +322,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
ast.negated,
&mut cls,
)?;
if cls.ranges().is_empty() {
return Err(self.error(
ast.span,
ErrorKind::EmptyClassNotAllowed,
));
}
let expr = Hir::class(hir::Class::Unicode(cls));
self.push(HirFrame::Expr(expr));
} else {
Expand All @@ -337,13 +331,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
ast.negated,
&mut cls,
)?;
if cls.ranges().is_empty() {
return Err(self.error(
ast.span,
ErrorKind::EmptyClassNotAllowed,
));
}

let expr = Hir::class(hir::Class::Bytes(cls));
self.push(HirFrame::Expr(expr));
}
Expand Down Expand Up @@ -839,11 +826,6 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
ast_class.negated,
class,
)?;
if class.ranges().is_empty() {
let err = self
.error(ast_class.span, ErrorKind::EmptyClassNotAllowed);
return Err(err);
}
}
result
}
Expand Down Expand Up @@ -2357,16 +2339,7 @@ mod tests {
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_any_empty() {
assert_eq!(
t_err(r"\P{any}"),
TestError {
kind: hir::ErrorKind::EmptyClassNotAllowed,
span: Span::new(
Position::new(0, 1, 1),
Position::new(7, 1, 8)
),
}
);
assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
}

#[test]
Expand Down Expand Up @@ -2518,27 +2491,9 @@ mod tests {
}
);
#[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
assert_eq!(
t_err(r"[^\s\S]"),
TestError {
kind: hir::ErrorKind::EmptyClassNotAllowed,
span: Span::new(
Position::new(0, 1, 1),
Position::new(7, 1, 8)
),
}
);
assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
#[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
assert_eq!(
t_err(r"(?-u)[^\s\S]"),
TestError {
kind: hir::ErrorKind::EmptyClassNotAllowed,
span: Span::new(
Position::new(5, 1, 6),
Position::new(12, 1, 13)
),
}
);
assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
}

#[test]
Expand Down Expand Up @@ -2686,27 +2641,9 @@ mod tests {
hir_uclass(&[('C', 'C'), ('c', 'c')])
);

assert_eq!(
t_err(r"[^a-c[^c]]"),
TestError {
kind: hir::ErrorKind::EmptyClassNotAllowed,
span: Span::new(
Position::new(0, 1, 1),
Position::new(10, 1, 11)
),
}
);
assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
#[cfg(feature = "unicode-case")]
assert_eq!(
t_err(r"(?i)[^a-c[^c]]"),
TestError {
kind: hir::ErrorKind::EmptyClassNotAllowed,
span: Span::new(
Position::new(4, 1, 5),
Position::new(14, 1, 15)
),
}
);
assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
}

#[test]
Expand Down
12 changes: 10 additions & 2 deletions src/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,11 @@ impl Compiler {
fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
use std::mem::size_of;

assert!(!ranges.is_empty());
if ranges.is_empty() {
return Err(Error::Syntax(
"empty character classes are not allowed".to_string(),
));
}
if self.compiled.uses_bytes() {
Ok(Some(CompileClass { c: self, ranges }.compile()?))
} else {
Expand All @@ -482,7 +486,11 @@ impl Compiler {
&mut self,
ranges: &[hir::ClassBytesRange],
) -> ResultOrEmpty {
debug_assert!(!ranges.is_empty());
if ranges.is_empty() {
return Err(Error::Syntax(
"empty character classes are not allowed".to_string(),
));
}

let first_split_entry = self.insts.len();
let mut holes = vec![];
Expand Down

0 comments on commit 8e98623

Please sign in to comment.