From f43d7451bd494f1f5b4cad4076a65a384e1efeae Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 17 Apr 2023 11:47:22 -0400 Subject: [PATCH 01/79] msrv: set to Rust 1.60.0 This sets 'rust-version' to 1.60 and also increases the pinned Rust version that we test against in CI to 1.60.0. Rust 1.60.0 was released over a year ago and contains some important stuff. Notably, it includes namespaced and weak dependency features that are used in the (soon to be) released aho-corasick 1.0. They will also be extensively used in regex-automata 0.3, which is coming to a rust-lang/regex repository near you Real Soon Now. --- .github/workflows/ci.yml | 2 +- Cargo.toml | 1 + regex-syntax/Cargo.toml | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5171d7936..896bb4f2d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,7 +52,7 @@ jobs: include: - build: pinned os: ubuntu-latest - rust: 1.41.1 + rust: 1.60.0 - build: stable os: ubuntu-latest rust: stable diff --git a/Cargo.toml b/Cargo.toml index 4c5bd1cc1..65fbbdb38 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] edition = "2018" +rust-version = "1.60.0" [workspace] members = [ diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index be9aeb568..949316291 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -9,6 +9,7 @@ homepage = "https://github.com/rust-lang/regex" description = "A regular expression parser." workspace = ".." edition = "2018" +rust-version = "1.60.0" # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features From b68896d3c5082034072146c1d2adf8fb36696f53 Mon Sep 17 00:00:00 2001 From: XXIV <13811862+thechampagne@users.noreply.github.com> Date: Fri, 6 Jan 2023 02:15:01 +0300 Subject: [PATCH 02/79] capi: add missing void Apparently in C, an empty parameter list means "the function takes an unspecified number of arguments." (lol.) But an explicit void means "the function takes zero arguments." The latter is indeed what we want here. Ref: https://softwareengineering.stackexchange.com/questions/286490/what-is-the-difference-between-function-and-functionvoid Closes #942 --- regex-capi/include/rure.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/regex-capi/include/rure.h b/regex-capi/include/rure.h index a87be61a8..01173b451 100644 --- a/regex-capi/include/rure.h +++ b/regex-capi/include/rure.h @@ -408,7 +408,7 @@ size_t rure_captures_len(rure_captures *captures); * safe to call rure_compile from multiple threads simultaneously using the * same options pointer. */ -rure_options *rure_options_new(); +rure_options *rure_options_new(void); /* * rure_options_free frees the given options. @@ -536,7 +536,7 @@ size_t rure_set_len(rure_set *re); * It is not safe to use errors from multiple threads simultaneously. An error * value may be reused on subsequent calls to rure_compile. */ -rure_error *rure_error_new(); +rure_error *rure_error_new(void); /* * rure_error_free frees the error given. From caf014111ff7e541f67d3e281cad2ac06d79c61f Mon Sep 17 00:00:00 2001 From: Martin Pool Date: Wed, 21 Sep 2022 10:38:17 -0700 Subject: [PATCH 03/79] api: impl Default for RegexSet This is justified by the fact that a RegexSet is, after all, a set. And a set has a very obvious default value: the empty set. 
Plus, this is exactly what you get by passing a default `Vec` or an empty iterator to the `RegexSet::new` constructor. We specifically do not add a `Default` impl for Regex because it has no obvious default value. Fixes #905, Closes #906 --- src/re_set.rs | 6 ++++++ tests/set.rs | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/src/re_set.rs b/src/re_set.rs index a6d886d76..92d475f7b 100644 --- a/src/re_set.rs +++ b/src/re_set.rs @@ -289,6 +289,12 @@ impl RegexSet { } } +impl Default for RegexSet { + fn default() -> Self { + RegexSet::empty() + } +} + /// A set of matches returned by a regex set. #[derive(Clone, Debug)] pub struct SetMatches { diff --git a/tests/set.rs b/tests/set.rs index 37fcf8700..d1144d662 100644 --- a/tests/set.rs +++ b/tests/set.rs @@ -65,3 +65,10 @@ fn len_and_empty() { assert_eq!(not_empty.len(), 2); assert!(!not_empty.is_empty()); } + +#[test] +fn default_set_is_empty() { + let set: regex::bytes::RegexSet = Default::default(); + assert_eq!(set.len(), 0); + assert!(set.is_empty()); +} From 544374b49bd8a5fa2750b21a80656eb00cbe5227 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 28 Feb 2023 17:09:35 -0500 Subject: [PATCH 04/79] regex-debug: this removes regex-debug There will be a new 'regex-cli' tool that will supplant this (and more). --- .github/workflows/ci.yml | 5 - Cargo.toml | 2 +- regex-debug/Cargo.toml | 18 -- regex-debug/src/main.rs | 376 --------------------------------------- 4 files changed, 1 insertion(+), 400 deletions(-) delete mode 100644 regex-debug/Cargo.toml delete mode 100644 regex-debug/src/main.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 896bb4f2d..bacb4b087 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -159,11 +159,6 @@ jobs: cd regex-capi ./test - - if: matrix.build == 'nightly' - name: Compile regex-debug - run: | - ${{ env.CARGO }} build --verbose --manifest-path regex-debug/Cargo.toml $TARGET - - if: matrix.build == 'nightly' name: Run benchmarks as tests run: | diff --git a/Cargo.toml b/Cargo.toml index 65fbbdb38..5e707ae79 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ rust-version = "1.60.0" [workspace] members = [ - "bench", "regex-capi", "regex-debug", "regex-syntax", + "bench", "regex-capi", "regex-syntax", ] [lib] diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml deleted file mode 100644 index 1db4036b9..000000000 --- a/regex-debug/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -publish = false -name = "regex-debug" -version = "0.1.0" -authors = ["The Rust Project Developers"] -license = "MIT OR Apache-2.0" -repository = "https://github.com/rust-lang/regex" -documentation = "https://docs.rs/regex" -homepage = "https://github.com/rust-lang/regex" -description = "A tool useful for debugging regular expressions." -workspace = ".." -edition = "2018" - -[dependencies] -docopt = "1" -regex = { version = "1.1", path = ".." 
} -regex-syntax = { version = "0.6", path = "../regex-syntax" } -serde = { version = "1", features = ["derive"] } diff --git a/regex-debug/src/main.rs b/regex-debug/src/main.rs deleted file mode 100644 index a7dd453e1..000000000 --- a/regex-debug/src/main.rs +++ /dev/null @@ -1,376 +0,0 @@ -use std::error; -use std::io::{self, Write}; -use std::process; -use std::result; - -use docopt::Docopt; -use regex::internal::{Compiler, LiteralSearcher}; -use regex_syntax::hir::literal::Literals; -use regex_syntax::hir::Hir; - -const USAGE: &'static str = " -Usage: - regex-debug [options] ast - regex-debug [options] hir - regex-debug [options] prefixes ... - regex-debug [options] suffixes ... - regex-debug [options] anchors - regex-debug [options] captures - regex-debug [options] compile ... - regex-debug [options] utf8-ranges - regex-debug [options] utf8-ranges-rev - regex-debug --help - -Options: - --help Show this usage message. - --size-limit ARG An approximate size limit on the total size (in bytes) - of a compiled regular expression program. - [default: 10485760] - --bytes Show the instruction codes for byte oriented programs. - (As opposed to Unicode oriented programs.) - --dfa Show the instruction codes for a DFA. - --dfa-reverse Show the instruction codes for a reverse DFA. - This implies --dfa. - -a, --all-literals Shows all literals extracted. - By default, only unambiguous literals are shown. - --literal-limit ARG An approximate limit on the total size (in bytes) - of all literals extracted. [default: 250] - --class-limit ARG A limit on the size of character classes used to - extract literals. [default: 10] - --literal-bytes Show raw literal bytes instead of Unicode chars. - --lcp Show the longest common prefix of all the literals - extracted. - --lcs Show the longest common suffix of all the literals - extracted. - --searcher Show the debug output for the literal searcher - constructed by the literals found. - --quiet Show less output. 
-"; - -#[derive(serde::Deserialize)] -struct Args { - cmd_ast: bool, - cmd_hir: bool, - cmd_prefixes: bool, - cmd_suffixes: bool, - cmd_anchors: bool, - cmd_captures: bool, - cmd_compile: bool, - cmd_utf8_ranges: bool, - cmd_utf8_ranges_rev: bool, - - arg_pattern: String, - arg_patterns: Vec, - arg_class: String, - - flag_size_limit: usize, - flag_bytes: bool, - flag_dfa: bool, - flag_dfa_reverse: bool, - flag_all_literals: bool, - flag_literal_limit: usize, - flag_class_limit: usize, - flag_literal_bytes: bool, - flag_lcp: bool, - flag_lcs: bool, - flag_searcher: bool, - flag_quiet: bool, -} - -type Result = result::Result>; - -fn main() { - let mut args: Args = Docopt::new(USAGE) - .and_then(|d| d.deserialize()) - .unwrap_or_else(|e| e.exit()); - if args.flag_dfa_reverse { - args.flag_dfa = true; - } - match run(&args) { - Ok(_) => process::exit(0), - Err(err) => { - let _ = writeln!(&mut io::stderr(), "{}", err); - process::exit(1) - } - } -} - -fn run(args: &Args) -> Result<()> { - if args.cmd_ast { - cmd_ast(args) - } else if args.cmd_hir { - cmd_hir(args) - } else if args.cmd_prefixes { - cmd_literals(args) - } else if args.cmd_suffixes { - cmd_literals(args) - } else if args.cmd_anchors { - cmd_anchors(args) - } else if args.cmd_captures { - cmd_captures(args) - } else if args.cmd_compile { - cmd_compile(args) - } else if args.cmd_utf8_ranges { - cmd_utf8_ranges(args) - } else if args.cmd_utf8_ranges_rev { - cmd_utf8_ranges_rev(args) - } else { - unreachable!() - } -} - -fn cmd_ast(args: &Args) -> Result<()> { - use regex_syntax::ast::parse::Parser; - - let mut parser = Parser::new(); - let ast = parser.parse(&args.arg_pattern)?; - println!("{:#?}", ast); - Ok(()) -} - -fn cmd_hir(args: &Args) -> Result<()> { - use regex_syntax::ParserBuilder; - - let mut parser = ParserBuilder::new().allow_invalid_utf8(false).build(); - let hir = parser.parse(&args.arg_pattern)?; - println!("{:#?}", hir); - Ok(()) -} - -fn cmd_literals(args: &Args) -> Result<()> { - let exprs = args.parse_many()?; - let mut lits = if args.cmd_prefixes { - args.literals(&exprs, |lits, e| lits.union_prefixes(e)) - } else { - args.literals(&exprs, |lits, e| lits.union_suffixes(e)) - }; - if !args.flag_all_literals { - if args.cmd_prefixes { - lits = lits.unambiguous_prefixes(); - } else { - lits = lits.unambiguous_suffixes(); - } - } - if args.flag_searcher { - if args.cmd_prefixes { - println!("{:?}", LiteralSearcher::prefixes(lits)) - } else { - println!("{:?}", LiteralSearcher::suffixes(lits)) - } - } else if args.flag_lcp { - println!("{}", escape_unicode(lits.longest_common_prefix())); - } else if args.flag_lcs { - println!("{}", escape_unicode(lits.longest_common_suffix())); - } else { - for lit in lits.literals() { - if args.flag_literal_bytes { - if lit.is_cut() { - println!("Cut({})", escape_bytes(lit)); - } else { - println!("Complete({})", escape_bytes(lit)); - } - } else { - println!("{:?}", lit); - } - } - } - Ok(()) -} - -fn cmd_anchors(args: &Args) -> Result<()> { - let expr = args.parse_one()?; - if expr.is_anchored_start() { - println!("start"); - } - if expr.is_anchored_end() { - println!("end"); - } - Ok(()) -} - -fn cmd_captures(args: &Args) -> Result<()> { - let expr = args.parse_one()?; - let prog = args.compiler().only_utf8(false).compile(&[expr])?; - for (i, name) in prog.captures.iter().enumerate() { - match *name { - None => println!("{}", i), - Some(ref name) => println!("{}:{}", i, name), - } - } - Ok(()) -} - -fn cmd_compile(args: &Args) -> Result<()> { - let exprs = args.parse_many()?; - 
let compiler = args - .compiler() - .bytes(args.flag_bytes) - .only_utf8(!args.flag_bytes) - .dfa(args.flag_dfa) - .reverse(args.flag_dfa_reverse); - let prog = compiler.compile(&exprs)?; - if !args.flag_quiet { - print!("{:?}", prog); - } else { - println!("instruction count: {}", prog.insts.len()); - } - Ok(()) -} - -fn cmd_utf8_ranges(args: &Args) -> Result<()> { - use regex_syntax::hir::{self, HirKind}; - use regex_syntax::utf8::Utf8Sequences; - use regex_syntax::ParserBuilder; - - let hir = ParserBuilder::new() - .build() - .parse(&format!("[{}]", args.arg_class))?; - let cls = match hir.into_kind() { - HirKind::Class(hir::Class::Unicode(cls)) => cls, - _ => { - return Err( - format!("unexpected HIR, expected Unicode class").into() - ) - } - }; - let mut char_count = 0; - for (i, range) in cls.iter().enumerate() { - if i > 0 { - println!("----------------------------"); - } - char_count += (range.end() as u32) - (range.start() as u32) + 1; - for seq in Utf8Sequences::new(range.start(), range.end()) { - for utf8_range in seq.into_iter() { - print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end); - } - println!(); - } - } - println!("codepoint count: {}", char_count); - Ok(()) -} - -fn cmd_utf8_ranges_rev(args: &Args) -> Result<()> { - use regex_syntax::hir::{self, HirKind}; - use regex_syntax::utf8::Utf8Sequences; - use regex_syntax::ParserBuilder; - - let hir = ParserBuilder::new() - .build() - .parse(&format!("[{}]", args.arg_class))?; - let cls = match hir.into_kind() { - HirKind::Class(hir::Class::Unicode(cls)) => cls, - _ => { - return Err( - format!("unexpected HIR, expected Unicode class").into() - ) - } - }; - let mut char_count = 0; - let mut seqs = vec![]; - for (_, range) in cls.iter().enumerate() { - char_count += (range.end() as u32) - (range.start() as u32) + 1; - for seq in Utf8Sequences::new(range.start(), range.end()) { - let mut seq = seq.as_slice().to_vec(); - seq.reverse(); - seqs.push(seq); - } - } - seqs.sort(); - for seq in seqs { - for utf8_range in seq.into_iter() { - print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end); - } - println!(); - } - println!("codepoint count: {}", char_count); - Ok(()) -} - -impl Args { - fn parse_one(&self) -> Result { - parse(&self.arg_pattern) - } - - fn parse_many(&self) -> Result> { - self.arg_patterns.iter().map(|s| parse(s)).collect() - } - - fn literals bool>( - &self, - exprs: &[Hir], - get_literals: F, - ) -> Literals { - let mut lits = Some(self.empty_literals()); - for e in exprs { - lits = lits.and_then(|mut lits| { - if !get_literals(&mut lits, e) { - None - } else { - Some(lits) - } - }); - } - lits.unwrap_or(self.empty_literals()) - } - - fn empty_literals(&self) -> Literals { - let mut lits = Literals::empty(); - lits.set_limit_size(self.flag_literal_limit); - lits.set_limit_class(self.flag_class_limit); - lits - } - - fn compiler(&self) -> Compiler { - Compiler::new().size_limit(self.flag_size_limit) - } -} - -fn parse(re: &str) -> Result { - use regex_syntax::ParserBuilder; - ParserBuilder::new() - .allow_invalid_utf8(true) - .build() - .parse(re) - .map_err(From::from) -} - -fn escape_unicode(bytes: &[u8]) -> String { - let show = match ::std::str::from_utf8(bytes) { - Ok(v) => v.to_string(), - Err(_) => escape_bytes(bytes), - }; - let mut space_escaped = String::new(); - for c in show.chars() { - if c.is_whitespace() { - let escaped = if c as u32 <= 0x7F { - escape_byte(c as u8) - } else { - if c as u32 <= 0xFFFF { - format!(r"\u{{{:04x}}}", c as u32) - } else { - format!(r"\U{{{:08x}}}", c as u32) - 
} - }; - space_escaped.push_str(&escaped); - } else { - space_escaped.push(c); - } - } - space_escaped -} - -fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s -} - -fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; - - let escaped: Vec = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() -} From 345f18ace57cc019f7835a0e11babb1f4fcd7f11 Mon Sep 17 00:00:00 2001 From: snsmac Date: Mon, 1 Aug 2022 20:38:40 +0200 Subject: [PATCH 05/79] syntax: \p{Sc} should map to \p{Currency_Symbol} 'sc' refers to the 'Currency_Symbol' general category, but is also the abbreviation for the 'Script' property. So when going through the canonicalization process, it would get normalized to 'Script' before being checked as a general category. We fix it by special casing it. See also #719 Fixes #835, #899 --- regex-syntax/src/unicode.rs | 7 ++++++- tests/unicode.rs | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 8194d7f55..84e781db4 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -243,7 +243,12 @@ impl<'a> ClassQuery<'a> { // a general category. (Currently, we don't even support the // 'Case_Folding' property. But if we do in the future, users will be // required to spell it out.) - if norm != "cf" { + // + // Also 'sc' refers to the 'Currency_Symbol' general category, but is + // also the abbreviation for the 'Script' property. So we avoid calling + // 'canonical_prop' for it too, which would erroneously normalize it + // to 'Script'. + if norm != "cf" && norm != "sc" { if let Some(canon) = canonical_prop(&norm)? { return Ok(CanonicalClassQuery::Binary(canon)); } diff --git a/tests/unicode.rs b/tests/unicode.rs index 9b3228624..748bbb79c 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -77,6 +77,7 @@ mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4))); // See: https://github.com/rust-lang/regex/issues/719 mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4))); mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4))); +mat!(uni_class_gencat_format_abbrev3, r"\p{Sc}", "$", Some((0, 1))); mat!( uni_class_gencat_initial_punctuation, r"\p{Initial_Punctuation}", From 6bbb06494f2d3f4bd36f848e2a5e2a464ab4acb1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 15 Mar 2023 08:40:42 -0400 Subject: [PATCH 06/79] syntax: \p{Lc} should map to \p{Cased_Letter} This is more similar to the \p{Cf} bug than the \p{Sc} bug, but basically, 'lc' is an abbreviation for both 'Cased_Letter' and 'Lowercase_Mapping'. Since we don't support the latter (currently), we make 'lc' map to 'Cased_Letter'. If we do ever add 'Lowercase_Mapping' in the future, then we will just require users to type out its full form. Fixes #965 --- regex-syntax/src/unicode.rs | 7 ++++++- tests/unicode.rs | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 84e781db4..5c22f66ac 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -248,7 +248,12 @@ impl<'a> ClassQuery<'a> { // also the abbreviation for the 'Script' property. So we avoid calling // 'canonical_prop' for it too, which would erroneously normalize it // to 'Script'. 
- if norm != "cf" && norm != "sc" { + // + // Another case: 'lc' is an abbreviation for the 'Cased_Letter' + // general category, but is also an abbreviation for the 'Lowercase_Mapping' + // property. We don't currently support the latter, so as with 'cf' + // above, we treat 'lc' as 'Cased_Letter'. + if norm != "cf" && norm != "sc" && norm != "lc" { if let Some(canon) = canonical_prop(&norm)? { return Ok(CanonicalClassQuery::Binary(canon)); } diff --git a/tests/unicode.rs b/tests/unicode.rs index 748bbb79c..d7dbdd31b 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -35,6 +35,8 @@ mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); // We should test more, but there's a lot. Write a script to generate more of // these tests. mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3))); +mat!(uni_class_gencat_cased_letter2, r"\p{gc=LC}", "A", Some((0, 3))); +mat!(uni_class_gencat_cased_letter3, r"\p{LC}", "A", Some((0, 3))); mat!( uni_class_gencat_close_punctuation, r"\p{Close_Punctuation}", From 906d1497eece129793da602037a9970c338b4fab Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 24 Aug 2022 14:46:35 -0400 Subject: [PATCH 07/79] syntax: add 'try_case_fold_simple' to 'Class' Previously this was only defined on 'ClassUnicode', but since 'Class' might contain a 'ClassUnicode', it should be defined here too. We don't need to update any call sites since this crate doesn't actually use 'Class::case_fold_simple' directly, and instead manipulates the underlying 'ClassUnicode' or 'ClassBytes'. --- regex-syntax/src/hir/mod.rs | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 156bcc284..55dc95c20 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -795,6 +795,15 @@ impl Class { /// /// If this is a byte oriented character class, then this will be limited /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled and the underlying class is Unicode oriented. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. pub fn case_fold_simple(&mut self) { match *self { Class::Unicode(ref mut x) => x.case_fold_simple(), @@ -802,6 +811,29 @@ impl Class { } } + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled and the underlying class is + /// Unicode oriented. + pub fn try_case_fold_simple( + &mut self, + ) -> result::Result<(), CaseFoldError> { + match *self { + Class::Unicode(ref mut x) => x.try_case_fold_simple()?, + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + Ok(()) + } + /// Negate this character class in place. 
/// /// After completion, this character class will contain precisely the From f59ebfa478f5d4caaa30d3ccabbc764c315f58df Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 26 Aug 2022 13:33:52 -0400 Subject: [PATCH 08/79] syntax: switch to Rust 2021 This effectively bumps the MSRV of 'regex' to Rust 1.56, which was released in Oct 2021. It's not quite a year at the time of writing, but I expect it will be a year by the time this change is released. --- regex-syntax/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 949316291..6c92717e3 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -8,7 +8,7 @@ documentation = "https://docs.rs/regex-syntax" homepage = "https://github.com/rust-lang/regex" description = "A regular expression parser." workspace = ".." -edition = "2018" +edition = "2021" rust-version = "1.60.0" # Features are documented in the "Crate features" section of the crate docs: From a23911daada7bb584ec77cc3e2c33c1a73d973fd Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 26 Aug 2022 13:37:17 -0400 Subject: [PATCH 09/79] syntax: remove all uses of 'as' It turns out that all uses of 'as' in the regex-syntax crate can be replaced with either explicitly infallible routines (like 'u32::from(char)'), or with routines that will panic on failure. These panics are strictly better than truncating casts that might otherwise lead to subtle bugs in the context of this crate. (Namely, we never really care about the perf effects here, since regex parsing is just never a bottleneck.) --- regex-syntax/src/ast/mod.rs | 11 +- regex-syntax/src/ast/print.rs | 14 +- regex-syntax/src/hir/interval.rs | 8 +- regex-syntax/src/hir/literal/mod.rs | 35 ++--- regex-syntax/src/hir/mod.rs | 41 +++++- regex-syntax/src/hir/print.rs | 10 +- regex-syntax/src/hir/translate.rs | 195 +++++++++++++--------------- regex-syntax/src/unicode.rs | 3 +- regex-syntax/src/utf8.rs | 20 ++- 9 files changed, 181 insertions(+), 156 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9db9afaf1..8ea740735 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -615,11 +615,12 @@ impl Literal { /// If this literal was written as a `\x` hex escape, then this returns /// the corresponding byte value. Otherwise, this returns `None`. pub fn byte(&self) -> Option { - let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); - if self.c as u32 <= 255 && self.kind == short_hex { - Some(self.c as u8) - } else { - None + match self.kind { + LiteralKind::HexFixed(HexLiteralKind::X) => { + // MSRV(1.59): Use 'u8::try_from(self.c)' instead. 
+ u8::try_from(u32::from(self.c)).ok() + } + _ => None, } } } diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 045de2eaf..f6b2462c0 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -213,24 +213,24 @@ impl Writer { match ast.kind { Verbatim => self.wtr.write_char(ast.c), Punctuation => write!(self.wtr, r"\{}", ast.c), - Octal => write!(self.wtr, r"\{:o}", ast.c as u32), + Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)), HexFixed(ast::HexLiteralKind::X) => { - write!(self.wtr, r"\x{:02X}", ast.c as u32) + write!(self.wtr, r"\x{:02X}", u32::from(ast.c)) } HexFixed(ast::HexLiteralKind::UnicodeShort) => { - write!(self.wtr, r"\u{:04X}", ast.c as u32) + write!(self.wtr, r"\u{:04X}", u32::from(ast.c)) } HexFixed(ast::HexLiteralKind::UnicodeLong) => { - write!(self.wtr, r"\U{:08X}", ast.c as u32) + write!(self.wtr, r"\U{:08X}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::X) => { - write!(self.wtr, r"\x{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::UnicodeShort) => { - write!(self.wtr, r"\u{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::UnicodeLong) => { - write!(self.wtr, r"\U{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c)) } Special(ast::SpecialLiteralKind::Bell) => { self.wtr.write_str(r"\a") diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 56698c53a..d6e83f7b2 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -481,7 +481,7 @@ impl Bound for u8 { u8::MAX } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { self.checked_add(1).unwrap() @@ -499,20 +499,20 @@ impl Bound for char { '\u{10FFFF}' } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { match self { '\u{D7FF}' => '\u{E000}', - c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), } } fn decrement(self) -> Self { match self { '\u{E000}' => '\u{D7FF}', - c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), } } } diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index fbc5d3c97..58b8871ed 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -475,8 +475,8 @@ impl Literals { base = vec![Literal::empty()]; } for r in cls.iter() { - let (s, e) = (r.start as u32, r.end as u32 + 1); - for c in (s..e).filter_map(char::from_u32) { + let (s, e) = (u32::from(r.start), u32::from(r.end)); + for c in (s..=e).filter_map(char::from_u32) { for mut lit in base.clone() { let mut bytes = c.to_string().into_bytes(); if reverse { @@ -502,8 +502,7 @@ impl Literals { base = vec![Literal::empty()]; } for r in cls.iter() { - let (s, e) = (r.start as u32, r.end as u32 + 1); - for b in (s..e).map(|b| b as u8) { + for b in r.start..=r.end { for mut lit in base.clone() { lit.push(b); self.lits.push(lit); @@ -784,7 +783,10 @@ fn repeat_range_literals( lits: &mut Literals, mut f: F, ) { - if min == 0 { + // If 'min' somehow overflows usize, then we just treat it as 0, which is + // the most conservative thing we can do. + let umin = usize::try_from(min).unwrap_or(0); + if umin == 0 { // This is a bit conservative. 
If `max` is set, then we could // treat this as a finite set of alternations. For now, we // just treat it as `e*`. @@ -797,11 +799,11 @@ fn repeat_range_literals( lits, ); } else { - if min > 0 { - let n = cmp::min(lits.limit_size, min as usize); + if umin > 0 { + let n = cmp::min(lits.limit_size, umin); let es = iter::repeat(e.clone()).take(n).collect(); f(&Hir::concat(es), lits); - if n < min as usize || lits.contains_empty() { + if n < umin || lits.contains_empty() { lits.cut(); } } @@ -928,12 +930,13 @@ fn escape_unicode(bytes: &[u8]) -> String { let mut space_escaped = String::new(); for c in show.chars() { if c.is_whitespace() { - let escaped = if c as u32 <= 0x7F { - escape_byte(c as u8) - } else if c as u32 <= 0xFFFF { - format!(r"\u{{{:04x}}}", c as u32) + let cp = u32::from(c); + let escaped = if cp <= 0x7F { + escape_byte(u8::try_from(cp).unwrap()) + } else if cp <= 0xFFFF { + format!(r"\u{{{:04x}}}", cp) } else { - format!(r"\U{{{:08x}}}", c as u32) + format!(r"\U{{{:08x}}}", cp) }; space_escaped.push_str(&escaped); } else { @@ -959,13 +962,11 @@ fn escape_byte(byte: u8) -> String { } fn cls_char_count(cls: &hir::ClassUnicode) -> usize { - cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::() - as usize + cls.iter().map(|&r| r.len()).sum() } fn cls_byte_count(cls: &hir::ClassBytes) -> usize { - cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::() - as usize + cls.iter().map(|&r| r.len()).sum() } #[cfg(test)] diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 55dc95c20..e363e2fb6 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -90,6 +90,13 @@ pub enum ErrorKind { __Nonexhaustive, } +// BREADCRUMBS: +// +// Remove EmptyClassNotAllowed +// Make errors non_exhaustive +// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc) +// Get rid of deprecated things + impl ErrorKind { // TODO: Remove this method entirely on the next breaking semver release. #[allow(deprecated)] @@ -1013,12 +1020,12 @@ impl fmt::Debug for ClassUnicodeRange { { self.start.to_string() } else { - format!("0x{:X}", self.start as u32) + format!("0x{:X}", u32::from(self.start)) }; let end = if !self.end.is_whitespace() && !self.end.is_control() { self.end.to_string() } else { - format!("0x{:X}", self.end as u32) + format!("0x{:X}", u32::from(self.end)) }; f.debug_struct("ClassUnicodeRange") .field("start", &start) @@ -1058,10 +1065,9 @@ impl Interval for ClassUnicodeRange { if !unicode::contains_simple_case_mapping(self.start, self.end)? { return Ok(()); } - let start = self.start as u32; - let end = (self.end as u32).saturating_add(1); + let (start, end) = (u32::from(self.start), u32::from(self.end)); let mut next_simple_cp = None; - for cp in (start..end).filter_map(char::from_u32) { + for cp in (start..=end).filter_map(char::from_u32) { if next_simple_cp.map_or(false, |next| cp < next) { continue; } @@ -1104,6 +1110,18 @@ impl ClassUnicodeRange { pub fn end(&self) -> char { self.end } + + /// Returns the number of codepoints in this range. + pub fn len(&self) -> usize { + let diff = 1 + u32::from(self.end) - u32::from(self.start); + // This is likely to panic in 16-bit targets since a usize can only fit + // 2^16. It's not clear what to do here, other than to return an error + // when building a Unicode class that contains a range whose length + // overflows usize. (Which, to be honest, is probably quite common on + // 16-bit targets. For example, this would imply that '.' and '\p{any}' + // would be impossible to build.) 
+ usize::try_from(diff).expect("char class len fits in usize") + } } /// A set of characters represented by arbitrary bytes (where one byte @@ -1291,18 +1309,27 @@ impl ClassBytesRange { pub fn end(&self) -> u8 { self.end } + + /// Returns the number of bytes in this range. + pub fn len(&self) -> usize { + usize::from(self.end.checked_sub(self.start).unwrap()) + .checked_add(1) + .unwrap() + } } impl fmt::Debug for ClassBytesRange { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut debug = f.debug_struct("ClassBytesRange"); if self.start <= 0x7F { - debug.field("start", &(self.start as char)); + let ch = char::try_from(self.start).unwrap(); + debug.field("start", &ch); } else { debug.field("start", &self.start); } if self.end <= 0x7F { - debug.field("end", &(self.end as char)); + let ch = char::try_from(self.start).unwrap(); + debug.field("end", &ch); } else { debug.field("end", &self.end); } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index b71f3897c..433f9bf11 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -217,18 +217,16 @@ impl Writer { } fn write_literal_byte(&mut self, b: u8) -> fmt::Result { - let c = b as char; - if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { - self.write_literal_char(c) + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "(?-u:\\x{:02X})", b) } } fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { - let c = b as char; - if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { - self.write_literal_char(c) + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "\\x{:02X}", b) } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 890e1608b..04409cf95 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -656,7 +656,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { Some(byte) => byte, }; if byte <= 0x7F { - return Ok(hir::Literal::Unicode(byte as char)); + return Ok(hir::Literal::Unicode(char::try_from(byte).unwrap())); } if !self.trans().allow_invalid_utf8 { return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); @@ -704,7 +704,12 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( - c as u8, c as u8, + // OK because 'c.len_utf8() == 1' which in turn implies + // that 'c' is ASCII. + // + // MSRV(1.59): Use 'u8::try_from(c)' instead. 
+ u8::try_from(u32::from(c)).unwrap(), + u8::try_from(u32::from(c)).unwrap(), )]); cls.case_fold_simple(); Ok(Hir::class(hir::Class::Bytes(cls))) @@ -848,9 +853,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ast: &ast::ClassAscii, ) -> Result { let mut cls = hir::ClassUnicode::new( - ascii_class(&ast.kind) - .iter() - .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), + ascii_class_as_chars(&ast.kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), ); self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) @@ -862,8 +866,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ) -> Result { let mut cls = hir::ClassBytes::new( ascii_class(&ast.kind) - .iter() - .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), + .map(|(s, e)| hir::ClassBytesRange::new(s, e)), ); self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) @@ -985,8 +988,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { match self.literal_to_char(ast)? { hir::Literal::Byte(byte) => Ok(byte), hir::Literal::Unicode(ch) => { - if ch <= 0x7F as char { - Ok(ch as u8) + let cp = u32::from(ch); + if cp <= 0x7F { + Ok(u8::try_from(cp).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. Byte classes don't @@ -1085,38 +1089,44 @@ impl Flags { fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { let ranges: Vec<_> = ascii_class(kind) - .iter() - .cloned() - .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)) + .map(|(s, e)| hir::ClassBytesRange::new(s, e)) .collect(); hir::ClassBytes::new(ranges) } -fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { +fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator { use crate::ast::ClassAsciiKind::*; - match *kind { - Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')], - Alpha => &[('A', 'Z'), ('a', 'z')], - Ascii => &[('\x00', '\x7F')], - Blank => &[('\t', '\t'), (' ', ' ')], - Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], - Digit => &[('0', '9')], - Graph => &[('!', '~')], - Lower => &[('a', 'z')], - Print => &[(' ', '~')], - Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')], + + let slice: &'static [(u8, u8)] = match *kind { + Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], + Alpha => &[(b'A', b'Z'), (b'a', b'z')], + Ascii => &[(b'\x00', b'\x7F')], + Blank => &[(b'\t', b'\t'), (b' ', b' ')], + Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], + Digit => &[(b'0', b'9')], + Graph => &[(b'!', b'~')], + Lower => &[(b'a', b'z')], + Print => &[(b' ', b'~')], + Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], Space => &[ - ('\t', '\t'), - ('\n', '\n'), - ('\x0B', '\x0B'), - ('\x0C', '\x0C'), - ('\r', '\r'), - (' ', ' '), + (b'\t', b'\t'), + (b'\n', b'\n'), + (b'\x0B', b'\x0B'), + (b'\x0C', b'\x0C'), + (b'\r', b'\r'), + (b' ', b' '), ], - Upper => &[('A', 'Z')], - Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], - Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')], - } + Upper => &[(b'A', b'Z')], + Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], + Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], + }; + slice.iter().copied() +} + +fn ascii_class_as_chars( + kind: &ast::ClassAsciiKind, +) -> impl Iterator { + ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) } #[cfg(test)] @@ -1126,7 +1136,7 @@ mod tests { use crate::hir::{self, Hir, HirKind}; use crate::unicode::{self, ClassQuery}; - use super::{ascii_class, TranslatorBuilder}; + use super::{ascii_class, ascii_class_as_chars, TranslatorBuilder}; // We create these errors to 
compare with real hir::Errors in the tests. // We define equality between TestError and hir::Error to disregard the @@ -1281,6 +1291,19 @@ mod tests { Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) } + fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( + ascii_class_as_chars(kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), + ))) + } + + fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Bytes(hir::ClassBytes::new( + ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), + ))) + } + fn hir_uclass(ranges: &[(char, char)]) -> Hir { let ranges: Vec = ranges .iter() @@ -1297,18 +1320,6 @@ mod tests { Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) } - fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { - let ranges: Vec = ranges - .iter() - .map(|&(s, e)| { - assert!(s as u32 <= 0x7F); - assert!(e as u32 <= 0x7F); - hir::ClassBytesRange::new(s as u8, e as u8) - }) - .collect(); - Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) - } - fn hir_case_fold(expr: Hir) -> Hir { match expr.into_kind() { HirKind::Class(mut cls) => { @@ -1856,64 +1867,64 @@ mod tests { fn class_ascii() { assert_eq!( t("[[:alnum:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)) + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum) ); assert_eq!( t("[[:alpha:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha)) + hir_ascii_uclass(&ast::ClassAsciiKind::Alpha) ); assert_eq!( t("[[:ascii:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii)) + hir_ascii_uclass(&ast::ClassAsciiKind::Ascii) ); assert_eq!( t("[[:blank:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank)) + hir_ascii_uclass(&ast::ClassAsciiKind::Blank) ); assert_eq!( t("[[:cntrl:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl)) + hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl) ); assert_eq!( t("[[:digit:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_uclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t("[[:graph:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph)) + hir_ascii_uclass(&ast::ClassAsciiKind::Graph) ); assert_eq!( t("[[:lower:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)) + hir_ascii_uclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("[[:print:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Print)) + hir_ascii_uclass(&ast::ClassAsciiKind::Print) ); assert_eq!( t("[[:punct:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct)) + hir_ascii_uclass(&ast::ClassAsciiKind::Punct) ); assert_eq!( t("[[:space:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_uclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t("[[:upper:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper)) + hir_ascii_uclass(&ast::ClassAsciiKind::Upper) ); assert_eq!( t("[[:word:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_uclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t("[[:xdigit:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit)) + hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit) ); assert_eq!( t("[[:^lower:]]"), - hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) + hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower)) ); #[cfg(feature = "unicode-case")] assert_eq!( @@ -1928,13 +1939,11 @@ mod tests { assert_eq!( t("(?-u)[[:lower:]]"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower)) + hir_ascii_bclass(&ast::ClassAsciiKind::Lower) ); 
assert_eq!( t("(?i-u)[[:lower:]]"), - hir_case_fold(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Lower - ))) + hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower)) ); assert_eq!( @@ -1965,14 +1974,14 @@ mod tests { assert_eq!( t("[[:alnum:][:^ascii:]]"), hir_union( - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum), hir_uclass(&[('\u{80}', '\u{10FFFF}')]), ), ); assert_eq!( t_bytes("(?-u)[[:alnum:][:^ascii:]]"), hir_union( - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_ascii_bclass(&ast::ClassAsciiKind::Alnum), hir_bclass(&[(0x80, 0xFF)]), ), ); @@ -2024,65 +2033,53 @@ mod tests { // ASCII only assert_eq!( t(r"(?-u)\d"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?-u)\s"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?-u)\w"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t(r"(?i-u)\d"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?i-u)\s"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?i-u)\w"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); // ASCII only, negated assert_eq!( t(r"(?-u)\D"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t(r"(?-u)\S"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Space - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( t(r"(?-u)\W"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); assert_eq!( t(r"(?i-u)\D"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t(r"(?i-u)\S"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Space - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( t(r"(?i-u)\W"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); } @@ -2826,9 +2823,7 @@ mod tests { #[cfg(feature = "unicode-perl")] assert_eq!( t_bytes(r"(?-u)[^\w&&\d]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)[^[a-z&&a-c]]"), @@ -2836,19 +2831,15 @@ mod tests { ); assert_eq!( t_bytes(r"(?-u)[^[\w&&\d]]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)[^[^\w&&\d]]"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); } diff --git a/regex-syntax/src/unicode.rs 
b/regex-syntax/src/unicode.rs index 5c22f66ac..0b716f5e6 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -419,7 +419,8 @@ pub fn is_word_character(c: char) -> result::Result { use crate::unicode_tables::perl_word::PERL_WORD; use std::cmp::Ordering; - if c <= 0x7F as char && is_word_byte(c as u8) { + // MSRV(1.59): Use 'u8::try_from(c)' instead. + if u8::try_from(u32::from(c)).map_or(false, is_word_byte) { return Ok(true); } Ok(PERL_WORD diff --git a/regex-syntax/src/utf8.rs b/regex-syntax/src/utf8.rs index b9c865532..b00cd7dba 100644 --- a/regex-syntax/src/utf8.rs +++ b/regex-syntax/src/utf8.rs @@ -306,7 +306,7 @@ impl Utf8Sequences { /// given. pub fn new(start: char, end: char) -> Self { let mut it = Utf8Sequences { range_stack: vec![] }; - it.push(start as u32, end as u32); + it.push(u32::from(start), u32::from(end)); it } @@ -317,7 +317,7 @@ impl Utf8Sequences { #[doc(hidden)] pub fn reset(&mut self, start: char, end: char) { self.range_stack.clear(); - self.push(start as u32, end as u32); + self.push(u32::from(start), u32::from(end)); } fn push(&mut self, start: u32, end: u32) { @@ -416,7 +416,9 @@ impl ScalarRange { /// values in this range can be encoded as a single byte. fn as_ascii(&self) -> Option { if self.is_ascii() { - Some(Utf8Range::new(self.start as u8, self.end as u8)) + let start = u8::try_from(self.start).unwrap(); + let end = u8::try_from(self.end).unwrap(); + Some(Utf8Range::new(start, end)) } else { None } @@ -472,7 +474,11 @@ mod tests { "Sequence ({:X}, {:X}) contains range {:?}, \ which matches surrogate code point {:X} \ with encoded bytes {:?}", - start as u32, end as u32, r, cp, buf, + u32::from(start), + u32::from(end), + r, + cp, + buf, ); } } @@ -579,9 +585,9 @@ mod tests { assert!(0xD800 <= cp && cp < 0xE000); let mut dst = [0; 3]; - dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B; - dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT; - dst[2] = (cp & 0x3F) as u8 | TAG_CONT; + dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B; + dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT; + dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT; dst } } From b147fe3d7155b48c56b4829d86e874259b5a8b1d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 26 Aug 2022 13:56:32 -0400 Subject: [PATCH 10/79] syntax: remove 'std::error::Error::description' impls This method was deprecated a while ago, but we kept it around because it wasn't worth a breaking release to remove them. This also simplifies some imports. --- regex-syntax/src/ast/mod.rs | 42 +----------------------------------- regex-syntax/src/error.rs | 13 +---------- regex-syntax/src/hir/mod.rs | 43 ++++++++++++------------------------- 3 files changed, 16 insertions(+), 82 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 8ea740735..6ca60a199 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -3,7 +3,6 @@ Defines an abstract syntax for regular expressions. */ use std::cmp::Ordering; -use std::error; use std::fmt; pub use crate::ast::visitor::{visit, Visitor}; @@ -178,46 +177,7 @@ pub enum ErrorKind { __Nonexhaustive, } -impl error::Error for Error { - // TODO: Remove this method entirely on the next breaking semver release. 
- #[allow(deprecated)] - fn description(&self) -> &str { - use self::ErrorKind::*; - match self.kind { - CaptureLimitExceeded => "capture group limit exceeded", - ClassEscapeInvalid => "invalid escape sequence in character class", - ClassRangeInvalid => "invalid character class range", - ClassRangeLiteral => "invalid range boundary, must be a literal", - ClassUnclosed => "unclosed character class", - DecimalEmpty => "empty decimal literal", - DecimalInvalid => "invalid decimal literal", - EscapeHexEmpty => "empty hexadecimal literal", - EscapeHexInvalid => "invalid hexadecimal literal", - EscapeHexInvalidDigit => "invalid hexadecimal digit", - EscapeUnexpectedEof => "unexpected eof (escape sequence)", - EscapeUnrecognized => "unrecognized escape sequence", - FlagDanglingNegation => "dangling flag negation operator", - FlagDuplicate { .. } => "duplicate flag", - FlagRepeatedNegation { .. } => "repeated negation", - FlagUnexpectedEof => "unexpected eof (flag)", - FlagUnrecognized => "unrecognized flag", - GroupNameDuplicate { .. } => "duplicate capture group name", - GroupNameEmpty => "empty capture group name", - GroupNameInvalid => "invalid capture group name", - GroupNameUnexpectedEof => "unclosed capture group name", - GroupUnclosed => "unclosed group", - GroupUnopened => "unopened group", - NestLimitExceeded(_) => "nest limit exceeded", - RepetitionCountInvalid => "invalid repetition count range", - RepetitionCountUnclosed => "unclosed counted repetition", - RepetitionMissing => "repetition operator missing expression", - UnicodeClassInvalid => "invalid Unicode character class", - UnsupportedBackreference => "backreferences are not supported", - UnsupportedLookAround => "look-around is not supported", - _ => unreachable!(), - } - } -} +impl std::error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { diff --git a/regex-syntax/src/error.rs b/regex-syntax/src/error.rs index 1230d2fc5..bd3fcc079 100644 --- a/regex-syntax/src/error.rs +++ b/regex-syntax/src/error.rs @@ -1,5 +1,4 @@ use std::cmp; -use std::error; use std::fmt; use std::result; @@ -39,17 +38,7 @@ impl From for Error { } } -impl error::Error for Error { - // TODO: Remove this method entirely on the next breaking semver release. - #[allow(deprecated)] - fn description(&self) -> &str { - match *self { - Error::Parse(ref x) => x.description(), - Error::Translate(ref x) => x.description(), - _ => unreachable!(), - } - } -} +impl std::error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index e363e2fb6..03238caad 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -3,7 +3,6 @@ Defines a high-level intermediate representation for regular expressions. */ use std::char; use std::cmp; -use std::error; use std::fmt; use std::result; use std::u8; @@ -97,12 +96,19 @@ pub enum ErrorKind { // Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc) // Get rid of deprecated things -impl ErrorKind { - // TODO: Remove this method entirely on the next breaking semver release. 
- #[allow(deprecated)] - fn description(&self) -> &str { +impl std::error::Error for Error {} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl fmt::Display for ErrorKind { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::ErrorKind::*; - match *self { + + let msg = match *self { UnicodeNotAllowed => "Unicode not allowed here", InvalidUtf8 => "pattern can match invalid UTF-8", UnicodePropertyNotFound => "Unicode property not found", @@ -117,29 +123,8 @@ impl ErrorKind { } EmptyClassNotAllowed => "empty character classes are not allowed", __Nonexhaustive => unreachable!(), - } - } -} - -impl error::Error for Error { - // TODO: Remove this method entirely on the next breaking semver release. - #[allow(deprecated)] - fn description(&self) -> &str { - self.kind.description() - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - crate::error::Formatter::from(self).fmt(f) - } -} - -impl fmt::Display for ErrorKind { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // TODO: Remove this on the next breaking semver release. - #[allow(deprecated)] - f.write_str(self.description()) + }; + f.write_str(msg) } } From 06df9ac7dd5f32899f49436ce1353df72ed38098 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 26 Aug 2022 14:01:02 -0400 Subject: [PATCH 11/79] syntax: remove '__Nonexhaustive' hack, use #[non_exhaustive] This marks the various error types as '#[non_exhaustive]' instead of using a __Nonexhaustive variant hack. Closes #884 --- regex-syntax/src/ast/mod.rs | 12 ++++-------- regex-syntax/src/error.rs | 12 ++++-------- regex-syntax/src/hir/mod.rs | 12 ++++-------- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 6ca60a199..7c9dae7a0 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -64,6 +64,10 @@ impl Error { } /// The type of an error that occurred while building an AST. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// The capturing group limit was exceeded. @@ -168,13 +172,6 @@ pub enum ErrorKind { /// `(? unreachable!(), } } } diff --git a/regex-syntax/src/error.rs b/regex-syntax/src/error.rs index bd3fcc079..6e7fa7466 100644 --- a/regex-syntax/src/error.rs +++ b/regex-syntax/src/error.rs @@ -9,6 +9,10 @@ use crate::hir; pub type Result = result::Result; /// This error type encompasses any error that can be returned by this crate. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum Error { /// An error that occurred while translating concrete syntax into abstract @@ -17,13 +21,6 @@ pub enum Error { /// An error that occurred while translating abstract syntax into a high /// level intermediate representation (HIR). Translate(hir::Error), - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. (Otherwise, adding a new variant - /// could break existing code.) 
- #[doc(hidden)] - __Nonexhaustive, } impl From for Error { @@ -45,7 +42,6 @@ impl fmt::Display for Error { match *self { Error::Parse(ref x) => x.fmt(f), Error::Translate(ref x) => x.fmt(f), - _ => unreachable!(), } } } diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 03238caad..6a91c2588 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -52,6 +52,10 @@ impl Error { } /// The type of an error that occurred while building an `Hir`. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// This error occurs when a Unicode feature is used when Unicode @@ -80,13 +84,6 @@ pub enum ErrorKind { /// Note that this restriction in the translator may be removed in the /// future. EmptyClassNotAllowed, - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. (Otherwise, adding a new variant - /// could break existing code.) - #[doc(hidden)] - __Nonexhaustive, } // BREADCRUMBS: @@ -122,7 +119,6 @@ impl fmt::Display for ErrorKind { (make sure the unicode-case feature is enabled)" } EmptyClassNotAllowed => "empty character classes are not allowed", - __Nonexhaustive => unreachable!(), }; f.write_str(msg) } From 5a770dc819eceb7b415e27075bce2435bde39f5f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 26 Aug 2022 19:11:04 -0400 Subject: [PATCH 12/79] syntax: permit empty character classes An empty character class is effectively a way to write something that can never match anything. The regex crate has pretty much always returned an error for such things because it was never taught how to handle "always fail" states. Partly because I just didn't think about it when initially writing the regex engines and partly because it isn't often useful. With that said, it should be supported for completeness and because there is no real reason to not support it. Moreover, it can be useful in certain contexts where regexes are generated and you want to insert an expression that can never match. It's somewhat contrived, but it happens when the interface is a regex pattern. Previously, the ban on empty character classes was implemented in the regex-syntax crate. But with the rewrite in #656 getting closer and closer to landing, it's now time to relax this restriction. However, we do keep the overall restriction in the 'regex' API by returning an error in the NFA compiler. Once #656 is done, the new regex engines will permit this case. --- regex-syntax/src/hir/mod.rs | 14 ------ regex-syntax/src/hir/translate.rs | 73 +++---------------------------- src/compile.rs | 12 ++++- 3 files changed, 15 insertions(+), 84 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6a91c2588..b16df20c8 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -78,21 +78,8 @@ pub enum ErrorKind { /// available, and the regular expression required Unicode aware case /// insensitivity. UnicodeCaseUnavailable, - /// This occurs when the translator attempts to construct a character class - /// that is empty. - /// - /// Note that this restriction in the translator may be removed in the - /// future. 
- EmptyClassNotAllowed, } -// BREADCRUMBS: -// -// Remove EmptyClassNotAllowed -// Make errors non_exhaustive -// Simplify repetitions (get rid of ZeroOrOne, OneOrMore etc) -// Get rid of deprecated things - impl std::error::Error for Error {} impl fmt::Display for Error { @@ -118,7 +105,6 @@ impl fmt::Display for ErrorKind { "Unicode-aware case insensitivity matching is not available \ (make sure the unicode-case feature is enabled)" } - EmptyClassNotAllowed => "empty character classes are not allowed", }; f.write_str(msg) } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 04409cf95..d7686988a 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -322,12 +322,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast.negated, &mut cls, )?; - if cls.ranges().is_empty() { - return Err(self.error( - ast.span, - ErrorKind::EmptyClassNotAllowed, - )); - } let expr = Hir::class(hir::Class::Unicode(cls)); self.push(HirFrame::Expr(expr)); } else { @@ -337,13 +331,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast.negated, &mut cls, )?; - if cls.ranges().is_empty() { - return Err(self.error( - ast.span, - ErrorKind::EmptyClassNotAllowed, - )); - } - let expr = Hir::class(hir::Class::Bytes(cls)); self.push(HirFrame::Expr(expr)); } @@ -839,11 +826,6 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ast_class.negated, class, )?; - if class.ranges().is_empty() { - let err = self - .error(ast_class.span, ErrorKind::EmptyClassNotAllowed); - return Err(err); - } } result } @@ -2357,16 +2339,7 @@ mod tests { #[test] #[cfg(feature = "unicode-gencat")] fn class_unicode_any_empty() { - assert_eq!( - t_err(r"\P{any}"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"\P{any}"), hir_uclass(&[]),); } #[test] @@ -2518,27 +2491,9 @@ mod tests { } ); #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] - assert_eq!( - t_err(r"[^\s\S]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),); #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] - assert_eq!( - t_err(r"(?-u)[^\s\S]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(12, 1, 13) - ), - } - ); + assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),); } #[test] @@ -2686,27 +2641,9 @@ mod tests { hir_uclass(&[('C', 'C'), ('c', 'c')]) ); - assert_eq!( - t_err(r"[^a-c[^c]]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(10, 1, 11) - ), - } - ); + assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),); #[cfg(feature = "unicode-case")] - assert_eq!( - t_err(r"(?i)[^a-c[^c]]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(4, 1, 5), - Position::new(14, 1, 15) - ), - } - ); + assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),); } #[test] diff --git a/src/compile.rs b/src/compile.rs index 90ca25015..361ea4cb7 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -457,7 +457,11 @@ impl Compiler { fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { use std::mem::size_of; - assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } if 
self.compiled.uses_bytes() { Ok(Some(CompileClass { c: self, ranges }.compile()?)) } else { @@ -482,7 +486,11 @@ impl Compiler { &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty { - debug_assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } let first_split_entry = self.insts.len(); let mut holes = vec![]; From 2b2e20aa2fdfd18046edcda9af43becebd109401 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 26 Aug 2022 19:41:37 -0400 Subject: [PATCH 13/79] syntax: reject '(?-u)\W' when UTF-8 mode is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Unicode mode is disabled (i.e., (?-u)), the Perl character classes (\w, \d and \s) revert to their ASCII definitions. The negated forms of these classes are also derived from their ASCII definitions, and this means that they may actually match bytes outside of ASCII and thus possibly invalid UTF-8. For this reason, when the translator is configured to only produce HIR that matches valid UTF-8, '(?-u)\W' should be rejected. Previously, it was not being rejected, which could actually lead to matches that produced offsets that split codepoints, and thus lead to panics when match offsets are used to slice a string. For example, this code fn main() { let re = regex::Regex::new(r"(?-u)\W").unwrap(); let haystack = "☃"; if let Some(m) = re.find(haystack) { println!("{:?}", &haystack[m.range()]); } } panics with byte index 1 is not a char boundary; it is inside '☃' (bytes 0..3) of `☃` That is, it reports a match at 0..1, which is technically correct, but the regex itself should have been rejected in the first place since the top-level Regex API always has UTF-8 mode enabled. Also, many of the replacement tests were using '(?-u)\W' (or similar) for some reason. I'm not sure why, so I just removed the '(?-u)' to make those tests pass. Whether Unicode is enabled or not doesn't seem to be an interesting detail for those tests. (All haystacks and replacements appear to be ASCII.) 
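As a quick sanity check of the new behavior, here is a minimal sketch using
regex-syntax directly (assuming the existing ParserBuilder::allow_invalid_utf8
knob; the exact builder method is incidental to the fix and this snippet is
illustrative, not part of the diff):

    use regex_syntax::ParserBuilder;

    fn main() {
        // With the default "only match valid UTF-8" guarantee, '(?-u)\W' is
        // now rejected at translation time, since the negated ASCII word
        // class can match bytes like \xFF.
        assert!(ParserBuilder::new().build().parse(r"(?-u)\W").is_err());

        // Explicitly opting into matching invalid UTF-8 still permits it.
        let mut parser =
            ParserBuilder::new().allow_invalid_utf8(true).build();
        assert!(parser.parse(r"(?-u)\W").is_ok());
    }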
Fixes #895, Partially addresses #738 --- regex-syntax/src/hir/translate.rs | 95 +++++++++++++++++++++++++++---- tests/replace.rs | 16 +++--- 2 files changed, 92 insertions(+), 19 deletions(-) diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index d7686988a..988384ede 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -305,7 +305,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { let hcls = hir::Class::Unicode(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } else { - let cls = self.hir_perl_byte_class(x); + let cls = self.hir_perl_byte_class(x)?; let hcls = hir::Class::Bytes(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } @@ -445,7 +445,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { - let xcls = self.hir_perl_byte_class(x); + let xcls = self.hir_perl_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); @@ -879,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, - ) -> hir::ClassBytes { + ) -> Result { use crate::ast::ClassPerlKind::*; assert!(!self.flags().unicode()); @@ -893,7 +893,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if ast_class.negated { class.negate(); } - class + // Negating a Perl byte class is likely to cause it to match invalid + // UTF-8. That's only OK if the translator is configured to allow such + // things. + if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); + } + Ok(class) } /// Converts the given Unicode specific error to an HIR translation error. @@ -1971,7 +1977,7 @@ mod tests { #[test] #[cfg(feature = "unicode-perl")] - fn class_perl() { + fn class_perl_unicode() { // Unicode assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); @@ -2011,7 +2017,10 @@ mod tests { ); #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + } + #[test] + fn class_perl_ascii() { // ASCII only assert_eq!( t(r"(?-u)\d"), @@ -2040,29 +2049,93 @@ mod tests { // ASCII only, negated assert_eq!( - t(r"(?-u)\D"), + t_bytes(r"(?-u)\D"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?-u)\S"), + t_bytes(r"(?-u)\S"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?-u)\W"), + t_bytes(r"(?-u)\W"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); assert_eq!( - t(r"(?i-u)\D"), + t_bytes(r"(?i-u)\D"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?i-u)\S"), + t_bytes(r"(?i-u)\S"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?i-u)\W"), + t_bytes(r"(?i-u)\W"), hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); + + // ASCII only, negated, with UTF-8 mode enabled. + // In this case, negating any Perl class results in an error because + // all such classes can match invalid UTF-8. 
+ assert_eq!( + t_err(r"(?-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); } #[test] diff --git a/tests/replace.rs b/tests/replace.rs index d65be072f..f23c57551 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -15,7 +15,7 @@ replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ"); replace!( groups, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", t!("$2 $1"), "w2 w1" @@ -23,7 +23,7 @@ replace!( replace!( double_dollar, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", t!("$2 $$1"), "w2 $1" @@ -33,7 +33,7 @@ replace!( replace!( named, replace_all, - r"(?-u)(?P\S+)\s+(?P\S+)(?P\s*)", + r"(?P[^ ]+)[ ]+(?P[^ ]+)(?P[ ]*)", "w1 w2 w3 w4", t!("$last $first$space"), "w2 w1 w4 w3" @@ -51,7 +51,7 @@ replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); replace!( simple_expand, replace_all, - r"(?-u)(\w) (\w)", + r"([a-z]) ([a-z])", "a b", t!("$2 $1"), "b a" @@ -59,7 +59,7 @@ replace!( replace!( literal_dollar1, replace_all, - r"(?-u)(\w+) (\w+)", + r"([a-z]+) ([a-z]+)", "a b", t!("$$1"), "$1" @@ -67,7 +67,7 @@ replace!( replace!( literal_dollar2, replace_all, - r"(?-u)(\w+) (\w+)", + r"([a-z]+) ([a-z]+)", "a b", t!("$2 $$c $1"), "b $c a" @@ -75,7 +75,7 @@ replace!( replace!( no_expand1, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", no_expand!("$2 $1"), "$2 $1" @@ -83,7 +83,7 @@ replace!( replace!( no_expand2, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", no_expand!("$$1"), "$$1" From 377232bfb6fab36a37598ef6ce889fa0b66459e4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 27 Aug 2022 15:15:07 -0400 Subject: [PATCH 14/79] syntax: add 'std' feature In effect, this adds support for no_std by depending on only core and alloc. There is still currently some benefit to enabling std support, namely, getting the 'std::error::Error' trait impls for the various error types. (Although, it seems like the 'Error' trait is going to get moved to 'core' finally.) Otherwise, the only 'std' things we use are in tests for tweaking stack sizes. This is the first step in an effort to make 'regex' itself work without depending on 'std'. 'regex' itself will be more precarious since it uses things like HashMap and Mutex that we'll need to find a way around. Getting around HashMap is easy (just use BTreeMap), but figuring out how to synchronize the threadpool will be interesting. 
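For downstream users, the practical effect is that a core+alloc crate can now
depend on regex-syntax by disabling default features and opting back into the
Unicode data it needs (roughly `default-features = false, features =
["unicode"]` in its Cargo.toml) and then use the parser as usual. A hedged
sketch of such a consumer, with an illustrative function name:

    // lib.rs of a hypothetical alloc-only consumer crate.
    #![no_std]
    extern crate alloc;

    use regex_syntax::Parser;

    /// Returns true if the pattern parses and translates to an HIR.
    pub fn pattern_is_valid(pattern: &str) -> bool {
        Parser::new().parse(pattern).is_ok()
    }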
Ref #476, Ref #477 --- Cargo.toml | 2 +- regex-syntax/Cargo.toml | 3 +- regex-syntax/src/ast/mod.rs | 32 ++++++------ regex-syntax/src/ast/parse.rs | 57 ++++++++++++---------- regex-syntax/src/ast/print.rs | 14 ++++-- regex-syntax/src/ast/visitor.rs | 10 ++-- regex-syntax/src/error.rs | 19 +++++--- regex-syntax/src/hir/interval.rs | 8 ++- regex-syntax/src/hir/literal/mod.rs | 54 +++++++++++--------- regex-syntax/src/hir/mod.rs | 59 ++++++++++++---------- regex-syntax/src/hir/print.rs | 18 +++++-- regex-syntax/src/hir/translate.rs | 26 +++++----- regex-syntax/src/hir/visitor.rs | 2 + regex-syntax/src/lib.rs | 27 ++++++++-- regex-syntax/src/parser.rs | 5 +- regex-syntax/src/unicode.rs | 76 ++++++++++++++++------------- regex-syntax/src/utf8.rs | 11 ++--- regex-syntax/test | 1 + 18 files changed, 245 insertions(+), 179 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5e707ae79..1e6ec664d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ finite automata and guarantees linear time matching on all inputs. categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] -edition = "2018" +edition = "2021" rust-version = "1.60.0" [workspace] diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index 6c92717e3..e30de004b 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -14,7 +14,8 @@ rust-version = "1.60.0" # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features [features] -default = ["unicode"] +default = ["std", "unicode"] +std = [] unicode = [ "unicode-age", diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 7c9dae7a0..7329fabbe 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -2,8 +2,9 @@ Defines an abstract syntax for regular expressions. 
*/ -use std::cmp::Ordering; -use std::fmt; +use core::cmp::Ordering; + +use alloc::{boxed::Box, string::String, vec, vec::Vec}; pub use crate::ast::visitor::{visit, Visitor}; @@ -174,23 +175,24 @@ pub enum ErrorKind { UnsupportedLookAround, } +#[cfg(feature = "std")] impl std::error::Error for Error {} -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { crate::error::Formatter::from(self).fmt(f) } } -impl fmt::Display for ErrorKind { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for ErrorKind { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::ErrorKind::*; match *self { CaptureLimitExceeded => write!( f, "exceeded the maximum number of \ capturing groups ({})", - ::std::u32::MAX + u32::MAX ), ClassEscapeInvalid => { write!(f, "invalid escape sequence found in character class") @@ -283,8 +285,8 @@ pub struct Span { pub end: Position, } -impl fmt::Debug for Span { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for Span { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "Span({:?}, {:?})", self.start, self.end) } } @@ -316,8 +318,8 @@ pub struct Position { pub column: usize, } -impl fmt::Debug for Position { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for Position { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Position(o: {:?}, l: {:?}, c: {:?})", @@ -497,8 +499,8 @@ impl Ast { /// /// This implementation uses constant stack space and heap space proportional /// to the size of the `Ast`. -impl fmt::Display for Ast { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for Ast { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use crate::ast::print::Printer; Printer::new().print(self, f) } @@ -1315,7 +1317,7 @@ pub enum Flag { /// space but heap space proportional to the depth of the `Ast`. impl Drop for Ast { fn drop(&mut self) { - use std::mem; + use core::mem; match *self { Ast::Empty(_) @@ -1365,7 +1367,7 @@ impl Drop for Ast { /// stack space but heap space proportional to the depth of the `ClassSet`. impl Drop for ClassSet { fn drop(&mut self) { - use std::mem; + use core::mem; match *self { ClassSet::Item(ref item) => match *item { diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 6e9c9aca0..f730ee659 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2,17 +2,26 @@ This module provides a regular expression parser. */ -use std::borrow::Borrow; -use std::cell::{Cell, RefCell}; -use std::mem; -use std::result; - -use crate::ast::{self, Ast, Position, Span}; -use crate::either::Either; - -use crate::is_meta_character; - -type Result = result::Result; +use core::{ + borrow::Borrow, + cell::{Cell, RefCell}, + mem, +}; + +use alloc::{ + boxed::Box, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::{self, Ast, Position, Span}, + either::Either, + is_meta_character, +}; + +type Result = core::result::Result; /// A primitive is an expression with no sub-expressions. This includes /// literals, assertions and non-set character classes. This representation @@ -1533,9 +1542,6 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// Assuming the preconditions are met, this routine can never fail. 
#[inline(never)] fn parse_octal(&self) -> ast::Literal { - use std::char; - use std::u32; - assert!(self.parser().octal); assert!('0' <= self.char() && self.char() <= '7'); let start = self.pos(); @@ -1600,9 +1606,6 @@ impl<'s, P: Borrow> ParserI<'s, P> { &self, kind: ast::HexLiteralKind, ) -> Result { - use std::char; - use std::u32; - let mut scratch = self.parser().scratch.borrow_mut(); scratch.clear(); @@ -1646,9 +1649,6 @@ impl<'s, P: Borrow> ParserI<'s, P> { &self, kind: ast::HexLiteralKind, ) -> Result { - use std::char; - use std::u32; - let mut scratch = self.parser().scratch.borrow_mut(); scratch.clear(); @@ -2146,7 +2146,7 @@ impl<'p, 's, P: Borrow> NestLimiter<'p, 's, P> { let new = self.depth.checked_add(1).ok_or_else(|| { self.p.error( span.clone(), - ast::ErrorKind::NestLimitExceeded(::std::u32::MAX), + ast::ErrorKind::NestLimitExceeded(u32::MAX), ) })?; let limit = self.p.parser().nest_limit; @@ -2297,11 +2297,14 @@ fn specialize_err( #[cfg(test)] mod tests { - use std::ops::Range; + use core::ops::Range; + + use alloc::format; - use super::{Parser, ParserBuilder, ParserI, Primitive}; use crate::ast::{self, Ast, Position, Span}; + use super::*; + // Our own assert_eq, which has slightly better formatting (but honestly // still kind of crappy). macro_rules! assert_eq { @@ -4272,7 +4275,7 @@ bar Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::Octal, - c: ::std::char::from_u32(i).unwrap(), + c: char::from_u32(i).unwrap(), })) ); } @@ -4347,7 +4350,7 @@ bar Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), - c: ::std::char::from_u32(i).unwrap(), + c: char::from_u32(i).unwrap(), })) ); } @@ -4378,7 +4381,7 @@ bar #[test] fn parse_hex_four() { for i in 0..65536 { - let c = match ::std::char::from_u32(i) { + let c = match char::from_u32(i) { None => continue, Some(c) => c, }; @@ -4442,7 +4445,7 @@ bar #[test] fn parse_hex_eight() { for i in 0..65536 { - let c = match ::std::char::from_u32(i) { + let c = match char::from_u32(i) { None => continue, Some(c) => c, }; diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index f6b2462c0..e6c000d57 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -2,10 +2,13 @@ This module provides a regular expression printer for `Ast`. */ -use std::fmt; +use core::fmt; -use crate::ast::visitor::{self, Visitor}; -use crate::ast::{self, Ast}; +use crate::ast::{ + self, + visitor::{self, Visitor}, + Ast, +}; /// A builder for constructing a printer. /// @@ -395,9 +398,12 @@ impl Writer { #[cfg(test)] mod tests { - use super::Printer; + use alloc::string::String; + use crate::ast::parse::ParserBuilder; + use super::*; + fn roundtrip(given: &str) { roundtrip_with(|b| b, given); } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 78ee487cf..03f8bf963 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -1,4 +1,4 @@ -use std::fmt; +use alloc::{vec, vec::Vec}; use crate::ast::{self, Ast}; @@ -475,8 +475,8 @@ impl<'a> ClassInduct<'a> { } } -impl<'a> fmt::Debug for ClassFrame<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl<'a> core::fmt::Debug for ClassFrame<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let x = match *self { ClassFrame::Union { .. } => "Union", ClassFrame::Binary { .. 
} => "Binary", @@ -487,8 +487,8 @@ impl<'a> fmt::Debug for ClassFrame<'a> { } } -impl<'a> fmt::Debug for ClassInduct<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl<'a> core::fmt::Debug for ClassInduct<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let x = match *self { ClassInduct::Item(it) => match *it { ast::ClassSetItem::Empty(_) => "Item(Empty)", diff --git a/regex-syntax/src/error.rs b/regex-syntax/src/error.rs index 6e7fa7466..a10230a87 100644 --- a/regex-syntax/src/error.rs +++ b/regex-syntax/src/error.rs @@ -1,9 +1,13 @@ -use std::cmp; -use std::fmt; -use std::result; +use core::{cmp, fmt, result}; -use crate::ast; -use crate::hir; +use alloc::{ + format, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ast, hir}; /// A type alias for dealing with errors returned by this crate. pub type Result = result::Result; @@ -35,6 +39,7 @@ impl From for Error { } } +#[cfg(feature = "std")] impl std::error::Error for Error {} impl fmt::Display for Error { @@ -266,11 +271,13 @@ impl<'p> Spans<'p> { } fn repeat_char(c: char, count: usize) -> String { - ::std::iter::repeat(c).take(count).collect() + core::iter::repeat(c).take(count).collect() } #[cfg(test)] mod tests { + use alloc::string::ToString; + use crate::ast::parse::Parser; fn assert_panic_message(pattern: &str, expected_msg: &str) { diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index d6e83f7b2..fbe772ea4 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -1,8 +1,6 @@ -use std::char; -use std::cmp; -use std::fmt::Debug; -use std::slice; -use std::u8; +use core::{char, cmp, fmt::Debug, slice}; + +use alloc::vec::Vec; use crate::unicode; diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index 58b8871ed..d49cffd92 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -2,11 +2,15 @@ Provides routines for extracting literal prefixes and suffixes from an `Hir`. 
*/ -use std::cmp; -use std::fmt; -use std::iter; -use std::mem; -use std::ops; +use core::{cmp, iter, mem, ops}; + +use alloc::{ + boxed::Box, + format, + string::{String, ToString}, + vec, + vec::Vec, +}; use crate::hir::{self, Hir, HirKind}; @@ -408,7 +412,7 @@ impl Literals { } if self.lits.is_empty() { let i = cmp::min(self.limit_size, bytes.len()); - self.lits.push(Literal::new(bytes[..i].to_owned())); + self.lits.push(Literal::new(bytes[..i].to_vec())); self.lits[0].cut = i < bytes.len(); return !self.lits[0].is_cut(); } @@ -465,8 +469,6 @@ impl Literals { cls: &hir::ClassUnicode, reverse: bool, ) -> bool { - use std::char; - if self.class_exceeds_limits(cls_char_count(cls)) { return false; } @@ -837,8 +839,8 @@ fn alternate_literals( } } -impl fmt::Debug for Literals { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for Literals { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("Literals") .field("lits", &self.lits) .field("limit_size", &self.limit_size) @@ -881,8 +883,8 @@ impl PartialOrd for Literal { } } -impl fmt::Debug for Literal { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { if self.is_cut() { write!(f, "Cut({})", escape_unicode(&self.v)) } else { @@ -923,7 +925,7 @@ fn position(needle: &[u8], mut haystack: &[u8]) -> Option { } fn escape_unicode(bytes: &[u8]) -> String { - let show = match ::std::str::from_utf8(bytes) { + let show = match core::str::from_utf8(bytes) { Ok(v) => v.to_string(), Err(_) => escape_bytes(bytes), }; @@ -955,7 +957,7 @@ fn escape_bytes(bytes: &[u8]) -> String { } fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; + use core::ascii::escape_default; let escaped: Vec = escape_default(byte).collect(); String::from_utf8_lossy(&escaped).into_owned() @@ -971,11 +973,15 @@ fn cls_byte_count(cls: &hir::ClassBytes) -> usize { #[cfg(test)] mod tests { - use std::fmt; + use alloc::{ + string::{String, ToString}, + vec, + vec::Vec, + }; + + use crate::{hir::Hir, ParserBuilder}; - use super::{escape_bytes, Literal, Literals}; - use crate::hir::Hir; - use crate::ParserBuilder; + use super::*; // To make test failures easier to read. 
#[derive(Debug, Eq, PartialEq)] @@ -1013,8 +1019,8 @@ mod tests { } } - impl fmt::Debug for ULiteral { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + impl core::fmt::Debug for ULiteral { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { if self.is_cut() { write!(f, "Cut({})", self.v) } else { @@ -1037,11 +1043,11 @@ mod tests { #[allow(non_snake_case)] fn C(s: &'static str) -> ULiteral { - ULiteral { v: s.to_owned(), cut: true } + ULiteral { v: s.to_string(), cut: true } } #[allow(non_snake_case)] fn M(s: &'static str) -> ULiteral { - ULiteral { v: s.to_owned(), cut: false } + ULiteral { v: s.to_string(), cut: false } } fn prefixes(lits: &mut Literals, expr: &Hir) { @@ -1626,7 +1632,7 @@ mod tests { let given: Vec = $given .into_iter() .map(|s: &str| Literal { - v: s.to_owned().into_bytes(), + v: s.to_string().into_bytes(), cut: false, }) .collect(); @@ -1661,7 +1667,7 @@ mod tests { let given: Vec = $given .into_iter() .map(|s: &str| Literal { - v: s.to_owned().into_bytes(), + v: s.to_string().into_bytes(), cut: false, }) .collect(); diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index b16df20c8..2af769e92 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1,18 +1,27 @@ /*! Defines a high-level intermediate representation for regular expressions. */ -use std::char; -use std::cmp; -use std::fmt; -use std::result; -use std::u8; -use crate::ast::Span; -use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; -use crate::unicode; +use core::{char, cmp}; -pub use crate::hir::visitor::{visit, Visitor}; -pub use crate::unicode::CaseFoldError; +use alloc::{ + boxed::Box, + format, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::Span, + hir::interval::{Interval, IntervalSet, IntervalSetIter}, + unicode, +}; + +pub use crate::{ + hir::visitor::{visit, Visitor}, + unicode::CaseFoldError, +}; mod interval; pub mod literal; @@ -80,16 +89,17 @@ pub enum ErrorKind { UnicodeCaseUnavailable, } +#[cfg(feature = "std")] impl std::error::Error for Error {} -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { crate::error::Formatter::from(self).fmt(f) } } -impl fmt::Display for ErrorKind { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for ErrorKind { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::ErrorKind::*; let msg = match *self { @@ -197,8 +207,7 @@ impl Hir { /// Consumes ownership of this HIR expression and returns its underlying /// `HirKind`. pub fn into_kind(mut self) -> HirKind { - use std::mem; - mem::replace(&mut self.kind, HirKind::Empty) + core::mem::replace(&mut self.kind, HirKind::Empty) } /// Returns an empty HIR expression. @@ -704,8 +713,8 @@ impl HirKind { /// /// This implementation uses constant stack space and heap space proportional /// to the size of the `Hir`. -impl fmt::Display for Hir { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use crate::hir::print::Printer; Printer::new().print(self, f) } @@ -800,7 +809,7 @@ impl Class { /// Unicode oriented. 
pub fn try_case_fold_simple( &mut self, - ) -> result::Result<(), CaseFoldError> { + ) -> core::result::Result<(), CaseFoldError> { match *self { Class::Unicode(ref mut x) => x.try_case_fold_simple()?, Class::Bytes(ref mut x) => x.case_fold_simple(), @@ -909,7 +918,7 @@ impl ClassUnicode { /// `unicode-case` feature is not enabled. pub fn try_case_fold_simple( &mut self, - ) -> result::Result<(), CaseFoldError> { + ) -> core::result::Result<(), CaseFoldError> { self.set.case_fold_simple() } @@ -981,8 +990,8 @@ pub struct ClassUnicodeRange { end: char, } -impl fmt::Debug for ClassUnicodeRange { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for ClassUnicodeRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let start = if !self.start.is_whitespace() && !self.start.is_control() { self.start.to_string() @@ -1285,8 +1294,8 @@ impl ClassBytesRange { } } -impl fmt::Debug for ClassBytesRange { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for ClassBytesRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut debug = f.debug_struct("ClassBytesRange"); if self.start <= 0x7F { let ch = char::try_from(self.start).unwrap(); @@ -1459,7 +1468,7 @@ pub enum RepetitionRange { /// space but heap space proportional to the depth of the total `Hir`. impl Drop for Hir { fn drop(&mut self) { - use std::mem; + use core::mem; match *self.kind() { HirKind::Empty diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 433f9bf11..63d014b1b 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -2,11 +2,16 @@ This module provides a regular expression printer for `Hir`. */ -use std::fmt; +use core::fmt; -use crate::hir::visitor::{self, Visitor}; -use crate::hir::{self, Hir, HirKind}; -use crate::is_meta_character; +use crate::{ + hir::{ + self, + visitor::{self, Visitor}, + Hir, HirKind, + }, + is_meta_character, +}; /// A builder for constructing a printer. /// @@ -235,9 +240,12 @@ impl Writer { #[cfg(test)] mod tests { - use super::Printer; + use alloc::string::String; + use crate::ParserBuilder; + use super::*; + fn roundtrip(given: &str, expected: &str) { roundtrip_with(|b| b, given, expected); } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 988384ede..b4338bc94 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -2,14 +2,17 @@ Defines a translator that converts an `Ast` to an `Hir`. */ -use std::cell::{Cell, RefCell}; -use std::result; +use core::cell::{Cell, RefCell}; -use crate::ast::{self, Ast, Span, Visitor}; -use crate::hir::{self, Error, ErrorKind, Hir}; -use crate::unicode::{self, ClassQuery}; +use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; -type Result = result::Result; +use crate::{ + ast::{self, Ast, Span, Visitor}, + hir::{self, Error, ErrorKind, Hir}, + unicode::{self, ClassQuery}, +}; + +type Result = core::result::Result; /// A builder for constructing an AST->HIR translator. 
#[derive(Clone, Debug)] @@ -1119,12 +1122,13 @@ fn ascii_class_as_chars( #[cfg(test)] mod tests { - use crate::ast::parse::ParserBuilder; - use crate::ast::{self, Ast, Position, Span}; - use crate::hir::{self, Hir, HirKind}; - use crate::unicode::{self, ClassQuery}; + use crate::{ + ast::{self, parse::ParserBuilder, Ast, Position, Span}, + hir::{self, Hir, HirKind}, + unicode::{self, ClassQuery}, + }; - use super::{ascii_class, ascii_class_as_chars, TranslatorBuilder}; + use super::*; // We create these errors to compare with real hir::Errors in the tests. // We define equality between TestError and hir::Error to disregard the diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index 4f5a70909..97771d92f 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -1,3 +1,5 @@ +use alloc::{vec, vec::Vec}; + use crate::hir::{self, Hir, HirKind}; /// A trait for visiting the high-level IR (HIR) in depth first order. diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 1dfb38af3..287b3417c 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -116,6 +116,11 @@ match semantics of a regular expression. The following features are available: +* **std** - + Enables support for the standard library. This feature is enabled by default. + When disabled, only `core` and `alloc` are used. Otherwise, enabling `std` + generally just enables `std::error::Error` trait impls for the various error + types. * **unicode** - Enables all Unicode features. This feature is enabled by default, and will always cover all Unicode features, even if more are added in the future. @@ -154,13 +159,23 @@ The following features are available: `\p{sb=ATerm}`. */ +#![forbid(unsafe_code)] #![deny(missing_docs)] #![warn(missing_debug_implementations)] -#![forbid(unsafe_code)] +#![no_std] + +#[cfg(any(test, feature = "std"))] +extern crate std; -pub use crate::error::{Error, Result}; -pub use crate::parser::{Parser, ParserBuilder}; -pub use crate::unicode::UnicodeWordError; +extern crate alloc; + +pub use crate::{ + error::{Error, Result}, + parser::{Parser, ParserBuilder}, + unicode::UnicodeWordError, +}; + +use alloc::string::String; pub mod ast; mod either; @@ -248,7 +263,7 @@ pub fn is_word_character(c: char) -> bool { /// returns an error. pub fn try_is_word_character( c: char, -) -> std::result::Result { +) -> core::result::Result { unicode::is_word_character(c) } @@ -265,6 +280,8 @@ pub fn is_word_byte(c: u8) -> bool { #[cfg(test)] mod tests { + use alloc::string::ToString; + use super::*; #[test] diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index ded95b280..93df72279 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -1,7 +1,4 @@ -use crate::ast; -use crate::hir; - -use crate::Result; +use crate::{ast, hir, Result}; /// A builder for a regular expression parser. /// diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 0b716f5e6..1689681fa 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -1,11 +1,12 @@ -use std::error; -use std::fmt; -use std::result; +use alloc::{ + string::{String, ToString}, + vec::Vec, +}; use crate::hir; /// A type alias for errors specific to Unicode handling of classes. -pub type Result = result::Result; +pub type Result = core::result::Result; /// An inclusive range of codepoints from a generated file (hence the static /// lifetime). 
@@ -25,7 +26,7 @@ pub enum Error { } /// A type alias for errors specific to Unicode case folding. -pub type FoldResult = result::Result; +pub type FoldResult = core::result::Result; /// An error that occurs when Unicode-aware simple case folding fails. /// @@ -35,10 +36,11 @@ pub type FoldResult = result::Result; #[derive(Debug)] pub struct CaseFoldError(()); -impl error::Error for CaseFoldError {} +#[cfg(feature = "std")] +impl std::error::Error for CaseFoldError {} -impl fmt::Display for CaseFoldError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for CaseFoldError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Unicode-aware case folding is not available \ @@ -55,10 +57,11 @@ impl fmt::Display for CaseFoldError { #[derive(Debug)] pub struct UnicodeWordError(()); -impl error::Error for UnicodeWordError {} +#[cfg(feature = "std")] +impl std::error::Error for UnicodeWordError {} -impl fmt::Display for UnicodeWordError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for UnicodeWordError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Unicode-aware \\w class is not available \ @@ -80,21 +83,24 @@ impl fmt::Display for UnicodeWordError { /// This returns an error if the Unicode case folding tables are not available. pub fn simple_fold( c: char, -) -> FoldResult, Option>> { +) -> FoldResult, Option>> +{ #[cfg(not(feature = "unicode-case"))] fn imp( _: char, - ) -> FoldResult, Option>> - { - use std::option::IntoIter; - Err::, _>, _>(CaseFoldError(())) + ) -> FoldResult< + core::result::Result, Option>, + > { + use core::option::IntoIter; + Err::, _>, _>(CaseFoldError(())) } #[cfg(feature = "unicode-case")] fn imp( c: char, - ) -> FoldResult, Option>> - { + ) -> FoldResult< + core::result::Result, Option>, + > { use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; Ok(CASE_FOLDING_SIMPLE @@ -130,8 +136,9 @@ pub fn contains_simple_case_mapping( #[cfg(feature = "unicode-case")] fn imp(start: char, end: char) -> FoldResult { + use core::cmp::Ordering; + use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - use std::cmp::Ordering; assert!(start <= end); Ok(CASE_FOLDING_SIMPLE @@ -407,17 +414,17 @@ pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { /// Returns true only if the given codepoint is in the `\w` character class. /// /// If the `unicode-perl` feature is not enabled, then this returns an error. -pub fn is_word_character(c: char) -> result::Result { +pub fn is_word_character( + c: char, +) -> core::result::Result { #[cfg(not(feature = "unicode-perl"))] - fn imp(_: char) -> result::Result { + fn imp(_: char) -> core::result::Result { Err(UnicodeWordError(())) } #[cfg(feature = "unicode-perl")] - fn imp(c: char) -> result::Result { - use crate::is_word_byte; - use crate::unicode_tables::perl_word::PERL_WORD; - use std::cmp::Ordering; + fn imp(c: char) -> core::result::Result { + use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD}; // MSRV(1.59): Use 'u8::try_from(c)' instead. 
if u8::try_from(u32::from(c)).map_or(false, is_word_byte) { @@ -425,6 +432,8 @@ pub fn is_word_character(c: char) -> result::Result { } Ok(PERL_WORD .binary_search_by(|&(start, end)| { + use core::cmp::Ordering; + if start <= c && c <= end { Ordering::Equal } else if start > c { @@ -583,7 +592,7 @@ fn property_set( fn ages(canonical_age: &str) -> Result> { #[cfg(not(feature = "unicode-age"))] fn imp(_: &str) -> Result> { - use std::option::IntoIter; + use core::option::IntoIter; Err::, _>(Error::PropertyNotFound) } @@ -884,10 +893,7 @@ fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { #[cfg(test)] mod tests { - use super::{ - contains_simple_case_mapping, simple_fold, symbolic_name_normalize, - symbolic_name_normalize_bytes, - }; + use super::*; #[cfg(feature = "unicode-case")] fn simple_fold_ok(c: char) -> impl Iterator { @@ -911,23 +917,23 @@ mod tests { #[cfg(feature = "unicode-case")] fn simple_fold_k() { let xs: Vec = simple_fold_ok('k').collect(); - assert_eq!(xs, vec!['K', 'K']); + assert_eq!(xs, alloc::vec!['K', 'K']); let xs: Vec = simple_fold_ok('K').collect(); - assert_eq!(xs, vec!['k', 'K']); + assert_eq!(xs, alloc::vec!['k', 'K']); let xs: Vec = simple_fold_ok('K').collect(); - assert_eq!(xs, vec!['K', 'k']); + assert_eq!(xs, alloc::vec!['K', 'k']); } #[test] #[cfg(feature = "unicode-case")] fn simple_fold_a() { let xs: Vec = simple_fold_ok('a').collect(); - assert_eq!(xs, vec!['A']); + assert_eq!(xs, alloc::vec!['A']); let xs: Vec = simple_fold_ok('A').collect(); - assert_eq!(xs, vec!['a']); + assert_eq!(xs, alloc::vec!['a']); } #[test] diff --git a/regex-syntax/src/utf8.rs b/regex-syntax/src/utf8.rs index b00cd7dba..a75a8afa8 100644 --- a/regex-syntax/src/utf8.rs +++ b/regex-syntax/src/utf8.rs @@ -80,12 +80,9 @@ I also got the idea from which uses it for executing automata on their term index. */ -#![deny(missing_docs)] +use core::{char, fmt, iter::FusedIterator, slice}; -use std::char; -use std::fmt; -use std::iter::FusedIterator; -use std::slice; +use alloc::{vec, vec::Vec}; const MAX_UTF8_BYTES: usize = 4; @@ -457,7 +454,9 @@ fn max_scalar_value(nbytes: usize) -> u32 { #[cfg(test)] mod tests { - use std::char; + use core::char; + + use alloc::{vec, vec::Vec}; use crate::utf8::{Utf8Range, Utf8Sequences}; diff --git a/regex-syntax/test b/regex-syntax/test index 4b1b9fb1a..d03db94b4 100755 --- a/regex-syntax/test +++ b/regex-syntax/test @@ -7,6 +7,7 @@ echo "===== DEFAULT FEATURES ===" cargo test features=( + std unicode unicode-age unicode-bool From 5d9746d30139dd7d53197c0251abd6c92ae3abdd Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 27 Aug 2022 20:00:18 -0400 Subject: [PATCH 15/79] syntax: enable 'doc_auto_cfg' I wish this feature were stable and enabled by default. I suspect that it maybe doesn't work correctly 100% of the time, but it's super useful. And manually annotating APIs is a huge pain, so it's worth at least attempting. --- regex-syntax/Cargo.toml | 13 +++++++++++++ regex-syntax/src/lib.rs | 1 + 2 files changed, 14 insertions(+) diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index e30de004b..6ae9b8485 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -33,3 +33,16 @@ unicode-gencat = [] unicode-perl = [] unicode-script = [] unicode-segment = [] + +[package.metadata.docs.rs] +# We want to document all features. 
+all-features = true +# Since this crate's feature setup is pretty complicated, it is worth opting +# into a nightly unstable option to show the features that need to be enabled +# for public API items. To do that, we set 'docsrs', and when that's enabled, +# we enable the 'doc_auto_cfg' feature. +# +# To test this locally, run: +# +# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features +rustdoc-args = ["--cfg", "docsrs"] diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 287b3417c..34c259d31 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -162,6 +162,7 @@ The following features are available: #![forbid(unsafe_code)] #![deny(missing_docs)] #![warn(missing_debug_implementations)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] #![no_std] #[cfg(any(test, feature = "std"))] From 7bd2d9aab063427876c0a88234ba4fb3d6b654e9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 27 Aug 2022 20:15:43 -0400 Subject: [PATCH 16/79] syntax: switch to rustdoc intra links Get rid of those old crusty HTML links! Also, if an intradoc link is used that is bunk, fail the build. --- regex-syntax/src/ast/parse.rs | 6 ++--- regex-syntax/src/ast/visitor.rs | 32 +++++++++++-------------- regex-syntax/src/hir/literal/mod.rs | 10 ++++---- regex-syntax/src/hir/mod.rs | 23 +++++++++--------- regex-syntax/src/hir/translate.rs | 2 +- regex-syntax/src/hir/visitor.rs | 5 ++-- regex-syntax/src/lib.rs | 36 ++++++++++++++--------------- regex-syntax/src/parser.rs | 15 +++++------- regex-syntax/src/utf8.rs | 2 +- 9 files changed, 59 insertions(+), 72 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index f730ee659..48a0507e2 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -229,8 +229,7 @@ impl ParserBuilder { /// abstract syntax tree. The size of the tree is proportional to the length /// of the regular expression pattern. /// -/// A `Parser` can be configured in more detail via a -/// [`ParserBuilder`](struct.ParserBuilder.html). +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. #[derive(Clone, Debug)] pub struct Parser { /// The current position of the parser. @@ -336,8 +335,7 @@ impl Parser { /// The parser can be run with either the `parse` or `parse_with_comments` /// methods. The parse methods return an abstract syntax tree. /// - /// To set configuration options on the parser, use - /// [`ParserBuilder`](struct.ParserBuilder.html). + /// To set configuration options on the parser, use [`ParserBuilder`]. pub fn new() -> Parser { ParserBuilder::new().build() } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 03f8bf963..ab136739e 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -11,15 +11,12 @@ use crate::ast::{self, Ast}; /// may be proportional to end user input. /// /// Typical usage of this trait involves providing an implementation and then -/// running it using the [`visit`](fn.visit.html) function. +/// running it using the [`visit`] function. /// /// Note that the abstract syntax tree for a regular expression is quite -/// complex. Unless you specifically need it, you might be able to use the -/// much simpler -/// [high-level intermediate representation](../hir/struct.Hir.html) -/// and its -/// [corresponding `Visitor` trait](../hir/trait.Visitor.html) -/// instead. +/// complex. 
Unless you specifically need it, you might be able to use the much +/// simpler [high-level intermediate representation](crate::hir::Hir) and its +/// [corresponding `Visitor` trait](crate::hir::Visitor) instead. pub trait Visitor { /// The result of visiting an AST. type Output; @@ -46,13 +43,12 @@ pub trait Visitor { } /// This method is called between child nodes of an - /// [`Alternation`](struct.Alternation.html). + /// [`Alternation`](ast::Alternation). fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { Ok(()) } - /// This method is called on every - /// [`ClassSetItem`](enum.ClassSetItem.html) + /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) /// before descending into child nodes. fn visit_class_set_item_pre( &mut self, @@ -61,8 +57,7 @@ pub trait Visitor { Ok(()) } - /// This method is called on every - /// [`ClassSetItem`](enum.ClassSetItem.html) + /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) /// after descending into child nodes. fn visit_class_set_item_post( &mut self, @@ -72,8 +67,8 @@ pub trait Visitor { } /// This method is called on every - /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) - /// before descending into child nodes. + /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) before descending into + /// child nodes. fn visit_class_set_binary_op_pre( &mut self, _ast: &ast::ClassSetBinaryOp, @@ -82,8 +77,8 @@ pub trait Visitor { } /// This method is called on every - /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) - /// after descending into child nodes. + /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) after descending into child + /// nodes. fn visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, @@ -92,7 +87,7 @@ pub trait Visitor { } /// This method is called between the left hand and right hand child nodes - /// of a [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html). + /// of a [`ClassSetBinaryOp`](ast::ClassSetBinaryOp). fn visit_class_set_binary_op_in( &mut self, _ast: &ast::ClassSetBinaryOp, @@ -104,8 +99,7 @@ pub trait Visitor { /// Executes an implementation of `Visitor` in constant stack space. /// /// This function will visit every node in the given `Ast` while calling the -/// appropriate methods provided by the -/// [`Visitor`](trait.Visitor.html) trait. +/// appropriate methods provided by the [`Visitor`] trait. /// /// The primary use case for this method is when one wants to perform case /// analysis over an `Ast` without using a stack size proportional to the depth diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index d49cffd92..85453fa11 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -30,11 +30,11 @@ use crate::hir::{self, Hir, HirKind}; /// bounded to some low number by default using heuristics, but the limits can /// be tweaked. /// -/// **WARNING**: Literal extraction uses stack space proportional to the size -/// of the `Hir` expression. At some point, this drawback will be eliminated. -/// To protect yourself, set a reasonable -/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). -/// This is done for you by default. +/// **WARNING**: Literal extraction uses stack space proportional to the +/// size of the `Hir` expression. At some point, this drawback will be +/// eliminated. To protect yourself, set a reasonable [`nest_limit` on your +/// `Parser`](crate::ParserBuilder::nest_limit). This is done for you by +/// default. 
#[derive(Clone, Eq, PartialEq)] pub struct Literals { lits: Vec, diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 2af769e92..d27297292 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -681,8 +681,8 @@ impl HirKind { /// Return true if and only if this HIR is the empty regular expression. /// /// Note that this is not defined inductively. That is, it only tests if - /// this kind is the `Empty` variant. To get the inductive definition, - /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). + /// this kind is the `Empty` variant. To get the inductive definition, use + /// the `is_match_empty` method on [`Hir`]. pub fn is_empty(&self) -> bool { match *self { HirKind::Empty => true, @@ -756,12 +756,12 @@ impl Literal { /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// -/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may -/// be produced even when it exclusively matches valid UTF-8. This is because -/// a `Bytes` variant represents an intention by the author of the regular -/// expression to disable Unicode mode, which in turn impacts the semantics of -/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not -/// match the same set of strings. +/// Note that unlike [`Literal`], a `Bytes` variant may be produced even when +/// it exclusively matches valid UTF-8. This is because a `Bytes` variant +/// represents an intention by the author of the regular expression to disable +/// Unicode mode, which in turn impacts the semantics of case insensitive +/// matching. For example, `(?i)k` and `(?i-u)k` will not match the same set of +/// strings. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Class { /// A set of characters represented by Unicode scalar values. @@ -1424,10 +1424,9 @@ impl Repetition { /// /// Note that this is not defined inductively. For example, while `a*` /// will report `true`, `()+` will not, even though `()` matches the empty - /// string and one or more occurrences of something that matches the empty - /// string will always match the empty string. In order to get the - /// inductive definition, see the corresponding method on - /// [`Hir`](struct.Hir.html). + /// string and one or more occurrences of something that matches the + /// empty string will always match the empty string. In order to get the + /// inductive definition, see the corresponding method on [`Hir`]. pub fn is_match_empty(&self) -> bool { match self.kind { RepetitionKind::ZeroOrOne => true, diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index b4338bc94..44a6a8309 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -103,7 +103,7 @@ impl TranslatorBuilder { /// many abstract syntax trees. /// /// A `Translator` can be configured in more detail via a -/// [`TranslatorBuilder`](struct.TranslatorBuilder.html). +/// [`TranslatorBuilder`]. #[derive(Clone, Debug)] pub struct Translator { /// Our call stack, but on the heap. diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index 97771d92f..0012d5697 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -11,7 +11,7 @@ use crate::hir::{self, Hir, HirKind}; /// important since the size of an HIR may be proportional to end user input. 
/// /// Typical usage of this trait involves providing an implementation and then -/// running it using the [`visit`](fn.visit.html) function. +/// running it using the [`visit`] function. pub trait Visitor { /// The result of visiting an HIR. type Output; @@ -46,8 +46,7 @@ pub trait Visitor { /// Executes an implementation of `Visitor` in constant stack space. /// /// This function will visit every node in the given `Hir` while calling -/// appropriate methods provided by the -/// [`Visitor`](trait.Visitor.html) trait. +/// appropriate methods provided by the [`Visitor`] trait. /// /// The primary use case for this method is when one wants to perform case /// analysis over an `Hir` without using a stack size proportional to the depth diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 34c259d31..f87360f48 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -3,14 +3,14 @@ This crate provides a robust regular expression parser. This crate defines two primary types: -* [`Ast`](ast/enum.Ast.html) is the abstract syntax of a regular expression. +* [`Ast`](ast::Ast) is the abstract syntax of a regular expression. An abstract syntax corresponds to a *structured representation* of the concrete syntax of a regular expression, where the concrete syntax is the pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it can be converted back to the original concrete syntax (modulo some details, like whitespace). To a first approximation, the abstract syntax is complex and difficult to analyze. -* [`Hir`](hir/struct.Hir.html) is the high-level intermediate representation +* [`Hir`](hir::Hir) is the high-level intermediate representation ("HIR" or "high-level IR" for short) of regular expression. It corresponds to an intermediate state of a regular expression that sits between the abstract syntax and the low level compiled opcodes that are eventually responsible for @@ -22,14 +22,14 @@ This crate defines two primary types: These two types come with conversion routines: -* An [`ast::parse::Parser`](ast/parse/struct.Parser.html) converts concrete - syntax (a `&str`) to an [`Ast`](ast/enum.Ast.html). -* A [`hir::translate::Translator`](hir/translate/struct.Translator.html) - converts an [`Ast`](ast/enum.Ast.html) to a [`Hir`](hir/struct.Hir.html). +* An [`ast::parse::Parser`] converts concrete syntax (a `&str`) to an +[`Ast`](ast::Ast). +* A [`hir::translate::Translator`] converts an [`Ast`](ast::Ast) to a +[`Hir`](hir::Hir). As a convenience, the above two conversion routines are combined into one via -the top-level [`Parser`](struct.Parser.html) type. This `Parser` will first -convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. +the top-level [`Parser`] type. This `Parser` will first convert your pattern to +an `Ast` and then convert the `Ast` to an `Hir`. # Example @@ -81,10 +81,10 @@ in a monospace font. # Literal extraction -This crate provides limited support for -[literal extraction from `Hir` values](hir/literal/struct.Literals.html). -Be warned that literal extraction currently uses recursion, and therefore, -stack size proportional to the size of the `Hir`. +This crate provides limited support for [literal extraction from `Hir` +values](hir::literal::Literals). Be warned that literal extraction currently +uses recursion, and therefore, stack size proportional to the size of the +`Hir`. The purpose of literal extraction is to speed up searches. 
That is, if you know a regular expression must match a prefix or suffix literal, then it is @@ -159,11 +159,12 @@ The following features are available: `\p{sb=ATerm}`. */ +#![no_std] #![forbid(unsafe_code)] -#![deny(missing_docs)] +#![deny(missing_docs, rustdoc::broken_intra_doc_links)] +#![doc(test(attr(deny(warnings))))] #![warn(missing_debug_implementations)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] -#![no_std] #[cfg(any(test, feature = "std"))] extern crate std; @@ -240,10 +241,9 @@ pub fn is_meta_character(c: char) -> bool { /// /// # Panics /// -/// If the `unicode-perl` feature is not enabled, then this function panics. -/// For this reason, it is recommended that callers use -/// [`try_is_word_character`](fn.try_is_word_character.html) -/// instead. +/// If the `unicode-perl` feature is not enabled, then this function +/// panics. For this reason, it is recommended that callers use +/// [`try_is_word_character`] instead. pub fn is_word_character(c: char) -> bool { try_is_word_character(c).expect("unicode-perl feature must be enabled") } diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 93df72279..509ce3e15 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -4,10 +4,9 @@ use crate::{ast, hir, Result}; /// /// This builder permits modifying configuration options for the parser. /// -/// This type combines the builder options for both the -/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) -/// and the -/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). +/// This type combines the builder options for both the [AST +/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR +/// `TranslatorBuilder`](hir::translate::TranslatorBuilder). #[derive(Clone, Debug, Default)] pub struct ParserBuilder { ast: ast::parse::ParserBuilder, @@ -164,10 +163,9 @@ impl ParserBuilder { /// convenience for never having to deal with it at all. /// /// If callers have more fine grained use cases that need an AST, then please -/// see the [`ast::parse`](ast/parse/index.html) module. +/// see the [`ast::parse`] module. /// -/// A `Parser` can be configured in more detail via a -/// [`ParserBuilder`](struct.ParserBuilder.html). +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. #[derive(Clone, Debug)] pub struct Parser { ast: ast::parse::Parser, @@ -181,8 +179,7 @@ impl Parser { /// a high level intermediate representation of the given regular /// expression. /// - /// To set configuration options on the parser, use - /// [`ParserBuilder`](struct.ParserBuilder.html). + /// To set configuration options on the parser, use [`ParserBuilder`]. pub fn new() -> Parser { ParserBuilder::new().build() } diff --git a/regex-syntax/src/utf8.rs b/regex-syntax/src/utf8.rs index a75a8afa8..e13b55abf 100644 --- a/regex-syntax/src/utf8.rs +++ b/regex-syntax/src/utf8.rs @@ -3,7 +3,7 @@ Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. This is sub-module is useful for constructing byte based automatons that need to embed UTF-8 decoding. The most common use of this module is in conjunction -with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type. +with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type. See the documentation on the `Utf8Sequences` iterator for more details and an example. 
From 52d5393661ddc448957c41badebae1652afa86d6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 28 Aug 2022 12:53:12 -0400 Subject: [PATCH 17/79] syntax: simplify hir::GroupKind I'm not sure exactly why I used three variants instead of two like how I've defined it in this patch. Possibly because the AST uses three variants? (The AST needs to do a little more work to store a span associated with where the name actually is in the expression, so it maybe makes a little more sense there.) In any case, this is the first step of many in simplifying the HIR. --- regex-syntax/src/hir/mod.rs | 14 ++++++-------- regex-syntax/src/hir/print.rs | 8 ++++---- regex-syntax/src/hir/translate.rs | 20 +++++++++----------- src/compile.rs | 16 ++++++---------- 4 files changed, 25 insertions(+), 33 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index d27297292..5ff9eb0b2 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1383,19 +1383,17 @@ pub struct Group { /// The kind of group. #[derive(Clone, Debug, Eq, PartialEq)] pub enum GroupKind { - /// A normal unnamed capturing group. + /// A non-capturing group. + NonCapturing, + /// A capturing group with an optional name. /// /// The value is the capture index of the group. - CaptureIndex(u32), - /// A named capturing group. - CaptureName { - /// The name of the group. - name: String, + Capture { /// The capture index of the group. index: u32, + /// The name of the group, if it exists. + name: Option, }, - /// A non-capturing group. - NonCapturing, } /// The high-level intermediate representation of a repetition operator. diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 63d014b1b..460d0de62 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -150,11 +150,11 @@ impl Visitor for Writer { self.wtr.write_str(r"(?-u:\B)")?; } HirKind::Group(ref x) => match x.kind { - hir::GroupKind::CaptureIndex(_) => { + hir::GroupKind::Capture { ref name, .. } => { self.wtr.write_str("(")?; - } - hir::GroupKind::CaptureName { ref name, .. 
} => { - write!(self.wtr, "(?P<{}>", name)?; + if let Some(ref name) = *name { + write!(self.wtr, "?P<{}>", name)?; + } } hir::GroupKind::NonCapturing => { self.wtr.write_str("(?:")?; diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 44a6a8309..be3ffd102 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -763,15 +763,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { let kind = match group.kind { - ast::GroupKind::CaptureIndex(idx) => { - hir::GroupKind::CaptureIndex(idx) - } - ast::GroupKind::CaptureName(ref capname) => { - hir::GroupKind::CaptureName { - name: capname.name.clone(), - index: capname.index, - } + ast::GroupKind::CaptureIndex(index) => { + hir::GroupKind::Capture { index, name: None } } + ast::GroupKind::CaptureName(ref cap) => hir::GroupKind::Capture { + index: cap.index, + name: Some(cap.name.clone()), + }, ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, }; Hir::group(hir::Group { kind, hir: Box::new(expr) }) @@ -1211,16 +1209,16 @@ mod tests { fn hir_group(i: u32, expr: Hir) -> Hir { Hir::group(hir::Group { - kind: hir::GroupKind::CaptureIndex(i), + kind: hir::GroupKind::Capture { index: i, name: None }, hir: Box::new(expr), }) } fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { Hir::group(hir::Group { - kind: hir::GroupKind::CaptureName { - name: name.to_string(), + kind: hir::GroupKind::Capture { index: i, + name: Some(name.to_string()), }, hir: Box::new(expr), }) diff --git a/src/compile.rs b/src/compile.rs index 361ea4cb7..34ea3765e 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -362,17 +362,13 @@ impl Compiler { } Group(ref g) => match g.kind { hir::GroupKind::NonCapturing => self.c(&g.hir), - hir::GroupKind::CaptureIndex(index) => { + hir::GroupKind::Capture { index, ref name } => { if index as usize >= self.compiled.captures.len() { - self.compiled.captures.push(None); - } - self.c_capture(2 * index as usize, &g.hir) - } - hir::GroupKind::CaptureName { index, ref name } => { - if index as usize >= self.compiled.captures.len() { - let n = name.to_string(); - self.compiled.captures.push(Some(n.clone())); - self.capture_name_idx.insert(n, index as usize); + self.compiled.captures.push(name.clone()); + if let Some(ref name) = *name { + self.capture_name_idx + .insert(name.clone(), index as usize); + } } self.c_capture(2 * index as usize, &g.hir) } From 00ea5711fd083454750e0247756dfd11abda0240 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 28 Aug 2022 12:59:21 -0400 Subject: [PATCH 18/79] syntax: remove WordBoundary::is_negated method This is apparently not used anywhere. So drop it. Also motivated by wanting to squash look-around assertions into a single enum. So 'is_negated' won't make sense on its own anymore. --- regex-syntax/src/hir/mod.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 5ff9eb0b2..2e2d7bdf3 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1353,16 +1353,6 @@ pub enum WordBoundary { AsciiNegate, } -impl WordBoundary { - /// Returns true if and only if this word boundary assertion is negated. - pub fn is_negated(&self) -> bool { - match *self { - WordBoundary::Unicode | WordBoundary::Ascii => false, - WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, - } - } -} - /// The high-level intermediate representation for a group. 
/// /// This represents one of three possible group types: From 1f707e7bc4be16e359fc2fd04f4f98e0e1ad154f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 28 Aug 2022 14:38:57 -0400 Subject: [PATCH 19/79] syntax: flatten look-around assertions Instead of having both 'HirKind::Anchor' and 'HirKind::WordBoundary', this patch flattens them into one 'hirKind::Look'. Why do this? I think they make more sense grouped together. Namely, they are all simplistic look-around assertions and they all tend to be handled with very similar logic. --- regex-syntax/src/hir/literal/mod.rs | 4 +- regex-syntax/src/hir/mod.rs | 110 +++++++++-------------- regex-syntax/src/hir/print.rs | 53 ++++++------ regex-syntax/src/hir/translate.rs | 77 +++++++--------- src/compile.rs | 130 ++++++++++++++-------------- 5 files changed, 166 insertions(+), 208 deletions(-) diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index 85453fa11..cd15cc836 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -627,7 +627,7 @@ fn prefixes(expr: &Hir, lits: &mut Literals) { HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), HirKind::Concat(ref es) => { for e in es { - if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { + if let HirKind::Look(hir::Look::Start) = *e.kind() { if !lits.is_empty() { lits.cut(); break; @@ -703,7 +703,7 @@ fn suffixes(expr: &Hir, lits: &mut Literals) { HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), HirKind::Concat(ref es) => { for e in es.iter().rev() { - if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { + if let HirKind::Look(hir::Look::End) = *e.kind() { if !lits.is_empty() { lits.cut(); break; diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 2e2d7bdf3..a9b0214f2 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -175,11 +175,8 @@ pub enum HirKind { /// class. A class can either consist of Unicode scalar values as /// characters, or it can use bytes. Class(Class), - /// An anchor assertion. An anchor assertion match always has zero length. - Anchor(Anchor), - /// A word boundary assertion, which may or may not be Unicode aware. A - /// word boundary assertion match always has zero length. - WordBoundary(WordBoundary), + /// A look-around assertion. A look-around match always has zero length. + Look(Look), /// A repetition operation applied to a child expression. Repetition(Repetition), /// A possibly capturing group, which contains a child expression. @@ -271,8 +268,8 @@ impl Hir { Hir { kind: HirKind::Class(class), info } } - /// Creates an anchor assertion HIR expression. - pub fn anchor(anchor: Anchor) -> Hir { + /// Creates a look-around assertion HIR expression. + pub fn look(look: Look) -> Hir { let mut info = HirInfo::new(); info.set_always_utf8(true); info.set_all_assertions(true); @@ -282,53 +279,34 @@ impl Hir { info.set_line_anchored_end(false); info.set_any_anchored_start(false); info.set_any_anchored_end(false); + // All look-around assertions always produce zero-length or "empty" + // matches. This is true even though not all of them (like \b) match + // the empty string itself. That is, '\b' does not match ''. But it + // does match the empty string between '!' and 'a' in '!a'. 
info.set_match_empty(true); info.set_literal(false); info.set_alternation_literal(false); - if let Anchor::StartText = anchor { + if let Look::Start = look { info.set_anchored_start(true); info.set_line_anchored_start(true); info.set_any_anchored_start(true); } - if let Anchor::EndText = anchor { + if let Look::End = look { info.set_anchored_end(true); info.set_line_anchored_end(true); info.set_any_anchored_end(true); } - if let Anchor::StartLine = anchor { + if let Look::StartLF = look { info.set_line_anchored_start(true); } - if let Anchor::EndLine = anchor { + if let Look::EndLF = look { info.set_line_anchored_end(true); } - Hir { kind: HirKind::Anchor(anchor), info } - } - - /// Creates a word boundary assertion HIR expression. - pub fn word_boundary(word_boundary: WordBoundary) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_literal(false); - info.set_alternation_literal(false); - // A negated word boundary matches '', so that's fine. But \b does not - // match \b, so why do we say it can match the empty string? Well, - // because, if you search for \b against 'a', it will report [0, 0) and - // [1, 1) as matches, and both of those matches correspond to the empty - // string. Thus, only *certain* empty strings match \b, which similarly - // applies to \B. - info.set_match_empty(true); - // Negated ASCII word boundaries can match invalid UTF-8. - if let WordBoundary::AsciiNegate = word_boundary { + if let Look::WordAsciiNegate = look { + // Negated ASCII word boundaries can match invalid UTF-8. info.set_always_utf8(false); } - Hir { kind: HirKind::WordBoundary(word_boundary), info } + Hir { kind: HirKind::Look(look), info } } /// Creates a repetition HIR expression. @@ -697,8 +675,7 @@ impl HirKind { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) => false, + | HirKind::Look(_) => false, HirKind::Group(_) | HirKind::Repetition(_) | HirKind::Concat(_) @@ -1313,44 +1290,37 @@ impl core::fmt::Debug for ClassBytesRange { } } -/// The high-level intermediate representation for an anchor assertion. +/// The high-level intermediate representation for a look-around assertion. /// -/// A matching anchor assertion is always zero-length. +/// An assertion match is always zero-length. Also called an "empty match." #[derive(Clone, Debug, Eq, PartialEq)] -pub enum Anchor { - /// Match the beginning of a line or the beginning of text. Specifically, - /// this matches at the starting position of the input, or at the position - /// immediately following a `\n` character. - StartLine, - /// Match the end of a line or the end of text. Specifically, - /// this matches at the end position of the input, or at the position - /// immediately preceding a `\n` character. - EndLine, +pub enum Look { /// Match the beginning of text. Specifically, this matches at the starting /// position of the input. - StartText, + Start, /// Match the end of text. Specifically, this matches at the ending /// position of the input. - EndText, -} - -/// The high-level intermediate representation for a word-boundary assertion. -/// -/// A matching word boundary assertion is always zero-length. 
-#[derive(Clone, Debug, Eq, PartialEq)] -pub enum WordBoundary { - /// Match a Unicode-aware word boundary. That is, this matches a position - /// where the left adjacent character and right adjacent character - /// correspond to a word and non-word or a non-word and word character. - Unicode, - /// Match a Unicode-aware negation of a word boundary. - UnicodeNegate, + End, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF, /// Match an ASCII-only word boundary. That is, this matches a position /// where the left adjacent character and right adjacent character /// correspond to a word and non-word or a non-word and word character. - Ascii, + WordAscii, /// Match an ASCII-only negation of a word boundary. - AsciiNegate, + WordAsciiNegate, + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordUnicode, + /// Match a Unicode-aware negation of a word boundary. + WordUnicodeNegate, } /// The high-level intermediate representation for a group. @@ -1461,8 +1431,7 @@ impl Drop for Hir { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) => return, + | HirKind::Look(_) => return, HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, HirKind::Concat(ref x) if x.is_empty() => return, @@ -1476,8 +1445,7 @@ impl Drop for Hir { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) => {} + | HirKind::Look(_) => {} HirKind::Group(ref mut x) => { stack.push(mem::replace(&mut x.hir, Hir::empty())); } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 460d0de62..633edf9d8 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -125,30 +125,32 @@ impl Visitor for Writer { } self.wtr.write_str("])")?; } - HirKind::Anchor(hir::Anchor::StartLine) => { - self.wtr.write_str("(?m:^)")?; - } - HirKind::Anchor(hir::Anchor::EndLine) => { - self.wtr.write_str("(?m:$)")?; - } - HirKind::Anchor(hir::Anchor::StartText) => { - self.wtr.write_str(r"\A")?; - } - HirKind::Anchor(hir::Anchor::EndText) => { - self.wtr.write_str(r"\z")?; - } - HirKind::WordBoundary(hir::WordBoundary::Unicode) => { - self.wtr.write_str(r"\b")?; - } - HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => { - self.wtr.write_str(r"\B")?; - } - HirKind::WordBoundary(hir::WordBoundary::Ascii) => { - self.wtr.write_str(r"(?-u:\b)")?; - } - HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => { - self.wtr.write_str(r"(?-u:\B)")?; - } + HirKind::Look(ref look) => match *look { + hir::Look::Start => { + self.wtr.write_str(r"\A")?; + } + hir::Look::End => { + self.wtr.write_str(r"\z")?; + } + hir::Look::StartLF => { + self.wtr.write_str("(?m:^)")?; + } + hir::Look::EndLF => { + self.wtr.write_str("(?m:$)")?; + } + hir::Look::WordAscii => { + self.wtr.write_str(r"(?-u:\b)")?; + } + hir::Look::WordAsciiNegate => { + self.wtr.write_str(r"(?-u:\B)")?; + } + hir::Look::WordUnicode => { + 
self.wtr.write_str(r"\b")?; + } + hir::Look::WordUnicodeNegate => { + self.wtr.write_str(r"\B")?; + } + }, HirKind::Group(ref x) => match x.kind { hir::GroupKind::Capture { ref name, .. } => { self.wtr.write_str("(")?; @@ -170,8 +172,7 @@ impl Visitor for Writer { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) + | HirKind::Look(_) | HirKind::Concat(_) | HirKind::Alternation(_) => {} HirKind::Repetition(ref x) => { diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index be3ffd102..0985276ad 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -722,30 +722,26 @@ impl<'t, 'p> TranslatorI<'t, 'p> { let unicode = self.flags().unicode(); let multi_line = self.flags().multi_line(); Ok(match asst.kind { - ast::AssertionKind::StartLine => Hir::anchor(if multi_line { - hir::Anchor::StartLine + ast::AssertionKind::StartLine => Hir::look(if multi_line { + hir::Look::StartLF } else { - hir::Anchor::StartText + hir::Look::Start }), - ast::AssertionKind::EndLine => Hir::anchor(if multi_line { - hir::Anchor::EndLine + ast::AssertionKind::EndLine => Hir::look(if multi_line { + hir::Look::EndLF } else { - hir::Anchor::EndText + hir::Look::End + }), + ast::AssertionKind::StartText => Hir::look(hir::Look::Start), + ast::AssertionKind::EndText => Hir::look(hir::Look::End), + ast::AssertionKind::WordBoundary => Hir::look(if unicode { + hir::Look::WordUnicode + } else { + hir::Look::WordAscii }), - ast::AssertionKind::StartText => { - Hir::anchor(hir::Anchor::StartText) - } - ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText), - ast::AssertionKind::WordBoundary => { - Hir::word_boundary(if unicode { - hir::WordBoundary::Unicode - } else { - hir::WordBoundary::Ascii - }) - } ast::AssertionKind::NotWordBoundary => { - Hir::word_boundary(if unicode { - hir::WordBoundary::UnicodeNegate + Hir::look(if unicode { + hir::Look::WordUnicodeNegate } else { // It is possible for negated ASCII word boundaries to // match at invalid UTF-8 boundaries, even when searching @@ -755,7 +751,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { self.error(asst.span, ErrorKind::InvalidUtf8) ); } - hir::WordBoundary::AsciiNegate + hir::Look::WordAsciiNegate }) } }) @@ -1364,12 +1360,8 @@ mod tests { } } - fn hir_anchor(anchor: hir::Anchor) -> Hir { - Hir::anchor(anchor) - } - - fn hir_word(wb: hir::WordBoundary) -> Hir { - Hir::word_boundary(wb) + fn hir_look(look: hir::Look) -> Hir { + Hir::look(look) } #[test] @@ -1563,22 +1555,19 @@ mod tests { #[test] fn assertions() { - assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText)); - assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText)); - assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine)); - assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine)); - assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText)); + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); - assert_eq!(t(r"\b"), 
hir_word(hir::WordBoundary::Unicode)); - assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate)); - assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii)); - assert_eq!( - t_bytes(r"(?-u)\B"), - hir_word(hir::WordBoundary::AsciiNegate) - ); + assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); + assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); + assert_eq!(t_bytes(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); assert_eq!( t_err(r"(?-u)\B"), @@ -1693,7 +1682,7 @@ mod tests { t("(?im)a^"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartLine), + hir_look(hir::Look::StartLF), ]) ); #[cfg(feature = "unicode-case")] @@ -1701,9 +1690,9 @@ mod tests { t("(?im)a^(?i-m)a^"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartLine), + hir_look(hir::Look::StartLF), hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartText), + hir_look(hir::Look::Start), ]) ); assert_eq!( diff --git a/src/compile.rs b/src/compile.rs index 34ea3765e..ab0d6372b 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -291,75 +291,75 @@ impl Compiler { self.c_class(&char_ranges) } } - Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::EndText) - } - Anchor(hir::Anchor::StartText) => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) => { - self.c_empty_look(prog::EmptyLook::EndText) - } - WordBoundary(hir::WordBoundary::Unicode) => { - if !cfg!(feature = "unicode-perl") { - return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ + Look(ref look) => match *look { + hir::Look::Start if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) + } + hir::Look::Start => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText), + hir::Look::StartLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::WordAscii => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + hir::Look::WordAsciiNegate => { + self.byte_classes.set_word_boundary(); + 
self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } + hir::Look::WordUnicode => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ the unicode-perl feature is disabled" - .to_string(), - )); + .to_string(), + )); + } + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. This in turn may cause the lazy DFA to falsely start + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::WordBoundary) } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // We also make sure that all ASCII bytes are in a different - // class from non-ASCII bytes. Otherwise, it's possible for - // ASCII bytes to get lumped into the same class as non-ASCII - // bytes. This in turn may cause the lazy DFA to falsely start - // when it sees an ASCII byte that maps to a byte class with - // non-ASCII bytes. This ensures that never happens. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::WordBoundary) - } - WordBoundary(hir::WordBoundary::UnicodeNegate) => { - if !cfg!(feature = "unicode-perl") { - return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ + hir::Look::WordUnicodeNegate => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ the unicode-perl feature is disabled" - .to_string(), - )); + .to_string(), + )); + } + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::NotWordBoundary) } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // See comments above for why we set the ASCII range here. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::NotWordBoundary) - } - WordBoundary(hir::WordBoundary::Ascii) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) - } - WordBoundary(hir::WordBoundary::AsciiNegate) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) - } + }, Group(ref g) => match g.kind { hir::GroupKind::NonCapturing => self.c(&g.hir), hir::GroupKind::Capture { index, ref name } => { From aa0c117542dd38540f263e05e913bea2f0af36e9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 28 Aug 2022 18:55:23 -0400 Subject: [PATCH 20/79] syntax: simplify hir::Repetition This greatly simplifies how repetitions are represented in the HIR from a sprawling set of variants down to just a simple `(u32, Option)`. This is much simpler and still permits us to specialize all of the cases we did before if necessary. This also simplifies some of the HIR printer's output. e.g., 'a{1}' is just 'a'. 
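To make the new shape concrete, here is a small sketch of how the usual repetition operators land in the `(min, max)` representation. It mirrors the `hir_range` test helper added in this patch and assumes nothing beyond the `Repetition { min, max, greedy, hir }` struct as written below:

    use regex_syntax::hir::{self, Hir};

    // a?     => min: 0, max: Some(1)
    // a*     => min: 0, max: None
    // a+     => min: 1, max: None
    // a{5}   => min: 5, max: Some(5)
    // a{5,}  => min: 5, max: None
    // a{5,9} => min: 5, max: Some(9)
    fn rep(min: u32, max: Option<u32>, expr: Hir) -> Hir {
        Hir::repetition(hir::Repetition {
            min,
            max,
            greedy: true,
            hir: Box::new(expr),
        })
    }
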
--- regex-syntax/src/hir/literal/mod.rs | 48 ++++------ regex-syntax/src/hir/mod.rs | 53 ++++------- regex-syntax/src/hir/print.rs | 135 +++++++++++++++++++++++----- regex-syntax/src/hir/translate.rs | 77 ++++++---------- src/compile.rs | 22 +++-- 5 files changed, 188 insertions(+), 147 deletions(-) diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index cd15cc836..b32a02db7 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -602,26 +602,19 @@ fn prefixes(expr: &Hir, lits: &mut Literals) { HirKind::Group(hir::Group { ref hir, .. }) => { prefixes(&**hir, lits); } - HirKind::Repetition(ref x) => match x.kind { - hir::RepetitionKind::ZeroOrOne => { + HirKind::Repetition(ref x) => match (x.min, x.max) { + (0, Some(1)) => { repeat_zero_or_one_literals(&x.hir, lits, prefixes); } - hir::RepetitionKind::ZeroOrMore => { + (0, None) => { repeat_zero_or_more_literals(&x.hir, lits, prefixes); } - hir::RepetitionKind::OneOrMore => { + (1, None) => { repeat_one_or_more_literals(&x.hir, lits, prefixes); } - hir::RepetitionKind::Range(ref rng) => { - let (min, max) = match *rng { - hir::RepetitionRange::Exactly(m) => (m, Some(m)), - hir::RepetitionRange::AtLeast(m) => (m, None), - hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), - }; - repeat_range_literals( - &x.hir, min, max, x.greedy, lits, prefixes, - ) - } + (min, max) => repeat_range_literals( + &x.hir, min, max, x.greedy, lits, prefixes, + ), }, HirKind::Concat(ref es) if es.is_empty() => {} HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), @@ -678,26 +671,19 @@ fn suffixes(expr: &Hir, lits: &mut Literals) { HirKind::Group(hir::Group { ref hir, .. }) => { suffixes(&**hir, lits); } - HirKind::Repetition(ref x) => match x.kind { - hir::RepetitionKind::ZeroOrOne => { + HirKind::Repetition(ref x) => match (x.min, x.max) { + (0, Some(1)) => { repeat_zero_or_one_literals(&x.hir, lits, suffixes); } - hir::RepetitionKind::ZeroOrMore => { + (0, None) => { repeat_zero_or_more_literals(&x.hir, lits, suffixes); } - hir::RepetitionKind::OneOrMore => { + (1, None) => { repeat_one_or_more_literals(&x.hir, lits, suffixes); } - hir::RepetitionKind::Range(ref rng) => { - let (min, max) = match *rng { - hir::RepetitionRange::Exactly(m) => (m, Some(m)), - hir::RepetitionRange::AtLeast(m) => (m, None), - hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), - }; - repeat_range_literals( - &x.hir, min, max, x.greedy, lits, suffixes, - ) - } + (min, max) => repeat_range_literals( + &x.hir, min, max, x.greedy, lits, suffixes, + ), }, HirKind::Concat(ref es) if es.is_empty() => {} HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), @@ -736,7 +722,8 @@ fn repeat_zero_or_one_literals( ) { f( &Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + min: 0, + max: None, // FIXME: Our literal extraction doesn't care about greediness. // Which is partially why we're treating 'e?' as 'e*'. Namely, // 'ab??' yields [Complete(ab), Complete(a)], but it should yield @@ -794,7 +781,8 @@ fn repeat_range_literals( // just treat it as `e*`. f( &Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + min: 0, + max: None, greedy, hir: Box::new(e.clone()), }), diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index a9b0214f2..9d41f312f 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1362,8 +1362,18 @@ pub enum GroupKind { /// sub-expression. 
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct Repetition {
-    /// The kind of this repetition operator.
-    pub kind: RepetitionKind,
+    /// The minimum range of the repetition.
+    ///
+    /// Note that special cases like `?`, `+` and `*` all get translated into
+    /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
+    pub min: u32,
+    /// The maximum range of the repetition.
+    ///
+    /// Note that when `max` is `None`, `min` acts as a lower bound but where
+    /// there is no upper bound. For something like `x{5}` where the min and
+    /// max are equivalent, `min` will be set to `5` and `max` will be set to
+    /// `Some(5)`.
+    pub max: Option<u32>,
     /// Whether this repetition operator is greedy or not. A greedy operator
     /// will match as much as it can. A non-greedy operator will match as
     /// little as it can.
@@ -1385,42 +1395,14 @@ impl Repetition {
     /// string and one or more occurrences of something that matches the
     /// empty string will always match the empty string. In order to get the
     /// inductive definition, see the corresponding method on [`Hir`].
+    ///
+    /// This returns true in precisely the cases that [`Repetition::min`]
+    /// is equal to `0`.
     pub fn is_match_empty(&self) -> bool {
-        match self.kind {
-            RepetitionKind::ZeroOrOne => true,
-            RepetitionKind::ZeroOrMore => true,
-            RepetitionKind::OneOrMore => false,
-            RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
-            RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
-            RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
-        }
+        self.min == 0
     }
 }
 
-/// The kind of a repetition operator.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum RepetitionKind {
-    /// Matches a sub-expression zero or one times.
-    ZeroOrOne,
-    /// Matches a sub-expression zero or more times.
-    ZeroOrMore,
-    /// Matches a sub-expression one or more times.
-    OneOrMore,
-    /// Matches a sub-expression within a bounded range of times.
-    Range(RepetitionRange),
-}
-
-/// The kind of a counted repetition operator.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum RepetitionRange {
-    /// Matches a sub-expression exactly this many times.
-    Exactly(u32),
-    /// Matches a sub-expression at least this many times.
-    AtLeast(u32),
-    /// Matches a sub-expression at least `m` times and at most `n` times.
-    Bounded(u32, u32),
-}
-
 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
 /// space but heap space proportional to the depth of the total `Hir`.
impl Drop for Hir { @@ -2257,7 +2239,8 @@ mod tests { hir: Box::new(expr), }); expr = Hir::repetition(Repetition { - kind: RepetitionKind::ZeroOrOne, + min: 0, + max: Some(1), greedy: true, hir: Box::new(expr), }); diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 633edf9d8..9d7b2f70e 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -176,27 +176,31 @@ impl Visitor for Writer { | HirKind::Concat(_) | HirKind::Alternation(_) => {} HirKind::Repetition(ref x) => { - match x.kind { - hir::RepetitionKind::ZeroOrOne => { + match (x.min, x.max) { + (0, Some(1)) => { self.wtr.write_str("?")?; } - hir::RepetitionKind::ZeroOrMore => { + (0, None) => { self.wtr.write_str("*")?; } - hir::RepetitionKind::OneOrMore => { + (1, None) => { self.wtr.write_str("+")?; } - hir::RepetitionKind::Range(ref x) => match *x { - hir::RepetitionRange::Exactly(m) => { - write!(self.wtr, "{{{}}}", m)?; - } - hir::RepetitionRange::AtLeast(m) => { - write!(self.wtr, "{{{},}}", m)?; - } - hir::RepetitionRange::Bounded(m, n) => { - write!(self.wtr, "{{{},{}}}", m, n)?; - } - }, + (1, Some(1)) => { + // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. + return Ok(()); + } + (m, None) => { + write!(self.wtr, "{{{},}}", m)?; + } + (m, Some(n)) if m == n => { + write!(self.wtr, "{{{}}}", m)?; + // a{m} and a{m}? are always exactly equivalent. + return Ok(()); + } + (m, Some(n)) => { + write!(self.wtr, "{{{},{}}}", m, n)?; + } } if !x.greedy { self.wtr.write_str("?")?; @@ -241,7 +245,10 @@ impl Writer { #[cfg(test)] mod tests { - use alloc::string::String; + use alloc::{ + boxed::Box, + string::{String, ToString}, + }; use crate::ParserBuilder; @@ -338,14 +345,17 @@ mod tests { roundtrip("a+?", "a+?"); roundtrip("(?U)a+", "a+?"); - roundtrip("a{1}", "a{1}"); - roundtrip("a{1,}", "a{1,}"); + roundtrip("a{1}", "a"); + roundtrip("a{2}", "a{2}"); + roundtrip("a{1,}", "a+"); roundtrip("a{1,5}", "a{1,5}"); - roundtrip("a{1}?", "a{1}?"); - roundtrip("a{1,}?", "a{1,}?"); + roundtrip("a{1}?", "a"); + roundtrip("a{2}?", "a{2}"); + roundtrip("a{1,}?", "a+?"); roundtrip("a{1,5}?", "a{1,5}?"); - roundtrip("(?U)a{1}", "a{1}?"); - roundtrip("(?U)a{1,}", "a{1,}?"); + roundtrip("(?U)a{1}", "a"); + roundtrip("(?U)a{2}", "a{2}"); + roundtrip("(?U)a{1,}", "a+?"); roundtrip("(?U)a{1,5}", "a{1,5}?"); } @@ -371,4 +381,85 @@ mod tests { roundtrip("a|b|c", "a|b|c"); roundtrip("foo|bar|quux", "foo|bar|quux"); } + + // This is a regression test that stresses a peculiarity of how the HIR + // is both constructed and printed. Namely, it is legal for a repetition + // to directly contain a concatenation. This particular construct isn't + // really possible to build from the concrete syntax directly, since you'd + // be forced to put the concatenation into (at least) a non-capturing + // group. Concurrently, the printer doesn't consider this case and just + // kind of naively prints the child expression and tacks on the repetition + // operator. + // + // As a result, if you attached '+' to a 'concat(a, b)', the printer gives + // you 'ab+', but clearly it really should be '(?:ab)+'. + // + // This bug isn't easy to surface because most ways of building an HIR + // come directly from the concrete syntax, and as mentioned above, it just + // isn't possible to build this kind of HIR from the concrete syntax. + // Nevertheless, this is definitely a bug. 
+ // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal(hir::Literal::Unicode('x')), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + hir: Box::new(Hir::concat(alloc::vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), + ])), + }), + Hir::literal(hir::Literal::Unicode('y')), + ]); + assert_eq!(r"x(?:ab)+y", expr.to_string()); + } + + // Just like regression_repetition_concat, but with the repetition using + // an alternation as a child expression instead. + // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_alternation() { + let expr = Hir::concat(alloc::vec![ + Hir::literal(hir::Literal::Unicode('x')), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + hir: Box::new(Hir::alternation(alloc::vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::literal(hir::Literal::Unicode('b')), + ])), + }), + Hir::literal(hir::Literal::Unicode('y')), + ]); + assert_eq!(r"x(?:a|b)+y", expr.to_string()); + } + + // This regression test is very similar in flavor to + // regression_repetition_concat in that the root of the issue lies in a + // peculiarity of how the HIR is represented and how the printer writes it + // out. Like the other regression, this one is also rooted in the fact that + // you can't produce the peculiar HIR from the concrete syntax. Namely, you + // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally + // be in (at least) a non-capturing group. Why? Because the '|' has very + // low precedence (lower that concatenation), and so something like 'ab|c' + // is actually 'alt(ab, c)'. 
+ // + // See: https://github.com/rust-lang/regex/issues/516 + #[test] + fn regression_alternation_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal(hir::Literal::Unicode('a')), + Hir::alternation(alloc::vec![ + Hir::literal(hir::Literal::Unicode('b')), + Hir::literal(hir::Literal::Unicode('c')), + ]), + ]); + assert_eq!(r"a(?:b|c)", expr.to_string()); + } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 0985276ad..021a53db7 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -772,26 +772,29 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { - let kind = match rep.op.kind { - ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne, - ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore, - ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore, + let (min, max) = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => (0, Some(1)), + ast::RepetitionKind::ZeroOrMore => (0, None), + ast::RepetitionKind::OneOrMore => (1, None), ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { - hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m)) + (m, Some(m)) } ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { - hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m)) + (m, None) } ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( m, n, - )) => { - hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n)) - } + )) => (m, Some(n)), }; let greedy = if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; - Hir::repetition(hir::Repetition { kind, greedy, hir: Box::new(expr) }) + Hir::repetition(hir::Repetition { + min, + max, + greedy, + hir: Box::new(expr), + }) } fn hir_unicode_class( @@ -1229,7 +1232,8 @@ mod tests { fn hir_quest(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrOne, + min: 0, + max: Some(1), greedy, hir: Box::new(expr), }) @@ -1237,7 +1241,8 @@ mod tests { fn hir_star(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + min: 0, + max: None, greedy, hir: Box::new(expr), }) @@ -1245,15 +1250,17 @@ mod tests { fn hir_plus(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::OneOrMore, + min: 1, + max: None, greedy, hir: Box::new(expr), }) } - fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir { + fn hir_range(greedy: bool, min: u32, max: Option, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::Range(range), + min, + max, greedy, hir: Box::new(expr), }) @@ -1745,34 +1752,12 @@ mod tests { assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); - assert_eq!( - t("a{1}"), - hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,}"), - hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,2}"), - hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),) - ); - assert_eq!( - t("a{1}?"), - hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,}?"), - hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,2}?"), - hir_range( - false, - hir::RepetitionRange::Bounded(1, 2), - hir_lit("a"), - ) - ); + assert_eq!(t("a{1}"), 
hir_range(true, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),)); + assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),)); assert_eq!( t("ab?"), @@ -2916,11 +2901,7 @@ mod tests { , # comment 10 # comment } # comment"), - hir_range( - true, - hir::RepetitionRange::Bounded(5, 10), - hir_lit("a") - ) + hir_range(true, 5, Some(10), hir_lit("a")) ); assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); diff --git a/src/compile.rs b/src/compile.rs index ab0d6372b..c7ace466e 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -418,14 +418,16 @@ impl Compiler { fn c_dotstar(&mut self) -> Result { Ok(if !self.compiled.only_utf8() { self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + min: 0, + max: None, greedy: false, hir: Box::new(Hir::any(true)), }))? .unwrap() } else { self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + min: 0, + max: None, greedy: false, hir: Box::new(Hir::any(false)), }))? @@ -591,18 +593,14 @@ impl Compiler { } fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { - use regex_syntax::hir::RepetitionKind::*; - match rep.kind { - ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), - ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), - OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), - Range(hir::RepetitionRange::Exactly(min_max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) - } - Range(hir::RepetitionRange::AtLeast(min)) => { + match (rep.min, rep.max) { + (0, Some(1)) => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), + (0, None) => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), + (1, None) => self.c_repeat_one_or_more(&rep.hir, rep.greedy), + (min, None) => { self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) } - Range(hir::RepetitionRange::Bounded(min, max)) => { + (min, Some(max)) => { self.c_repeat_range(&rep.hir, rep.greedy, min, max) } } From 6e59f32b261ac749cef779982df01686dfbaa65b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 29 Aug 2022 13:04:08 -0400 Subject: [PATCH 21/79] syntax: fix HIR printer This fixes some corner cases in the HIR printer where it would print the concrete syntax of a regex that does not match the natural interpretation of the HIR. One such example of this is: concat(a, alt(b, c)) This would get printed as ab|c But clearly, it should be printed as: a(?:b|c) The issue here is that the printer only considers the current HirKind when determining how to print it. Sometimes a group is needed to print an alt (and even a concat, in the case of 'rep(+, concat(a, b))'), but sometimes it isn't. We could address this in a few different ways: 1) Always print concats and alts inside a non-capturing group. 2) Make the printer detect precisely the cases where a non-capturing group is needed. 3) Make the HIR smart constructors insert non-capturing groups when needed. 4) Do some other thing to change the HIR to prevent these sorts of things by construction. This patch goes with (1). The reason in favor of it is that HIR printer was always about printing an equivalent regex and never about trying to print a "nice" regex. 
Indeed, the HIR printer can't print a nice regex, because the HIR represents a rigorously simplifed view of a regex to make analysis easier. (The most obvious such example are Unicode character classes. For example, the HIR printer never prints '\w'.) So inserting some extra groups (which it already does) even when they aren't strictly needed is perfectly okay. But still, it's useful to say why we didn't do the other choices: 2) Modifying the printer to only print groups when they're actually needed is pretty difficult. I tried this briefly, and handling this case requires some notion of what the parent expression is. This winds up being a possible but hairy change. 3) Making the HIR more complicated to make the printer correct seems like it's optimizing for the wrong thing. Inserting extra groups in places just obfuscates HIR values that already have clear semantics. That is, use concat(a, alt(b, c)) over concat(a, group(alt(b, c))). 4) It's not clear how we would change the HIR to guarantee this sort of thing wouldn't happen. At the very least, it seems likely it would require a more complex data type. At first, I had thought (1) seemed inelegant. But the more I thought about it, the more it seemed quite consistent with how the HIR printer already worked. So that's the path I took here. Closes #516, Closes #731 --- regex-syntax/src/hir/print.rs | 49 +++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 9d7b2f70e..f905f78fc 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -89,10 +89,9 @@ impl Visitor for Writer { fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { match *hir.kind() { - HirKind::Empty - | HirKind::Repetition(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => {} + // Empty is represented by nothing in the concrete syntax, and + // repetition operators are strictly suffix oriented. + HirKind::Empty | HirKind::Repetition(_) => {} HirKind::Literal(hir::Literal::Unicode(c)) => { self.write_literal_char(c)?; } @@ -162,6 +161,22 @@ impl Visitor for Writer { self.wtr.write_str("(?:")?; } }, + // Why do this? Wrapping concats and alts in non-capturing groups + // is not *always* necessary, but is sometimes necessary. For + // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' + // and not 'ab|c'. The former is clearly the intended meaning, but + // the latter is actually 'alt(concat(a, b), c)'. + // + // It would be possible to only group these things in cases where + // it's strictly necessary, but it requires knowing the parent + // expression. And since this technique is simpler and always + // correct, we take this route. More to the point, it is a non-goal + // of an HIR printer to show a nice easy-to-read regex. Indeed, + // its construction forbids it from doing so. Therefore, inserting + // extra groups where they aren't necessary is perfectly okay. 
+ HirKind::Concat(_) | HirKind::Alternation(_) => { + self.wtr.write_str(r"(?:")?; + } } Ok(()) } @@ -172,9 +187,7 @@ impl Visitor for Writer { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Look(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => {} + | HirKind::Look(_) => {} HirKind::Repetition(ref x) => { match (x.min, x.max) { (0, Some(1)) => { @@ -206,8 +219,10 @@ impl Visitor for Writer { self.wtr.write_str("?")?; } } - HirKind::Group(_) => { - self.wtr.write_str(")")?; + HirKind::Group(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => { + self.wtr.write_str(r")")?; } } Ok(()) @@ -374,12 +389,12 @@ mod tests { #[test] fn print_alternation() { - roundtrip("|", "|"); - roundtrip("||", "||"); + roundtrip("|", "(?:|)"); + roundtrip("||", "(?:||)"); - roundtrip("a|b", "a|b"); - roundtrip("a|b|c", "a|b|c"); - roundtrip("foo|bar|quux", "foo|bar|quux"); + roundtrip("a|b", "(?:a|b)"); + roundtrip("a|b|c", "(?:a|b|c)"); + roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); } // This is a regression test that stresses a peculiarity of how the HIR @@ -415,7 +430,7 @@ mod tests { }), Hir::literal(hir::Literal::Unicode('y')), ]); - assert_eq!(r"x(?:ab)+y", expr.to_string()); + assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); } // Just like regression_repetition_concat, but with the repetition using @@ -437,7 +452,7 @@ mod tests { }), Hir::literal(hir::Literal::Unicode('y')), ]); - assert_eq!(r"x(?:a|b)+y", expr.to_string()); + assert_eq!(r"(?:x(?:a|b)+y)", expr.to_string()); } // This regression test is very similar in flavor to @@ -460,6 +475,6 @@ mod tests { Hir::literal(hir::Literal::Unicode('c')), ]), ]); - assert_eq!(r"a(?:b|c)", expr.to_string()); + assert_eq!(r"(?:a(?:b|c))", expr.to_string()); } } From 62802facf917a17bffae648afe851543ee295d98 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 29 Aug 2022 20:11:07 -0400 Subject: [PATCH 22/79] syntax: 'a{0}' should compile to Hir::empty No matter what 'a' is, 'a{0}' is always equivalent to an empty regex. --- regex-syntax/src/hir/mod.rs | 6 ++++++ regex-syntax/src/hir/print.rs | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 9d41f312f..26f649d8a 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -311,6 +311,12 @@ impl Hir { /// Creates a repetition HIR expression. pub fn repetition(rep: Repetition) -> Hir { + // The regex 'a{0}' is always equivalent to the empty regex. This is + // true even when 'a' is an expression that never matches anything + // (like '\P{any}'). + if rep.min == 0 && rep.max == Some(0) { + return Hir::empty(); + } let mut info = HirInfo::new(); info.set_always_utf8(rep.hir.is_always_utf8()); info.set_all_assertions(rep.hir.is_all_assertions()); diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index f905f78fc..94f25a2fd 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -372,6 +372,17 @@ mod tests { roundtrip("(?U)a{2}", "a{2}"); roundtrip("(?U)a{1,}", "a+?"); roundtrip("(?U)a{1,5}", "a{1,5}?"); + + // Test that various zero-length repetitions always translate to an + // empty regex. This is more a property of HIR's smart constructors + // than the printer though. 
+ roundtrip("a{0}", ""); + roundtrip("(?:ab){0}", ""); + #[cfg(feature = "unicode-gencat")] + { + roundtrip(r"\p{any}{0}", ""); + roundtrip(r"\P{any}{0}", ""); + } } #[test] From 9c2b01e9c965f32e3961dbf8a0fa60e51ffa502e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 1 Sep 2022 13:46:07 -0400 Subject: [PATCH 23/79] syntax: switch to 'Vec' to represent literals This gets rid of the old 'Literal' type: enum Literal { Unicode(char), Byte(u8), } and replaces it with struct Literal(Box<[u8]>); I did this primarily because I perceive the new version to be a bit simpler and is very likely to be more space efficient given some of the changes I have in mind (upcoming in subsequent commits). Namely, I want to include more analysis information beyond just simply booleans, and this means using up more space. Putting that analysis information on every single byte/char seems gratuitous. But putting it on every single sequence of byte/chars seems more justifiable. I also have a hand-wavy idea that this might make analysis a bit easier. And another hand-wavy idea that debug-printing such an HIR will make it a bit more comprehensible. Overall, this isn't a completely obvious win and I do wonder whether I'll regret this. For one thing, the translator is now a fair bit more complicated in exchange for not creating a 'Vec' for every 'ast::Literal' node. This also gives up the Unicode vs byte distinct and just commits to "all bytes." Instead, we do a UTF-8 check on every 'Hir::literal' call, and that in turn sets the UTF-8 property. This does seem a bit wasteful, and indeed, we do another UTF-8 check in the compiler (even though we could use 'unsafe' correctly and avoid it). However, once the new NFA compiler lands from regex-automata, it operates purely in byte-land and will not need to do another UTF-8 check. Moreover, a UTF-8 check, even on every literal, is likely barely measureable in the grand scheme of things. I do also worry that this is overwrought. In particular, the AST creates a node for each character. Then the HIR smooths them out to sequences of characters (that is, Vec). And then NFA compilation splits them back out into states where a state handles at most one character (or range of characters). But, I am taking somewhat of a leap-of-judgment here that this will make analysis easier and will overall use less space. But we'll see. 
--- regex-syntax/src/hir/literal/mod.rs | 21 +- regex-syntax/src/hir/mod.rs | 49 ++--- regex-syntax/src/hir/print.rs | 96 +++++++-- regex-syntax/src/hir/translate.rs | 319 +++++++++++++++++++--------- regex-syntax/src/lib.rs | 6 +- src/compile.rs | 58 ++++- src/exec.rs | 18 +- 7 files changed, 380 insertions(+), 187 deletions(-) diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs index b32a02db7..d127063bc 100644 --- a/regex-syntax/src/hir/literal/mod.rs +++ b/regex-syntax/src/hir/literal/mod.rs @@ -582,12 +582,8 @@ impl Literals { fn prefixes(expr: &Hir, lits: &mut Literals) { match *expr.kind() { - HirKind::Literal(hir::Literal::Unicode(c)) => { - let mut buf = [0; 4]; - lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); - } - HirKind::Literal(hir::Literal::Byte(b)) => { - lits.cross_add(&[b]); + HirKind::Literal(hir::Literal(ref bytes)) => { + lits.cross_add(bytes); } HirKind::Class(hir::Class::Unicode(ref cls)) => { if !lits.add_char_class(cls) { @@ -648,15 +644,10 @@ fn prefixes(expr: &Hir, lits: &mut Literals) { fn suffixes(expr: &Hir, lits: &mut Literals) { match *expr.kind() { - HirKind::Literal(hir::Literal::Unicode(c)) => { - let mut buf = [0u8; 4]; - let i = c.encode_utf8(&mut buf).len(); - let buf = &mut buf[..i]; - buf.reverse(); - lits.cross_add(buf); - } - HirKind::Literal(hir::Literal::Byte(b)) => { - lits.cross_add(&[b]); + HirKind::Literal(hir::Literal(ref bytes)) => { + let mut bytes = bytes.to_vec(); + bytes.reverse(); + lits.cross_add(&bytes); } HirKind::Class(hir::Class::Unicode(ref cls)) => { if !lits.add_char_class_reverse(cls) { diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 26f649d8a..379deb955 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -169,7 +169,7 @@ pub enum HirKind { /// The empty regular expression, which matches everything, including the /// empty string. Empty, - /// A single literal character that matches exactly this character. + /// A literalstring that matches exactly these bytes. Literal(Literal), /// A single character class that matches any of the characters in the /// class. A class can either consist of Unicode scalar values as @@ -231,13 +231,14 @@ impl Hir { /// If the given literal has a `Byte` variant with an ASCII byte, then this /// method panics. This enforces the invariant that `Byte` variants are /// only used to express matching of invalid UTF-8. - pub fn literal(lit: Literal) -> Hir { - if let Literal::Byte(b) = lit { - assert!(b > 0x7F); + pub fn literal>>(lit: B) -> Hir { + let bytes = lit.into(); + if bytes.is_empty() { + return Hir::empty(); } let mut info = HirInfo::new(); - info.set_always_utf8(lit.is_unicode()); + info.set_always_utf8(core::str::from_utf8(&bytes).is_ok()); info.set_all_assertions(false); info.set_anchored_start(false); info.set_anchored_end(false); @@ -248,7 +249,7 @@ impl Hir { info.set_match_empty(false); info.set_literal(true); info.set_alternation_literal(true); - Hir { kind: HirKind::Literal(lit), info } + Hir { kind: HirKind::Literal(Literal(bytes)), info } } /// Creates a class HIR expression. @@ -710,24 +711,7 @@ impl core::fmt::Display for Hir { /// are preferred whenever possible. In particular, a `Byte` variant is only /// ever produced when it could match invalid UTF-8. #[derive(Clone, Debug, Eq, PartialEq)] -pub enum Literal { - /// A single character represented by a Unicode scalar value. - Unicode(char), - /// A single character represented by an arbitrary byte. 
- Byte(u8), -} - -impl Literal { - /// Returns true if and only if this literal corresponds to a Unicode - /// scalar value. - pub fn is_unicode(&self) -> bool { - match *self { - Literal::Unicode(_) => true, - Literal::Byte(b) if b <= 0x7F => true, - Literal::Byte(_) => false, - } - } -} +pub struct Literal(pub Box<[u8]>); /// The high-level intermediate representation of a character class. /// @@ -739,12 +723,11 @@ impl Literal { /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// -/// Note that unlike [`Literal`], a `Bytes` variant may be produced even when -/// it exclusively matches valid UTF-8. This is because a `Bytes` variant -/// represents an intention by the author of the regular expression to disable -/// Unicode mode, which in turn impacts the semantics of case insensitive -/// matching. For example, `(?i)k` and `(?i-u)k` will not match the same set of -/// strings. +/// Note that `Bytes` variant may be produced even when it exclusively matches +/// valid UTF-8. This is because a `Bytes` variant represents an intention by +/// the author of the regular expression to disable Unicode mode, which in turn +/// impacts the semantics of case insensitive matching. For example, `(?i)k` +/// and `(?i-u)k` will not match the same set of strings. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Class { /// A set of characters represented by Unicode scalar values. @@ -2222,12 +2205,6 @@ mod tests { assert_eq!(expected, bsymdifference(&cls1, &cls2)); } - #[test] - #[should_panic] - fn hir_byte_literal_non_ascii() { - Hir::literal(Literal::Byte(b'a')); - } - // We use a thread with an explicit stack size to test that our destructor // for Hir can handle arbitrarily sized expressions in constant stack // space. In case we run on a platform without threads (WASM?), we limit diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 94f25a2fd..7f861151f 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -92,11 +92,37 @@ impl Visitor for Writer { // Empty is represented by nothing in the concrete syntax, and // repetition operators are strictly suffix oriented. HirKind::Empty | HirKind::Repetition(_) => {} - HirKind::Literal(hir::Literal::Unicode(c)) => { - self.write_literal_char(c)?; - } - HirKind::Literal(hir::Literal::Byte(b)) => { - self.write_literal_byte(b)?; + HirKind::Literal(hir::Literal(ref bytes)) => { + // See the comment on the 'Concat' and 'Alternation' case below + // for why we put parens here. Literals are, conceptually, + // a special case of concatenation where each element is a + // character. The HIR flattens this into a Box<[u8]>, but we + // still need to treat it like a concatenation for correct + // printing. As a special case, we don't write parens if there + // is only one character. One character means there is no + // concat so we don't need parens. Adding parens would still be + // correct, but we drop them here because it tends to create + // rather noisy regexes even in simple cases. 
+ let result = core::str::from_utf8(bytes); + let len = result.map_or(bytes.len(), |s| s.chars().count()); + if len > 1 { + self.wtr.write_str(r"(?:")?; + } + match result { + Ok(string) => { + for c in string.chars() { + self.write_literal_char(c)?; + } + } + Err(_) => { + for &b in bytes.iter() { + self.write_literal_byte(b)?; + } + } + } + if len > 1 { + self.wtr.write_str(r")")?; + } } HirKind::Class(hir::Class::Unicode(ref cls)) => { self.wtr.write_str("[")?; @@ -429,19 +455,31 @@ mod tests { #[test] fn regression_repetition_concat() { let expr = Hir::concat(alloc::vec![ - Hir::literal(hir::Literal::Unicode('x')), + Hir::literal("x".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + hir: Box::new(Hir::literal("ab".as_bytes())), + }), + Hir::literal("y".as_bytes()), + ]); + assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), Hir::repetition(hir::Repetition { min: 1, max: None, greedy: true, hir: Box::new(Hir::concat(alloc::vec![ - Hir::literal(hir::Literal::Unicode('a')), - Hir::literal(hir::Literal::Unicode('b')), + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), ])), }), - Hir::literal(hir::Literal::Unicode('y')), + Hir::look(hir::Look::End), ]); - assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); + assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string()); } // Just like regression_repetition_concat, but with the repetition using @@ -451,19 +489,34 @@ mod tests { #[test] fn regression_repetition_alternation() { let expr = Hir::concat(alloc::vec![ - Hir::literal(hir::Literal::Unicode('x')), + Hir::literal("x".as_bytes()), Hir::repetition(hir::Repetition { min: 1, max: None, greedy: true, hir: Box::new(Hir::alternation(alloc::vec![ - Hir::literal(hir::Literal::Unicode('a')), - Hir::literal(hir::Literal::Unicode('b')), + Hir::literal("a".as_bytes()), + Hir::literal("b".as_bytes()), ])), }), - Hir::literal(hir::Literal::Unicode('y')), + Hir::literal("y".as_bytes()), ]); assert_eq!(r"(?:x(?:a|b)+y)", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + hir: Box::new(Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string()); } // This regression test is very similar in flavor to @@ -480,12 +533,21 @@ mod tests { #[test] fn regression_alternation_concat() { let expr = Hir::concat(alloc::vec![ - Hir::literal(hir::Literal::Unicode('a')), + Hir::literal("a".as_bytes()), Hir::alternation(alloc::vec![ - Hir::literal(hir::Literal::Unicode('b')), - Hir::literal(hir::Literal::Unicode('c')), + Hir::literal("b".as_bytes()), + Hir::literal("c".as_bytes()), ]), ]); assert_eq!(r"(?:a(?:b|c))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ]), + ]); + assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string()); } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 021a53db7..a88ae33c8 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -8,6 +8,7 @@ use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ ast::{self, Ast, Span, Visitor}, + either::Either, hir::{self, Error, ErrorKind, Hir}, unicode::{self, ClassQuery}, }; @@ -146,6 +147,12 @@ 
enum HirFrame { /// case in the Ast. They get popped after an inductive (i.e., recursive) /// step is complete. Expr(Hir), + /// A literal that is being constructed, character by character, from the + /// AST. We need this because the AST gives each individual character its + /// own node. So as we see characters, we peek at the top-most HirFrame. + /// If it's a literal, then we add to it. Otherwise, we push a new literal. + /// When it comes time to pop it, we convert it to an Hir via Hir::literal. + Literal(Vec), /// A Unicode character class. This frame is mutated as we descend into /// the Ast of a character class (which is itself its own mini recursive /// structure). @@ -158,6 +165,13 @@ enum HirFrame { /// If `allow_invalid_utf8` is disabled (the default), then a byte /// character is only permitted to match ASCII text. ClassBytes(hir::ClassBytes), + /// This is pushed whenever a repetition is observed. After visiting every + /// sub-expression in the repetition, the translator's stack is expected to + /// have this sentinel at the top. + /// + /// This sentinel only exists to stop other things (like flattening + /// literals) from reaching across repetition operators. + Repetition, /// This is pushed on to the stack upon first seeing any kind of group, /// indicated by parentheses (including non-capturing groups). It is popped /// upon leaving a group. @@ -184,6 +198,14 @@ enum HirFrame { /// every sub-expression in the alternation, the translator's stack is /// popped until it sees an Alternation frame. Alternation, + /// This is pushed immediately before each sub-expression in an + /// alternation. This separates the branches of an alternation on the + /// stack and prevents literal flattening from reaching across alternation + /// branches. + /// + /// It is popped after each expression in a branch until an 'Alternation' + /// frame is observed when doing a post visit on an alternation. + AlternationBranch, } impl HirFrame { @@ -191,6 +213,7 @@ impl HirFrame { fn unwrap_expr(self) -> Hir { match self { HirFrame::Expr(expr) => expr, + HirFrame::Literal(lit) => Hir::literal(lit), _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), } } @@ -221,6 +244,20 @@ impl HirFrame { } } + /// Assert that the current stack frame is a repetition sentinel. If it + /// isn't, then panic. + fn unwrap_repetition(self) { + match self { + HirFrame::Repetition => {} + _ => { + panic!( + "tried to unwrap repetition from HirFrame, got: {:?}", + self + ) + } + } + } + /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). @@ -232,6 +269,20 @@ impl HirFrame { } } } + + /// Assert that the current stack frame is an alternation pipe sentinel. If + /// it isn't, then panic. 
+ fn unwrap_alternation_pipe(self) { + match self { + HirFrame::AlternationBranch => {} + _ => { + panic!( + "tried to unwrap alt pipe from HirFrame, got: {:?}", + self + ) + } + } + } } impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { @@ -255,6 +306,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } + Ast::Repetition(_) => self.push(HirFrame::Repetition), Ast::Group(ref x) => { let old_flags = x .flags() @@ -269,6 +321,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ast::Alternation(ref x) if x.asts.is_empty() => {} Ast::Alternation(_) => { self.push(HirFrame::Alternation); + self.push(HirFrame::AlternationBranch); } _ => {} } @@ -294,7 +347,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::empty())); } Ast::Literal(ref x) => { - self.push(HirFrame::Expr(self.hir_literal(x)?)); + match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => { + if !self.flags().unicode() && ch.len_utf8() > 1 { + return Err(self + .error(x.span, ErrorKind::UnicodeNotAllowed)); + } + match self.case_fold_char(x.span, ch)? { + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + } + } + } + // self.push(HirFrame::Expr(self.hir_literal(x)?)); } Ast::Dot(span) => { self.push(HirFrame::Expr(self.hir_dot(span)?)); @@ -340,6 +406,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); + self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } Ast::Group(ref x) => { @@ -350,7 +417,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Concat(_) => { let mut exprs = vec![]; - while let Some(HirFrame::Expr(expr)) = self.pop() { + while let Some(expr) = self.pop_concat_expr() { if !expr.kind().is_empty() { exprs.push(expr); } @@ -360,7 +427,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Alternation(_) => { let mut exprs = vec![]; - while let Some(HirFrame::Expr(expr)) = self.pop() { + while let Some(expr) = self.pop_alt_expr() { + self.pop().unwrap().unwrap_alternation_pipe(); exprs.push(expr); } exprs.reverse(); @@ -370,6 +438,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ok(()) } + fn visit_alternation_in(&mut self) -> Result<()> { + self.push(HirFrame::AlternationBranch); + Ok(()) + } + fn visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, @@ -592,11 +665,103 @@ impl<'t, 'p> TranslatorI<'t, 'p> { self.trans().stack.borrow_mut().push(frame); } + /// Push the given literal char on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the char + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given char is pushed to the top of the stack. + fn push_char(&self, ch: char) { + let mut buf = [0; 4]; + let bytes = ch.encode_utf8(&mut buf).as_bytes(); + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.extend_from_slice(bytes); + } else { + stack.push(HirFrame::Literal(bytes.to_vec())); + } + } + + /// Push the given literal byte on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the byte + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given byte is pushed to the top of the stack. 
+ fn push_byte(&self, byte: u8) { + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.push(byte); + } else { + stack.push(HirFrame::Literal(vec![byte])); + } + } + /// Pop the top of the call stack. If the call stack is empty, return None. fn pop(&self) -> Option { self.trans().stack.borrow_mut().pop() } + /// Pop an HIR expression from the top of the stack for a concatenation. + /// + /// This returns None if the stack is empty or when a concat frame is seen. + /// Otherwise, it panics if it could not find an HIR expression. + fn pop_concat_expr(&self) -> Option { + let frame = self.pop()?; + match frame { + HirFrame::Concat => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or concat, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or concat, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or concat, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or concat, got group") + } + HirFrame::Alternation => { + unreachable!("expected expr or concat, got alt marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or concat, got alt branch marker") + } + } + } + + /// Pop an HIR expression from the top of the stack for an alternation. + /// + /// This returns None if the stack is empty or when an alternation frame is + /// seen. Otherwise, it panics if it could not find an HIR expression. + fn pop_alt_expr(&self) -> Option { + let frame = self.pop()?; + match frame { + HirFrame::Alternation => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or alt, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or alt, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or alt, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or alt, got group") + } + HirFrame::Concat => { + unreachable!("expected expr or alt, got concat marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or alt, got alt branch marker") + } + } + } + /// Create a new error with the given span and error type. fn error(&self, span: Span, kind: ErrorKind) -> Error { Error { kind, pattern: self.pattern.to_string(), span } @@ -617,55 +782,39 @@ impl<'t, 'p> TranslatorI<'t, 'p> { old_flags } - fn hir_literal(&self, lit: &ast::Literal) -> Result { - let ch = match self.literal_to_char(lit)? { - byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)), - hir::Literal::Unicode(ch) => ch, - }; - if self.flags().case_insensitive() { - self.hir_from_char_case_insensitive(lit.span, ch) - } else { - self.hir_from_char(lit.span, ch) - } - } - /// Convert an Ast literal to its scalar representation. /// /// When Unicode mode is enabled, then this always succeeds and returns a /// `char` (Unicode scalar value). /// - /// When Unicode mode is disabled, then a raw byte is returned. If that - /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns - /// an error. - fn literal_to_char(&self, lit: &ast::Literal) -> Result { + /// When Unicode mode is disabled, then a `char` will still be returned + /// whenever possible. 
A byte is returned only when invalid UTF-8 is + /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte + /// will result in an error when invalid UTF-8 is not allowed. + fn ast_literal_to_scalar( + &self, + lit: &ast::Literal, + ) -> Result> { if self.flags().unicode() { - return Ok(hir::Literal::Unicode(lit.c)); + return Ok(Either::Left(lit.c)); } let byte = match lit.byte() { - None => return Ok(hir::Literal::Unicode(lit.c)), + None => return Ok(Either::Left(lit.c)), Some(byte) => byte, }; if byte <= 0x7F { - return Ok(hir::Literal::Unicode(char::try_from(byte).unwrap())); + return Ok(Either::Left(char::try_from(byte).unwrap())); } if !self.trans().allow_invalid_utf8 { return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); } - Ok(hir::Literal::Byte(byte)) + Ok(Either::Right(byte)) } - fn hir_from_char(&self, span: Span, c: char) -> Result { - if !self.flags().unicode() && c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + fn case_fold_char(&self, span: Span, c: char) -> Result> { + if !self.flags().case_insensitive() { + return Ok(None); } - Ok(Hir::literal(hir::Literal::Unicode(c))) - } - - fn hir_from_char_case_insensitive( - &self, - span: Span, - c: char, - ) -> Result { if self.flags().unicode() { // If case folding won't do anything, then don't bother trying. let map = @@ -673,7 +822,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; if !map { - return self.hir_from_char(span, c); + return Ok(None); } let mut cls = hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( @@ -682,7 +831,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { cls.try_case_fold_simple().map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; - Ok(Hir::class(hir::Class::Unicode(cls))) + Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { if c.len_utf8() > 1 { return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); @@ -690,7 +839,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { // If case folding won't do anything, then don't bother trying. match c { 'A'..='Z' | 'a'..='z' => {} - _ => return self.hir_from_char(span, c), + _ => return Ok(None), } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( @@ -702,7 +851,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { u8::try_from(u32::from(c)).unwrap(), )]); cls.case_fold_simple(); - Ok(Hir::class(hir::Class::Bytes(cls))) + Ok(Some(Hir::class(hir::Class::Bytes(cls)))) } } @@ -973,9 +1122,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { /// Return a scalar byte value suitable for use as a literal in a byte /// character class. fn class_literal_byte(&self, ast: &ast::Literal) -> Result { - match self.literal_to_char(ast)? { - hir::Literal::Byte(byte) => Ok(byte), - hir::Literal::Unicode(ch) => { + match self.ast_literal_to_scalar(ast)? 
{ + Either::Right(byte) => Ok(byte), + Either::Left(ch) => { let cp = u32::from(ch); if cp <= 0x7F { Ok(u8::try_from(cp).unwrap()) @@ -1177,33 +1326,11 @@ mod tests { } fn hir_lit(s: &str) -> Hir { - match s.len() { - 0 => Hir::empty(), - _ => { - let lits = s - .chars() - .map(hir::Literal::Unicode) - .map(Hir::literal) - .collect(); - Hir::concat(lits) - } - } + Hir::literal(s.as_bytes()) } fn hir_blit(s: &[u8]) -> Hir { - match s.len() { - 0 => Hir::empty(), - 1 => Hir::literal(hir::Literal::Byte(s[0])), - _ => { - let lits = s - .iter() - .cloned() - .map(hir::Literal::Byte) - .map(Hir::literal) - .collect(); - Hir::concat(lits) - } - } + Hir::literal(s) } fn hir_group(i: u32, expr: Hir) -> Hir { @@ -1763,13 +1890,7 @@ mod tests { t("ab?"), hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) ); - assert_eq!( - t("(ab)?"), - hir_quest( - true, - hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) - ) - ); + assert_eq!(t("(ab)?"), hir_quest(true, hir_group(1, hir_lit("ab")))); assert_eq!( t("a|b?"), hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) @@ -1778,48 +1899,46 @@ mod tests { #[test] fn cat_alt() { + let a = || hir_look(hir::Look::Start); + let b = || hir_look(hir::Look::End); + let c = || hir_look(hir::Look::WordUnicode); + let d = || hir_look(hir::Look::WordUnicodeNegate); + + assert_eq!(t("(^$)"), hir_group(1, hir_cat(vec![a(), b()]))); + assert_eq!(t("^|$"), hir_alt(vec![a(), b()])); + assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()])); assert_eq!( - t("(ab)"), - hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) - ); - assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),])); - assert_eq!( - t("a|b|c"), - hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) - ); - assert_eq!( - t("ab|bc|cd"), - hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) - ); - assert_eq!( - t("(a|b)"), - hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),])) - ); - assert_eq!( - t("(a|b|c)"), - hir_group( - 1, - hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) - ) + t(r"^$|$\b|\b\B"), + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) ); + assert_eq!(t("(^|$)"), hir_group(1, hir_alt(vec![a(), b()]))); + assert_eq!(t(r"(^|$|\b)"), hir_group(1, hir_alt(vec![a(), b(), c()]))); assert_eq!( - t("(ab|bc|cd)"), + t(r"(^$|$\b|\b\B)"), hir_group( 1, - hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) ) ); assert_eq!( - t("(ab|(bc|(cd)))"), + t(r"(^$|($\b|(\b\B)))"), hir_group( 1, hir_alt(vec![ - hir_lit("ab"), + hir_cat(vec![a(), b()]), hir_group( 2, hir_alt(vec![ - hir_lit("bc"), - hir_group(3, hir_lit("cd")), + hir_cat(vec![b(), c()]), + hir_group(3, hir_cat(vec![c(), d()])), ]) ), ]) diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index f87360f48..8a10279f3 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -38,12 +38,12 @@ This example shows how to parse a pattern string into its HIR: ``` use regex_syntax::Parser; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::hir::Hir; let hir = Parser::new().parse("a|b").unwrap(); assert_eq!(hir, Hir::alternation(vec![ - Hir::literal(hir::Literal::Unicode('a')), - Hir::literal(hir::Literal::Unicode('b')), + Hir::literal("a".as_bytes()), + Hir::literal("b".as_bytes()), ])); ``` diff --git a/src/compile.rs b/src/compile.rs index c7ace466e..1907bc0ae 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ 
-272,10 +272,14 @@ impl Compiler { self.check_size()?; match *expr.kind() { Empty => self.c_empty(), - Literal(hir::Literal::Unicode(c)) => self.c_char(c), - Literal(hir::Literal::Byte(b)) => { - assert!(self.compiled.uses_bytes()); - self.c_byte(b) + Literal(hir::Literal(ref bytes)) => { + if self.compiled.is_reverse { + let mut bytes = bytes.to_vec(); + bytes.reverse(); + self.c_literal(&bytes) + } else { + self.c_literal(bytes) + } } Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), Class(hir::Class::Bytes(ref cls)) => { @@ -519,6 +523,52 @@ impl Compiler { Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } + fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty { + match core::str::from_utf8(bytes) { + Ok(string) => { + let mut it = string.chars(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(ch) => { + if let Some(p) = self.c_char(ch)? { + break p; + } + } + } + }; + for ch in it { + if let Some(p) = self.c_char(ch)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + Err(_) => { + assert!(self.compiled.uses_bytes()); + let mut it = bytes.iter().copied(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(byte) => { + if let Some(p) = self.c_byte(byte)? { + break p; + } + } + } + }; + for byte in it { + if let Some(p) = self.c_byte(byte)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + } + } + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator, diff --git a/src/exec.rs b/src/exec.rs index b9abcdc04..50685bfb5 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -1565,25 +1565,19 @@ fn alternation_literals(expr: &Hir) -> Option>> { _ => return None, // one literal isn't worth it }; - let extendlit = |lit: &Literal, dst: &mut Vec| match *lit { - Literal::Unicode(c) => { - let mut buf = [0; 4]; - dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); - } - Literal::Byte(b) => { - dst.push(b); - } - }; - let mut lits = vec![]; for alt in alts { let mut lit = vec![]; match *alt.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes) + } HirKind::Concat(ref exprs) => { for e in exprs { match *e.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes); + } _ => unreachable!("expected literal, got {:?}", e), } } From 9f6f3678839333af26f9cab87c80086ca4f5e237 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 1 Sep 2022 14:44:31 -0400 Subject: [PATCH 24/79] syntax: improve Debug impls This makes the Debug impls for Literal and ClassRangeBytes a bit better. The former in particular. Instead of just printing a sequence of decimal numbers, we now print them as characters. Given the lackluster support for Vec as a string in the standard library, we copy a little bit of code from regex-automata to make the debug print for the Vec basically as nice as a String. 
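As a rough illustration of the effect (the exact rendering is hedged, since
it depends on how the surrounding HirKind is formatted):

    use regex_syntax::Parser;

    fn main() {
        let hir = Parser::new().parse("abc☃").unwrap();
        println!("{:?}", hir.kind());
        // derived impl (before): roughly Literal([97, 98, 99, 226, 152, 131])
        // with this change:      roughly Literal("abc☃")
    }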
--- regex-syntax/src/debug.rs | 108 ++++++++++++++++++++++++++++++ regex-syntax/src/hir/mod.rs | 29 ++++---- regex-syntax/src/hir/translate.rs | 2 +- regex-syntax/src/lib.rs | 1 + 4 files changed, 122 insertions(+), 18 deletions(-) create mode 100644 regex-syntax/src/debug.rs diff --git a/regex-syntax/src/debug.rs b/regex-syntax/src/debug.rs new file mode 100644 index 000000000..846e68156 --- /dev/null +++ b/regex-syntax/src/debug.rs @@ -0,0 +1,108 @@ +/// A type that wraps a single byte with a convenient fmt::Debug impl that +/// escapes the byte. +pub(crate) struct Byte(pub(crate) u8); + +impl core::fmt::Debug for Byte { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + // Special case ASCII space. It's too hard to read otherwise, so + // put quotes around it. I sometimes wonder whether just '\x20' would + // be better... + if self.0 == b' ' { + return write!(f, "' '"); + } + // 10 bytes is enough to cover any output from ascii::escape_default. + let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} + +/// A type that provides a human readable debug impl for arbitrary bytes. +/// +/// This generally works best when the bytes are presumed to be mostly UTF-8, +/// but will work for anything. +/// +/// N.B. This is copied nearly verbatim from regex-automata. Sigh. +pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]); + +impl<'a> core::fmt::Debug for Bytes<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + // This is a sad re-implementation of a similar impl found in bstr. + let mut bytes = self.0; + while let Some(result) = utf8_decode(bytes) { + let ch = match result { + Ok(ch) => ch, + Err(byte) => { + write!(f, r"\x{:02x}", byte)?; + bytes = &bytes[1..]; + continue; + } + }; + bytes = &bytes[ch.len_utf8()..]; + match ch { + '\0' => write!(f, "\\0")?, + // ASCII control characters except \0, \n, \r, \t + '\x01'..='\x08' + | '\x0b' + | '\x0c' + | '\x0e'..='\x19' + | '\x7f' => { + write!(f, "\\x{:02x}", u32::from(ch))?; + } + '\n' | '\r' | '\t' | _ => { + write!(f, "{}", ch.escape_debug())?; + } + } + } + write!(f, "\"")?; + Ok(()) + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. +fn utf8_decode(bytes: &[u8]) -> Option> { + if bytes.is_empty() { + return None; + } + match core::str::from_utf8(&bytes[..core::cmp::min(4, bytes.len())]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} + +/* +/// Given a UTF-8 leading byte, this returns the total number of code units +/// in the following encoded codepoint. +/// +/// If the given byte is not a valid UTF-8 leading byte, then this returns +/// `None`. 
+fn len(byte: u8) -> Option { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } +} +*/ diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 379deb955..cb0ffb080 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -699,8 +699,7 @@ impl HirKind { /// to the size of the `Hir`. impl core::fmt::Display for Hir { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - use crate::hir::print::Printer; - Printer::new().print(self, f) + crate::hir::print::Printer::new().print(self, f) } } @@ -710,9 +709,15 @@ impl core::fmt::Display for Hir { /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters /// are preferred whenever possible. In particular, a `Byte` variant is only /// ever produced when it could match invalid UTF-8. -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Eq, PartialEq)] pub struct Literal(pub Box<[u8]>); +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + crate::debug::Bytes(&self.0).fmt(f) + } +} + /// The high-level intermediate representation of a character class. /// /// A character class corresponds to a set of characters. A character is either @@ -1262,20 +1267,10 @@ impl ClassBytesRange { impl core::fmt::Debug for ClassBytesRange { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - let mut debug = f.debug_struct("ClassBytesRange"); - if self.start <= 0x7F { - let ch = char::try_from(self.start).unwrap(); - debug.field("start", &ch); - } else { - debug.field("start", &self.start); - } - if self.end <= 0x7F { - let ch = char::try_from(self.start).unwrap(); - debug.field("end", &ch); - } else { - debug.field("end", &self.end); - } - debug.finish() + f.debug_struct("ClassBytesRange") + .field("start", &crate::debug::Byte(self.start)) + .field("end", &crate::debug::Byte(self.end)) + .finish() } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index a88ae33c8..aaf58b75d 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1326,7 +1326,7 @@ mod tests { } fn hir_lit(s: &str) -> Hir { - Hir::literal(s.as_bytes()) + hir_blit(s.as_bytes()) } fn hir_blit(s: &[u8]) -> Hir { diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 8a10279f3..79b48af82 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -180,6 +180,7 @@ pub use crate::{ use alloc::string::String; pub mod ast; +mod debug; mod either; mod error; pub mod hir; From c2daa3bd0dcadd0525fbf0acb601aa772b051841 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 31 Aug 2022 13:26:44 -0400 Subject: [PATCH 25/79] syntax: replace HirInfo with new Properties type This commit completely rewrites how HIR properties are computed inductively. Firstly, 'Properties' is now boxed, so that it contributes less space to each HIR value. This does add an allocation for each HIR expression, but most HIR expressions already require at least one alloc anyway. And there should be far fewer of them now that we collapse literals together. Secondly, 'Properties' now computes far more general attributes instead of hyper-specific things. For example, instead of 'is_match_empty', we now have 'minimum_len' and 'maximum_len'. 
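As a rough sketch of the new API (illustrative only):

    use regex_syntax::Parser;

    let hir = Parser::new().parse("a{2,5}").unwrap();
    assert_eq!(Some(2), hir.properties().minimum_len());
    assert_eq!(Some(5), hir.properties().maximum_len());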
Similarly, instead of 'is_anchored_start' and 'is_anchored_end', we now compute sets of look-around assertions found anywhere, only as a prefix and only as a suffix. We also remove 'is_line_anchored_{start,end}'. There were only used in the 'grep-regex' crate and they seem unnecessary. They were otherwise fairly weird properties to compute. --- regex-syntax/src/hir/mod.rs | 1007 ++++++++++++++++++----------- regex-syntax/src/hir/translate.rs | 433 ++++++------- src/compile.rs | 18 +- src/exec.rs | 22 +- 4 files changed, 858 insertions(+), 622 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index cb0ffb080..ea41028d5 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -160,7 +160,7 @@ pub struct Hir { /// The underlying HIR kind. kind: HirKind, /// Analysis info about this HIR, computed during construction. - info: HirInfo, + props: Properties, } /// The kind of an arbitrary `Hir` expression. @@ -195,6 +195,7 @@ pub enum HirKind { Alternation(Vec), } +/// Methods for accessing the underlying `HirKind` and `Properties`. impl Hir { /// Returns a reference to the underlying HIR kind. pub fn kind(&self) -> &HirKind { @@ -207,23 +208,30 @@ impl Hir { core::mem::replace(&mut self.kind, HirKind::Empty) } + /// Returns the properties computed for this `Hir`. + pub fn properties(&self) -> &Properties { + &self.props + } +} + +/// Smart constructors for HIR values. +/// +/// These constructors are called "smart" because they inductive work or +/// simplifications. For example, calling `Hir::repetition` with a repetition +/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind +/// since it is equivalent to an empty regex. Another example is calling +/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll +/// just get back the original `expr` since it's precisely equivalent. +/// +/// Smart constructors enable maintaining invariants about the HIR data type +/// while also simulanteously keeping the representation as simple as possible. +impl Hir { /// Returns an empty HIR expression. /// /// An empty HIR expression always matches, including the empty string. pub fn empty() -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(true); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Empty, info } + let props = Properties::empty(); + Hir { kind: HirKind::Empty, props } } /// Creates a literal HIR expression. @@ -237,77 +245,21 @@ impl Hir { return Hir::empty(); } - let mut info = HirInfo::new(); - info.set_always_utf8(core::str::from_utf8(&bytes).is_ok()); - info.set_all_assertions(false); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(false); - info.set_literal(true); - info.set_alternation_literal(true); - Hir { kind: HirKind::Literal(Literal(bytes)), info } + let lit = Literal(bytes); + let props = Properties::literal(&lit); + Hir { kind: HirKind::Literal(lit), props } } /// Creates a class HIR expression. 
pub fn class(class: Class) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(class.is_always_utf8()); - info.set_all_assertions(false); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(false); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Class(class), info } + let props = Properties::class(&class); + Hir { kind: HirKind::Class(class), props } } /// Creates a look-around assertion HIR expression. pub fn look(look: Look) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - // All look-around assertions always produce zero-length or "empty" - // matches. This is true even though not all of them (like \b) match - // the empty string itself. That is, '\b' does not match ''. But it - // does match the empty string between '!' and 'a' in '!a'. - info.set_match_empty(true); - info.set_literal(false); - info.set_alternation_literal(false); - if let Look::Start = look { - info.set_anchored_start(true); - info.set_line_anchored_start(true); - info.set_any_anchored_start(true); - } - if let Look::End = look { - info.set_anchored_end(true); - info.set_line_anchored_end(true); - info.set_any_anchored_end(true); - } - if let Look::StartLF = look { - info.set_line_anchored_start(true); - } - if let Look::EndLF = look { - info.set_line_anchored_end(true); - } - if let Look::WordAsciiNegate = look { - // Negated ASCII word boundaries can match invalid UTF-8. - info.set_always_utf8(false); - } - Hir { kind: HirKind::Look(look), info } + let props = Properties::look(look); + Hir { kind: HirKind::Look(look), props } } /// Creates a repetition HIR expression. @@ -318,46 +270,14 @@ impl Hir { if rep.min == 0 && rep.max == Some(0) { return Hir::empty(); } - let mut info = HirInfo::new(); - info.set_always_utf8(rep.hir.is_always_utf8()); - info.set_all_assertions(rep.hir.is_all_assertions()); - // If this operator can match the empty string, then it can never - // be anchored. - info.set_anchored_start( - !rep.is_match_empty() && rep.hir.is_anchored_start(), - ); - info.set_anchored_end( - !rep.is_match_empty() && rep.hir.is_anchored_end(), - ); - info.set_line_anchored_start( - !rep.is_match_empty() && rep.hir.is_anchored_start(), - ); - info.set_line_anchored_end( - !rep.is_match_empty() && rep.hir.is_anchored_end(), - ); - info.set_any_anchored_start(rep.hir.is_any_anchored_start()); - info.set_any_anchored_end(rep.hir.is_any_anchored_end()); - info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Repetition(rep), info } + let props = Properties::repetition(&rep); + Hir { kind: HirKind::Repetition(rep), props } } /// Creates a group HIR expression. 
pub fn group(group: Group) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(group.hir.is_always_utf8()); - info.set_all_assertions(group.hir.is_all_assertions()); - info.set_anchored_start(group.hir.is_anchored_start()); - info.set_anchored_end(group.hir.is_anchored_end()); - info.set_line_anchored_start(group.hir.is_line_anchored_start()); - info.set_line_anchored_end(group.hir.is_line_anchored_end()); - info.set_any_anchored_start(group.hir.is_any_anchored_start()); - info.set_any_anchored_end(group.hir.is_any_anchored_end()); - info.set_match_empty(group.hir.is_match_empty()); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Group(group), info } + let props = Properties::group(&group); + Hir { kind: HirKind::Group(group), props } } /// Returns the concatenation of the given expressions. @@ -368,87 +288,8 @@ impl Hir { 0 => Hir::empty(), 1 => exprs.pop().unwrap(), _ => { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(true); - info.set_literal(true); - info.set_alternation_literal(true); - - // Some attributes require analyzing all sub-expressions. - for e in &exprs { - let x = info.is_always_utf8() && e.is_always_utf8(); - info.set_always_utf8(x); - - let x = info.is_all_assertions() && e.is_all_assertions(); - info.set_all_assertions(x); - - let x = info.is_any_anchored_start() - || e.is_any_anchored_start(); - info.set_any_anchored_start(x); - - let x = - info.is_any_anchored_end() || e.is_any_anchored_end(); - info.set_any_anchored_end(x); - - let x = info.is_match_empty() && e.is_match_empty(); - info.set_match_empty(x); - - let x = info.is_literal() && e.is_literal(); - info.set_literal(x); - - let x = info.is_alternation_literal() - && e.is_alternation_literal(); - info.set_alternation_literal(x); - } - // Anchored attributes require something slightly more - // sophisticated. Normally, WLOG, to determine whether an - // expression is anchored to the start, we'd only need to check - // the first expression of a concatenation. However, - // expressions like `$\b^` are still anchored to the start, - // but the first expression in the concatenation *isn't* - // anchored to the start. So the "first" expression to look at - // is actually one that is either not an assertion or is - // specifically the StartText assertion. - info.set_anchored_start( - exprs - .iter() - .take_while(|e| { - e.is_anchored_start() || e.is_all_assertions() - }) - .any(|e| e.is_anchored_start()), - ); - // Similarly for the end anchor, but in reverse. - info.set_anchored_end( - exprs - .iter() - .rev() - .take_while(|e| { - e.is_anchored_end() || e.is_all_assertions() - }) - .any(|e| e.is_anchored_end()), - ); - // Repeat the process for line anchors. 
- info.set_line_anchored_start( - exprs - .iter() - .take_while(|e| { - e.is_line_anchored_start() || e.is_all_assertions() - }) - .any(|e| e.is_line_anchored_start()), - ); - info.set_line_anchored_end( - exprs - .iter() - .rev() - .take_while(|e| { - e.is_line_anchored_end() || e.is_all_assertions() - }) - .any(|e| e.is_line_anchored_end()), - ); - Hir { kind: HirKind::Concat(exprs), info } + let props = Properties::concat(&exprs); + Hir { kind: HirKind::Concat(exprs), props } } } } @@ -461,56 +302,8 @@ impl Hir { 0 => Hir::empty(), 1 => exprs.pop().unwrap(), _ => { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(true); - info.set_anchored_end(true); - info.set_line_anchored_start(true); - info.set_line_anchored_end(true); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(false); - info.set_literal(false); - info.set_alternation_literal(true); - - // Some attributes require analyzing all sub-expressions. - for e in &exprs { - let x = info.is_always_utf8() && e.is_always_utf8(); - info.set_always_utf8(x); - - let x = info.is_all_assertions() && e.is_all_assertions(); - info.set_all_assertions(x); - - let x = info.is_anchored_start() && e.is_anchored_start(); - info.set_anchored_start(x); - - let x = info.is_anchored_end() && e.is_anchored_end(); - info.set_anchored_end(x); - - let x = info.is_line_anchored_start() - && e.is_line_anchored_start(); - info.set_line_anchored_start(x); - - let x = info.is_line_anchored_end() - && e.is_line_anchored_end(); - info.set_line_anchored_end(x); - - let x = info.is_any_anchored_start() - || e.is_any_anchored_start(); - info.set_any_anchored_start(x); - - let x = - info.is_any_anchored_end() || e.is_any_anchored_end(); - info.set_any_anchored_end(x); - - let x = info.is_match_empty() || e.is_match_empty(); - info.set_match_empty(x); - - let x = info.is_alternation_literal() && e.is_literal(); - info.set_alternation_literal(x); - } - Hir { kind: HirKind::Alternation(exprs), info } + let props = Properties::alternation(&exprs); + Hir { kind: HirKind::Alternation(exprs), props } } } } @@ -556,110 +349,6 @@ impl Hir { Hir::class(Class::Unicode(cls)) } } - - /// Return true if and only if this HIR will always match valid UTF-8. - /// - /// When this returns false, then it is possible for this HIR expression - /// to match invalid UTF-8. - pub fn is_always_utf8(&self) -> bool { - self.info.is_always_utf8() - } - - /// Returns true if and only if this entire HIR expression is made up of - /// zero-width assertions. - /// - /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but - /// not `^a`. - pub fn is_all_assertions(&self) -> bool { - self.info.is_all_assertions() - } - - /// Return true if and only if this HIR is required to match from the - /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, - /// `^foo|^bar` but not `^foo|bar`. - pub fn is_anchored_start(&self) -> bool { - self.info.is_anchored_start() - } - - /// Return true if and only if this HIR is required to match at the end - /// of text. This includes expressions like `foo$`, `(foo|bar)$`, - /// `foo$|bar$` but not `foo$|bar`. - pub fn is_anchored_end(&self) -> bool { - self.info.is_anchored_end() - } - - /// Return true if and only if this HIR is required to match from the - /// beginning of text or the beginning of a line. 
This includes expressions - /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar` - /// but not `^foo|bar` or `(?m)^foo|bar`. - /// - /// Note that if `is_anchored_start` is `true`, then - /// `is_line_anchored_start` will also be `true`. The reverse implication - /// is not true. For example, `(?m)^foo` is line anchored, but not - /// `is_anchored_start`. - pub fn is_line_anchored_start(&self) -> bool { - self.info.is_line_anchored_start() - } - - /// Return true if and only if this HIR is required to match at the - /// end of text or the end of a line. This includes expressions like - /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`, - /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`. - /// - /// Note that if `is_anchored_end` is `true`, then - /// `is_line_anchored_end` will also be `true`. The reverse implication - /// is not true. For example, `(?m)foo$` is line anchored, but not - /// `is_anchored_end`. - pub fn is_line_anchored_end(&self) -> bool { - self.info.is_line_anchored_end() - } - - /// Return true if and only if this HIR contains any sub-expression that - /// is required to match at the beginning of text. Specifically, this - /// returns true if the `^` symbol (when multiline mode is disabled) or the - /// `\A` escape appear anywhere in the regex. - pub fn is_any_anchored_start(&self) -> bool { - self.info.is_any_anchored_start() - } - - /// Return true if and only if this HIR contains any sub-expression that is - /// required to match at the end of text. Specifically, this returns true - /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape - /// appear anywhere in the regex. - pub fn is_any_anchored_end(&self) -> bool { - self.info.is_any_anchored_end() - } - - /// Return true if and only if the empty string is part of the language - /// matched by this regular expression. - /// - /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b` - /// and `\B`, but not `a` or `a+`. - pub fn is_match_empty(&self) -> bool { - self.info.is_match_empty() - } - - /// Return true if and only if this HIR is a simple literal. This is only - /// true when this HIR expression is either itself a `Literal` or a - /// concatenation of only `Literal`s. - /// - /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, - /// `` are not (even though that contain sub-expressions that are literals). - pub fn is_literal(&self) -> bool { - self.info.is_literal() - } - - /// Return true if and only if this HIR is either a simple literal or an - /// alternation of simple literals. This is only - /// true when this HIR expression is either itself a `Literal` or a - /// concatenation of only `Literal`s or an alternation of only `Literal`s. - /// - /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation - /// literals, but `f+`, `(foo)`, `foo()`, `` - /// are not (even though that contain sub-expressions that are literals). - pub fn is_alternation_literal(&self) -> bool { - self.info.is_alternation_literal() - } } impl HirKind { @@ -810,12 +499,40 @@ impl Class { /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete /// syntax or in the parser builder. By default, Unicode mode is /// enabled. - pub fn is_always_utf8(&self) -> bool { + pub fn is_utf8(&self) -> bool { match *self { Class::Unicode(_) => true, Class::Bytes(ref x) => x.is_all_ascii(), } } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. 
+ /// + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + pub fn minimum_len(&self) -> Option { + match *self { + Class::Unicode(ref x) => x.minimum_len(), + Class::Bytes(ref x) => x.minimum_len(), + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + pub fn maximum_len(&self) -> Option { + match *self { + Class::Unicode(ref x) => x.maximum_len(), + Class::Bytes(ref x) => x.maximum_len(), + } + } } /// A set of characters represented by Unicode scalar values. @@ -935,6 +652,26 @@ impl ClassUnicode { pub fn is_all_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option { + let first = self.ranges().get(0)?; + // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). + Some(first.start.len_utf8()) + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option { + let last = self.ranges().last()?; + // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). + Some(last.end.len_utf8()) + } } /// An iterator over all ranges in a Unicode character class. @@ -1163,6 +900,30 @@ impl ClassBytes { pub fn is_all_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } } /// An iterator over all ranges in a byte character class. @@ -1277,7 +1038,7 @@ impl core::fmt::Debug for ClassBytesRange { /// The high-level intermediate representation for a look-around assertion. /// /// An assertion match is always zero-length. Also called an "empty match." -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum Look { /// Match the beginning of text. Specifically, this matches at the starting /// position of the input. 
@@ -1307,6 +1068,48 @@ pub enum Look { WordUnicodeNegate, } +impl Look { + fn from_repr(repr: u8) -> Option { + match repr { + 0 => Some(Look::Start), + 1 => Some(Look::End), + 2 => Some(Look::StartLF), + 3 => Some(Look::EndLF), + 4 => Some(Look::WordAscii), + 5 => Some(Look::WordAsciiNegate), + 6 => Some(Look::WordUnicode), + 7 => Some(Look::WordUnicodeNegate), + _ => None, + } + } + + fn as_repr(&self) -> u8 { + match *self { + Look::Start => 0, + Look::End => 1, + Look::StartLF => 2, + Look::EndLF => 3, + Look::WordAscii => 4, + Look::WordAsciiNegate => 5, + Look::WordUnicode => 6, + Look::WordUnicodeNegate => 7, + } + } + + fn as_char(self) -> char { + match self { + Look::Start => 'A', + Look::End => 'z', + Look::StartLF => '^', + Look::EndLF => '$', + Look::WordAscii => 'b', + Look::WordAsciiNegate => 'B', + Look::WordUnicode => '𝛃', + Look::WordUnicodeNegate => '𝚩', + } + } +} + /// The high-level intermediate representation for a group. /// /// This represents one of three possible group types: @@ -1429,52 +1232,480 @@ impl Drop for Hir { } } -/// A type that documents various attributes of an HIR expression. +/// A type that collects various properties of an HIR value. /// -/// These attributes are typically defined inductively on the HIR. +/// Properties are always scalar values and represent meta data that is +/// computed inductively on an HIR value. Properties are defined for all +/// HIR values. +/// +/// All methods on a `Properties` value take constant time. #[derive(Clone, Debug, Eq, PartialEq)] -struct HirInfo { - /// Represent yes/no questions by a bitfield to conserve space, since - /// this is included in every HIR expression. +pub struct Properties(Box); + +/// The property definition. It is split out so that we can box it, and +/// there by make `Properties` use less stack size. This is kind-of important +/// because every HIR value has a `Properties` attached to it. +/// +/// This does have the unfortunate consequence that creating any HIR value +/// always leads to at least one alloc for properties, but this is generally +/// true anyway (for pretty much all HirKinds except for look-arounds). +#[derive(Clone, Debug, Eq, PartialEq)] +struct PropertiesI { + minimum_len: Option, + maximum_len: Option, + look_set: LookSet, + look_set_prefix: LookSet, + look_set_suffix: LookSet, + utf8: bool, + literal: bool, + alternation_literal: bool, +} + +impl Properties { + /// Returns the length (in bytes) of the smallest string matched by this + /// HIR. /// - /// If more attributes need to be added, it is OK to increase the size of - /// this as appropriate. - bools: u16, + /// A return value of `0` is possible and occurs when the HIR can match an + /// empty string. + /// + /// `None` is returned when there is no minimum length. This occurs in + /// precisely the cases where the HIR matches nothing. i.e., The language + /// the regex matches is empty. An example of such a regex is `\P{any}`. + pub fn minimum_len(&self) -> Option { + self.0.minimum_len + } + + /// Returns the length (in bytes) of the longest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when nothing longer than + /// the empty string is in the language described by this HIR. + /// + /// `None` is returned when there is no longest matching string. This + /// occurs when the HIR matches nothing or when there is no upper bound + /// on the length of matching strings. An example of such a regex is + /// `\P{any}`. 
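+    /// An example of a regex with no upper bound on the length of matching
+    /// strings is `a+`.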
+ pub fn maximum_len(&self) -> Option { + self.0.maximum_len + } + + /// Returns a set of all look-around assertions that appear at least once + /// in this HIR value. + pub fn look_set(&self) -> LookSet { + self.0.look_set + } + + /// Returns a set of all look-around assertions that appear as a prefix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed before matching any bytes in a haystack. + /// + /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true + /// if and only if the HIR is fully anchored at the start. + pub fn look_set_prefix(&self) -> LookSet { + self.0.look_set_prefix + } + + /// Returns a set of all look-around assertions that appear as a suffix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed in order to be considered a match after + /// all other consuming HIR expressions. + /// + /// For example, `hir.look_set_suffix().contains(Look::End)` returns true + /// if and only if the HIR is fully anchored at the end. + pub fn look_set_suffix(&self) -> LookSet { + self.0.look_set_suffix + } + + /// Return true if and only if the corresponding HIR will always match + /// valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression to + /// match invalid UTF-8. + /// + /// Note that this returns true even when the corresponding HIR can match + /// the empty string. Since an empty string can technically appear between + /// UTF-8 code units, it is possible for a match to be reported that splits + /// a codepoint which could in turn be considered matching invalid UTF-8. + /// However, it is generally assumed that such empty matches are handled + /// specially by the search routine if it is absolutely required that + /// matches not split a codepoint. + pub fn is_utf8(&self) -> bool { + self.0.utf8 + } + + /// Return true if and only if this HIR is a simple literal. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s. + /// + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, + /// `` are not (even though that contain sub-expressions that are literals). + pub fn is_literal(&self) -> bool { + self.0.literal + } + + /// Return true if and only if this HIR is either a simple literal or an + /// alternation of simple literals. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s or an alternation of only `Literal`s. + /// + /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation + /// literals, but `f+`, `(foo)`, `foo()`, `` + /// are not (even though that contain sub-expressions that are literals). + pub fn is_alternation_literal(&self) -> bool { + self.0.alternation_literal + } } -// A simple macro for defining bitfield accessors/mutators. -macro_rules! define_bool { - ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { - fn $is_fn_name(&self) -> bool { - self.bools & (0b1 << $bit) > 0 +impl Properties { + /// Create a new set of HIR properties for an empty regex. + fn empty() -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + // It is debatable whether an empty regex always matches at valid + // UTF-8 boundaries. Strictly speaking, at a byte oriented view, + // it is clearly false. 
There are, for example, many empty strings + // between the bytes encoding a '☃'. + // + // However, when Unicode mode is enabled, the fundamental atom + // of matching is really a codepoint. And in that scenario, an + // empty regex is defined to only match at valid UTF-8 boundaries + // and to never split a codepoint. It just so happens that this + // enforcement is somewhat tricky to do for regexes that match + // the empty string inside regex engines themselves. It usually + // requires some layer above the regex engine to filter out such + // matches. + // + // In any case, 'true' is really the only coherent option. If it + // were false, for example, then 'a*' would also need to be false + // since it too can match the empty string. + utf8: true, + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a literal regex. + fn literal(lit: &Literal) -> Properties { + let inner = PropertiesI { + minimum_len: Some(lit.0.len()), + maximum_len: Some(lit.0.len()), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + utf8: core::str::from_utf8(&lit.0).is_ok(), + literal: true, + alternation_literal: true, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a character class. + fn class(class: &Class) -> Properties { + let inner = PropertiesI { + minimum_len: class.minimum_len(), + maximum_len: class.maximum_len(), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + utf8: class.is_utf8(), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a look-around assertion. + fn look(look: Look) -> Properties { + use self::Look::*; + + let utf8 = match look { + Start | End | StartLF | EndLF | WordAscii | WordUnicode + | WordUnicodeNegate => true, + // FIXME: Negated ASCII word boundaries can match invalid UTF-8. + // But why is this 'false' when 'HirKind::Empty' is true? After + // all, isn't WordAsciiNegate just a subset of HirKind::Empty? It + // seems to me that if we handle HirKind::Empty correctly even when + // it splits a codepoint, then we should be able to automatically + // handle WordAsciiNegate correctly too... + // + // For now, this returns 'false' because that's what it did before. + // But we should revisit this before the next release. + WordAsciiNegate => false, + }; + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::singleton(look), + look_set_prefix: LookSet::singleton(look), + look_set_suffix: LookSet::singleton(look), + utf8, + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a repetition. 
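+    /// For example, a `{2,5}` repetition over a child expression with a
+    /// minimum length of 1 and a maximum length of 3 yields a minimum
+    /// length of 2 and a maximum length of 15.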
+ fn repetition(rep: &Repetition) -> Properties { + let minimum_len = + rep.hir.properties().minimum_len().map(|child_min| { + let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX); + child_min.saturating_mul(rep_min) + }); + let maximum_len = rep.max.and_then(|rep_max| { + let rep_max = usize::try_from(rep_max).ok()?; + let child_max = rep.hir.properties().maximum_len()?; + child_max.checked_mul(rep_max) + }); + + let mut inner = PropertiesI { + minimum_len, + maximum_len, + look_set: rep.hir.properties().look_set(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + utf8: rep.hir.properties().is_utf8(), + literal: false, + alternation_literal: false, + }; + if !rep.is_match_empty() { + let child_props = rep.hir.properties(); + inner.look_set_prefix = child_props.look_set_prefix(); + inner.look_set_suffix = child_props.look_set_suffix(); } - - fn $set_fn_name(&mut self, yes: bool) { - if yes { - self.bools |= 1 << $bit; - } else { - self.bools &= !(1 << $bit); + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a group. + fn group(group: &Group) -> Properties { + // FIXME: Groups really should always have the same properties as + // their child expressions. But the literal properties are somewhat + // over-constrained in what they represent in order to make downstream + // analyses a bit more straight-forward. + Properties(Box::new(PropertiesI { + literal: false, + alternation_literal: false, + ..*group.hir.properties().0.clone() + })) + } + + /// Create a new set of HIR properties for a concatenation. + fn concat(concat: &[Hir]) -> Properties { + // The base case is an empty concatenation, which matches the empty + // string. Note though that empty concatenations aren't possible, + // because the Hir::concat smart constructor rewrites those as + // Hir::empty. + let mut props = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + utf8: true, + literal: true, + alternation_literal: true, + }; + // Handle properties that need to visit every child hir. + for x in concat.iter() { + props.look_set.union(x.properties().look_set()); + props.utf8 = props.utf8 && x.properties().is_utf8(); + props.literal = props.literal && x.properties().is_literal(); + props.alternation_literal = props.alternation_literal + && x.properties().is_alternation_literal(); + if let Some(ref mut minimum_len) = props.minimum_len { + match x.properties().minimum_len() { + None => props.minimum_len = None, + Some(x) => *minimum_len += x, + } + } + if let Some(ref mut maximum_len) = props.maximum_len { + match x.properties().maximum_len() { + None => props.maximum_len = None, + Some(x) => *maximum_len += x, + } + } + } + // Handle the prefix properties, which only requires visiting + // child exprs until one matches more than the empty string. + let mut it = concat.iter(); + while let Some(x) = it.next() { + props.look_set_prefix.union(x.properties().look_set_prefix()); + if x.properties().maximum_len().map_or(true, |x| x > 0) { + break; + } + } + // Same thing for the suffix properties, but in reverse. + let mut it = concat.iter().rev(); + while let Some(x) = it.next() { + props.look_set_suffix.union(x.properties().look_set_suffix()); + if x.properties().maximum_len().map_or(true, |x| x > 0) { + break; } } - }; + Properties(Box::new(props)) + } + + /// Create a new set of HIR properties for an alternation. 
+ fn alternation(alts: &[Hir]) -> Properties { + // While empty alternations aren't possible, we still behave as if they + // are. When we have an empty alternate, then clearly the look-around + // prefix and suffix is empty. Otherwise, it is the intersection of all + // prefixes and suffixes (respectively) of the branches. + let fix = + if alts.is_empty() { LookSet::empty() } else { LookSet::full() }; + // The base case is an empty alternation, which matches nothing. + // Note though that empty alternations aren't possible, because the + // Hir::alternation smart constructor rewrites those as empty character + // classes. + let mut props = PropertiesI { + minimum_len: None, + maximum_len: None, + look_set: LookSet::empty(), + look_set_prefix: fix, + look_set_suffix: fix, + utf8: true, + literal: false, + alternation_literal: true, + }; + // Handle properties that need to visit every child hir. + for x in alts.iter() { + props.look_set.union(x.properties().look_set()); + props.look_set_prefix.intersect(x.properties().look_set_prefix()); + props.look_set_suffix.intersect(x.properties().look_set_suffix()); + props.utf8 = props.utf8 && x.properties().is_utf8(); + props.alternation_literal = props.alternation_literal + && x.properties().is_alternation_literal(); + if let Some(xmin) = x.properties().minimum_len() { + if props.minimum_len.map_or(true, |pmin| xmin < pmin) { + props.minimum_len = Some(xmin); + } + } + if let Some(xmax) = x.properties().maximum_len() { + if props.maximum_len.map_or(true, |pmax| xmax > pmax) { + props.maximum_len = Some(xmax); + } + } + } + Properties(Box::new(props)) + } } -impl HirInfo { - fn new() -> HirInfo { - HirInfo { bools: 0 } - } - - define_bool!(0, is_always_utf8, set_always_utf8); - define_bool!(1, is_all_assertions, set_all_assertions); - define_bool!(2, is_anchored_start, set_anchored_start); - define_bool!(3, is_anchored_end, set_anchored_end); - define_bool!(4, is_line_anchored_start, set_line_anchored_start); - define_bool!(5, is_line_anchored_end, set_line_anchored_end); - define_bool!(6, is_any_anchored_start, set_any_anchored_start); - define_bool!(7, is_any_anchored_end, set_any_anchored_end); - define_bool!(8, is_match_empty, set_match_empty); - define_bool!(9, is_literal, set_literal); - define_bool!(10, is_alternation_literal, set_alternation_literal); +/// A set of look-around assertions. +/// +/// This is useful for efficiently tracking look-around assertions. For +/// example, an [`Hir`] provides properties that return `LookSet`s. +#[derive(Clone, Copy, Default, Eq, PartialEq)] +pub struct LookSet { + bits: u8, +} + +impl LookSet { + /// Create an empty set of look-around assertions. + pub fn empty() -> LookSet { + LookSet { bits: 0 } + } + + /// Create a full set of look-around assertions. + /// + /// This set contains all possible look-around assertions. + pub fn full() -> LookSet { + LookSet { bits: !0 } + } + + /// Create a look-around set containing the look-around assertion given. + /// + /// This is a convenience routine for creating an empty set and inserting + /// one look-around assertions. + pub fn singleton(look: Look) -> LookSet { + let mut set = LookSet::empty(); + set.insert(look); + set + } + + /// Returns the total number of look-around assertions in this set. + pub fn len(&self) -> usize { + // OK because max value always fits in a u8, which in turn always + // fits in a usize, regardless of target. + usize::try_from(self.bits.count_ones()).unwrap() + } + + /// Returns true if and only if this set is empty. 
+ pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Insert the given look-around assertions into this set. If the assertion + /// is already in the set, then this is a no-op. + pub fn insert(&mut self, look: Look) { + self.bits |= 1 << look.as_repr(); + } + + /// Remove the given look-around assertion from this set. If it wasn't + /// previously in the set, then this is a no-op. + pub fn remove(&mut self, look: Look) { + self.bits &= !(1 << look.as_repr()); + } + + /// Returns true if and only if the given look-around assertion is in this + /// set. + pub fn contains(&self, look: Look) -> bool { + self.bits & (1 << look.as_repr()) != 0 + } + + /// Modifies this set to be the union of itself and the set given. + pub fn union(&mut self, other: LookSet) { + self.bits |= other.bits; + } + + /// Modifies this set to be the intersection of itself and the set given. + pub fn intersect(&mut self, other: LookSet) { + self.bits &= other.bits; + } + + /// Returns an iterator over all of the look-around assertions in this set. + #[inline] + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } + } +} + +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "∅"); + } + for look in self.iter() { + write!(f, "{}", look.as_char())?; + } + Ok(()) + } +} + +/// An iterator over all look-around assertions in a [`LookSet`]. +/// +/// This iterator is created by [`LookSet::iter`]. +#[derive(Clone, Debug)] +pub struct LookSetIter { + set: LookSet, +} + +impl Iterator for LookSetIter { + type Item = Look; + + #[inline] + fn next(&mut self) -> Option { + // We'll never have more than u8::MAX distinct look-around assertions, + // so 'repr' will always fit into a usize. + let repr = u8::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(repr)?; + self.set.remove(look); + Some(look) + } } #[cfg(test)] @@ -2225,11 +2456,11 @@ mod tests { expr = Hir { kind: HirKind::Concat(vec![expr]), - info: HirInfo::new(), + props: Properties::empty(), }; expr = Hir { kind: HirKind::Alternation(vec![expr]), - info: HirInfo::new(), + props: Properties::empty(), }; } assert!(!expr.kind.is_empty()); diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index aaf58b75d..831548c2c 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1270,7 +1270,7 @@ fn ascii_class_as_chars( mod tests { use crate::{ ast::{self, parse::ParserBuilder, Ast, Position, Span}, - hir::{self, Hir, HirKind}, + hir::{self, Hir, HirKind, Look, Properties}, unicode::{self, ClassQuery}, }; @@ -1325,6 +1325,14 @@ mod tests { .unwrap() } + fn props(pattern: &str) -> Properties { + t(pattern).properties().clone() + } + + fn props_bytes(pattern: &str) -> Properties { + t_bytes(pattern).properties().clone() + } + fn hir_lit(s: &str) -> Hir { hir_blit(s.as_bytes()) } @@ -3027,273 +3035,258 @@ mod tests { } #[test] - fn analysis_is_always_utf8() { + fn analysis_is_utf8() { // Positive examples. 
- assert!(t_bytes(r"a").is_always_utf8()); - assert!(t_bytes(r"ab").is_always_utf8()); - assert!(t_bytes(r"(?-u)a").is_always_utf8()); - assert!(t_bytes(r"(?-u)ab").is_always_utf8()); - assert!(t_bytes(r"\xFF").is_always_utf8()); - assert!(t_bytes(r"\xFF\xFF").is_always_utf8()); - assert!(t_bytes(r"[^a]").is_always_utf8()); - assert!(t_bytes(r"[^a][^a]").is_always_utf8()); - assert!(t_bytes(r"\b").is_always_utf8()); - assert!(t_bytes(r"\B").is_always_utf8()); - assert!(t_bytes(r"(?-u)\b").is_always_utf8()); + assert!(props_bytes(r"a").is_utf8()); + assert!(props_bytes(r"ab").is_utf8()); + assert!(props_bytes(r"(?-u)a").is_utf8()); + assert!(props_bytes(r"(?-u)ab").is_utf8()); + assert!(props_bytes(r"\xFF").is_utf8()); + assert!(props_bytes(r"\xFF\xFF").is_utf8()); + assert!(props_bytes(r"[^a]").is_utf8()); + assert!(props_bytes(r"[^a][^a]").is_utf8()); + assert!(props_bytes(r"\b").is_utf8()); + assert!(props_bytes(r"\B").is_utf8()); + assert!(props_bytes(r"(?-u)\b").is_utf8()); // Negative examples. - assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8()); - assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8()); - assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8()); - assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8()); - assert!(!t_bytes(r"(?-u)\B").is_always_utf8()); + assert!(!props_bytes(r"(?-u)\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a]").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8()); + assert!(!props_bytes(r"(?-u)\B").is_utf8()); } #[test] fn analysis_is_all_assertions() { // Positive examples. - assert!(t(r"\b").is_all_assertions()); - assert!(t(r"\B").is_all_assertions()); - assert!(t(r"^").is_all_assertions()); - assert!(t(r"$").is_all_assertions()); - assert!(t(r"\A").is_all_assertions()); - assert!(t(r"\z").is_all_assertions()); - assert!(t(r"$^\z\A\b\B").is_all_assertions()); - assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions()); - assert!(t(r"^$|$^").is_all_assertions()); - assert!(t(r"((\b)+())*^").is_all_assertions()); + let p = props(r"\b"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\A"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\z"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$^\z\A\b\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$|^|\z|\A|\b|\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^$|$^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"((\b)+())*^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); // Negative examples. - assert!(!t(r"^a").is_all_assertions()); + let p = props(r"^a"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(1)); } #[test] fn analysis_is_anchored() { + let is_start = |p| props(p).look_set_prefix().contains(Look::Start); + let is_end = |p| props(p).look_set_suffix().contains(Look::End); + // Positive examples. 
- assert!(t(r"^").is_anchored_start()); - assert!(t(r"$").is_anchored_end()); - assert!(t(r"^").is_line_anchored_start()); - assert!(t(r"$").is_line_anchored_end()); - - assert!(t(r"^^").is_anchored_start()); - assert!(t(r"$$").is_anchored_end()); - assert!(t(r"^^").is_line_anchored_start()); - assert!(t(r"$$").is_line_anchored_end()); - - assert!(t(r"^$").is_anchored_start()); - assert!(t(r"^$").is_anchored_end()); - assert!(t(r"^$").is_line_anchored_start()); - assert!(t(r"^$").is_line_anchored_end()); - - assert!(t(r"^foo").is_anchored_start()); - assert!(t(r"foo$").is_anchored_end()); - assert!(t(r"^foo").is_line_anchored_start()); - assert!(t(r"foo$").is_line_anchored_end()); - - assert!(t(r"^foo|^bar").is_anchored_start()); - assert!(t(r"foo$|bar$").is_anchored_end()); - assert!(t(r"^foo|^bar").is_line_anchored_start()); - assert!(t(r"foo$|bar$").is_line_anchored_end()); - - assert!(t(r"^(foo|bar)").is_anchored_start()); - assert!(t(r"(foo|bar)$").is_anchored_end()); - assert!(t(r"^(foo|bar)").is_line_anchored_start()); - assert!(t(r"(foo|bar)$").is_line_anchored_end()); - - assert!(t(r"^+").is_anchored_start()); - assert!(t(r"$+").is_anchored_end()); - assert!(t(r"^+").is_line_anchored_start()); - assert!(t(r"$+").is_line_anchored_end()); - assert!(t(r"^++").is_anchored_start()); - assert!(t(r"$++").is_anchored_end()); - assert!(t(r"^++").is_line_anchored_start()); - assert!(t(r"$++").is_line_anchored_end()); - assert!(t(r"(^)+").is_anchored_start()); - assert!(t(r"($)+").is_anchored_end()); - assert!(t(r"(^)+").is_line_anchored_start()); - assert!(t(r"($)+").is_line_anchored_end()); - - assert!(t(r"$^").is_anchored_start()); - assert!(t(r"$^").is_anchored_start()); - assert!(t(r"$^").is_line_anchored_end()); - assert!(t(r"$^").is_line_anchored_end()); - assert!(t(r"$^|^$").is_anchored_start()); - assert!(t(r"$^|^$").is_anchored_end()); - assert!(t(r"$^|^$").is_line_anchored_start()); - assert!(t(r"$^|^$").is_line_anchored_end()); - - assert!(t(r"\b^").is_anchored_start()); - assert!(t(r"$\b").is_anchored_end()); - assert!(t(r"\b^").is_line_anchored_start()); - assert!(t(r"$\b").is_line_anchored_end()); - assert!(t(r"^(?m:^)").is_anchored_start()); - assert!(t(r"(?m:$)$").is_anchored_end()); - assert!(t(r"^(?m:^)").is_line_anchored_start()); - assert!(t(r"(?m:$)$").is_line_anchored_end()); - assert!(t(r"(?m:^)^").is_anchored_start()); - assert!(t(r"$(?m:$)").is_anchored_end()); - assert!(t(r"(?m:^)^").is_line_anchored_start()); - assert!(t(r"$(?m:$)").is_line_anchored_end()); + assert!(is_start(r"^")); + assert!(is_end(r"$")); - // Negative examples. 
- assert!(!t(r"(?m)^").is_anchored_start()); - assert!(!t(r"(?m)$").is_anchored_end()); - assert!(!t(r"(?m:^$)|$^").is_anchored_start()); - assert!(!t(r"(?m:^$)|$^").is_anchored_end()); - assert!(!t(r"$^|(?m:^$)").is_anchored_start()); - assert!(!t(r"$^|(?m:^$)").is_anchored_end()); - - assert!(!t(r"a^").is_anchored_start()); - assert!(!t(r"$a").is_anchored_start()); - assert!(!t(r"a^").is_line_anchored_start()); - assert!(!t(r"$a").is_line_anchored_start()); - - assert!(!t(r"a^").is_anchored_end()); - assert!(!t(r"$a").is_anchored_end()); - assert!(!t(r"a^").is_line_anchored_end()); - assert!(!t(r"$a").is_line_anchored_end()); - - assert!(!t(r"^foo|bar").is_anchored_start()); - assert!(!t(r"foo|bar$").is_anchored_end()); - assert!(!t(r"^foo|bar").is_line_anchored_start()); - assert!(!t(r"foo|bar$").is_line_anchored_end()); - - assert!(!t(r"^*").is_anchored_start()); - assert!(!t(r"$*").is_anchored_end()); - assert!(!t(r"^*").is_line_anchored_start()); - assert!(!t(r"$*").is_line_anchored_end()); - assert!(!t(r"^*+").is_anchored_start()); - assert!(!t(r"$*+").is_anchored_end()); - assert!(!t(r"^*+").is_line_anchored_start()); - assert!(!t(r"$*+").is_line_anchored_end()); - assert!(!t(r"^+*").is_anchored_start()); - assert!(!t(r"$+*").is_anchored_end()); - assert!(!t(r"^+*").is_line_anchored_start()); - assert!(!t(r"$+*").is_line_anchored_end()); - assert!(!t(r"(^)*").is_anchored_start()); - assert!(!t(r"($)*").is_anchored_end()); - assert!(!t(r"(^)*").is_line_anchored_start()); - assert!(!t(r"($)*").is_line_anchored_end()); - } + assert!(is_start(r"^^")); + assert!(props(r"$$").look_set_suffix().contains(Look::End)); - #[test] - fn analysis_is_line_anchored() { - assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start()); - assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end()); + assert!(is_start(r"^$")); + assert!(is_end(r"^$")); + + assert!(is_start(r"^foo")); + assert!(is_end(r"foo$")); - assert!(t(r"(?m)^foo|^bar").is_line_anchored_start()); - assert!(t(r"(?m)foo$|bar$").is_line_anchored_end()); + assert!(is_start(r"^foo|^bar")); + assert!(is_end(r"foo$|bar$")); - assert!(t(r"(?m)^").is_line_anchored_start()); - assert!(t(r"(?m)$").is_line_anchored_end()); + assert!(is_start(r"^(foo|bar)")); + assert!(is_end(r"(foo|bar)$")); - assert!(t(r"(?m:^$)|$^").is_line_anchored_start()); - assert!(t(r"(?m:^$)|$^").is_line_anchored_end()); + assert!(is_start(r"^+")); + assert!(is_end(r"$+")); + assert!(is_start(r"^++")); + assert!(is_end(r"$++")); + assert!(is_start(r"(^)+")); + assert!(is_end(r"($)+")); - assert!(t(r"$^|(?m:^$)").is_line_anchored_start()); - assert!(t(r"$^|(?m:^$)").is_line_anchored_end()); + assert!(is_start(r"$^")); + assert!(is_start(r"$^")); + assert!(is_start(r"$^|^$")); + assert!(is_end(r"$^|^$")); + + assert!(is_start(r"\b^")); + assert!(is_end(r"$\b")); + assert!(is_start(r"^(?m:^)")); + assert!(is_end(r"(?m:$)$")); + assert!(is_start(r"(?m:^)^")); + assert!(is_end(r"$(?m:$)")); + + // Negative examples. 
+ assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"(?m:^$)|$^")); + assert!(!is_end(r"(?m:^$)|$^")); + assert!(!is_start(r"$^|(?m:^$)")); + assert!(!is_end(r"$^|(?m:^$)")); + + assert!(!is_start(r"a^")); + assert!(!is_start(r"$a")); + + assert!(!is_end(r"a^")); + assert!(!is_end(r"$a")); + + assert!(!is_start(r"^foo|bar")); + assert!(!is_end(r"foo|bar$")); + + assert!(!is_start(r"^*")); + assert!(!is_end(r"$*")); + assert!(!is_start(r"^*+")); + assert!(!is_end(r"$*+")); + assert!(!is_start(r"^+*")); + assert!(!is_end(r"$+*")); + assert!(!is_start(r"(^)*")); + assert!(!is_end(r"($)*")); } #[test] fn analysis_is_any_anchored() { + let is_start = |p| props(p).look_set().contains(Look::Start); + let is_end = |p| props(p).look_set().contains(Look::End); + // Positive examples. - assert!(t(r"^").is_any_anchored_start()); - assert!(t(r"$").is_any_anchored_end()); - assert!(t(r"\A").is_any_anchored_start()); - assert!(t(r"\z").is_any_anchored_end()); + assert!(is_start(r"^")); + assert!(is_end(r"$")); + assert!(is_start(r"\A")); + assert!(is_end(r"\z")); // Negative examples. - assert!(!t(r"(?m)^").is_any_anchored_start()); - assert!(!t(r"(?m)$").is_any_anchored_end()); - assert!(!t(r"$").is_any_anchored_start()); - assert!(!t(r"^").is_any_anchored_end()); + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"$")); + assert!(!is_end(r"^")); } #[test] - fn analysis_is_match_empty() { + fn analysis_can_empty() { // Positive examples. - assert!(t(r"").is_match_empty()); - assert!(t(r"()").is_match_empty()); - assert!(t(r"()*").is_match_empty()); - assert!(t(r"()+").is_match_empty()); - assert!(t(r"()?").is_match_empty()); - assert!(t(r"a*").is_match_empty()); - assert!(t(r"a?").is_match_empty()); - assert!(t(r"a{0}").is_match_empty()); - assert!(t(r"a{0,}").is_match_empty()); - assert!(t(r"a{0,1}").is_match_empty()); - assert!(t(r"a{0,10}").is_match_empty()); + let assert_empty = + |p| assert_eq!(Some(0), props_bytes(p).minimum_len()); + assert_empty(r""); + assert_empty(r"()"); + assert_empty(r"()*"); + assert_empty(r"()+"); + assert_empty(r"()?"); + assert_empty(r"a*"); + assert_empty(r"a?"); + assert_empty(r"a{0}"); + assert_empty(r"a{0,}"); + assert_empty(r"a{0,1}"); + assert_empty(r"a{0,10}"); #[cfg(feature = "unicode-gencat")] - assert!(t(r"\pL*").is_match_empty()); - assert!(t(r"a*|b").is_match_empty()); - assert!(t(r"b|a*").is_match_empty()); - assert!(t(r"a|").is_match_empty()); - assert!(t(r"|a").is_match_empty()); - assert!(t(r"a||b").is_match_empty()); - assert!(t(r"a*a?(abcd)*").is_match_empty()); - assert!(t(r"^").is_match_empty()); - assert!(t(r"$").is_match_empty()); - assert!(t(r"(?m)^").is_match_empty()); - assert!(t(r"(?m)$").is_match_empty()); - assert!(t(r"\A").is_match_empty()); - assert!(t(r"\z").is_match_empty()); - assert!(t(r"\B").is_match_empty()); - assert!(t_bytes(r"(?-u)\B").is_match_empty()); - assert!(t(r"\b").is_match_empty()); - assert!(t(r"(?-u)\b").is_match_empty()); + assert_empty(r"\pL*"); + assert_empty(r"a*|b"); + assert_empty(r"b|a*"); + assert_empty(r"a|"); + assert_empty(r"|a"); + assert_empty(r"a||b"); + assert_empty(r"a*a?(abcd)*"); + assert_empty(r"^"); + assert_empty(r"$"); + assert_empty(r"(?m)^"); + assert_empty(r"(?m)$"); + assert_empty(r"\A"); + assert_empty(r"\z"); + assert_empty(r"\B"); + assert_empty(r"(?-u)\B"); + assert_empty(r"\b"); + assert_empty(r"(?-u)\b"); // Negative examples. 
- assert!(!t(r"a+").is_match_empty()); - assert!(!t(r"a{1}").is_match_empty()); - assert!(!t(r"a{1,}").is_match_empty()); - assert!(!t(r"a{1,2}").is_match_empty()); - assert!(!t(r"a{1,10}").is_match_empty()); - assert!(!t(r"b|a").is_match_empty()); - assert!(!t(r"a*a+(abcd)*").is_match_empty()); + let assert_non_empty = + |p| assert_ne!(Some(0), props_bytes(p).minimum_len()); + assert_non_empty(r"a+"); + assert_non_empty(r"a{1}"); + assert_non_empty(r"a{1,}"); + assert_non_empty(r"a{1,2}"); + assert_non_empty(r"a{1,10}"); + assert_non_empty(r"b|a"); + assert_non_empty(r"a*a+(abcd)*"); + #[cfg(feature = "unicode-gencat")] + assert_non_empty(r"\P{any}"); + assert_non_empty(r"[a--a]"); + assert_non_empty(r"[a&&b]"); } #[test] fn analysis_is_literal() { // Positive examples. - assert!(t(r"a").is_literal()); - assert!(t(r"ab").is_literal()); - assert!(t(r"abc").is_literal()); - assert!(t(r"(?m)abc").is_literal()); + assert!(props(r"a").is_literal()); + assert!(props(r"ab").is_literal()); + assert!(props(r"abc").is_literal()); + assert!(props(r"(?m)abc").is_literal()); // Negative examples. - assert!(!t(r"").is_literal()); - assert!(!t(r"^").is_literal()); - assert!(!t(r"a|b").is_literal()); - assert!(!t(r"(a)").is_literal()); - assert!(!t(r"a+").is_literal()); - assert!(!t(r"foo(a)").is_literal()); - assert!(!t(r"(a)foo").is_literal()); - assert!(!t(r"[a]").is_literal()); + assert!(!props(r"").is_literal()); + assert!(!props(r"^").is_literal()); + assert!(!props(r"a|b").is_literal()); + assert!(!props(r"(a)").is_literal()); + assert!(!props(r"a+").is_literal()); + assert!(!props(r"foo(a)").is_literal()); + assert!(!props(r"(a)foo").is_literal()); + assert!(!props(r"[a]").is_literal()); } #[test] fn analysis_is_alternation_literal() { // Positive examples. - assert!(t(r"a").is_alternation_literal()); - assert!(t(r"ab").is_alternation_literal()); - assert!(t(r"abc").is_alternation_literal()); - assert!(t(r"(?m)abc").is_alternation_literal()); - assert!(t(r"a|b").is_alternation_literal()); - assert!(t(r"a|b|c").is_alternation_literal()); - assert!(t(r"foo|bar").is_alternation_literal()); - assert!(t(r"foo|bar|baz").is_alternation_literal()); + assert!(props(r"a").is_alternation_literal()); + assert!(props(r"ab").is_alternation_literal()); + assert!(props(r"abc").is_alternation_literal()); + assert!(props(r"(?m)abc").is_alternation_literal()); + assert!(props(r"a|b").is_alternation_literal()); + assert!(props(r"a|b|c").is_alternation_literal()); + assert!(props(r"foo|bar").is_alternation_literal()); + assert!(props(r"foo|bar|baz").is_alternation_literal()); // Negative examples. 
- assert!(!t(r"").is_alternation_literal()); - assert!(!t(r"^").is_alternation_literal()); - assert!(!t(r"(a)").is_alternation_literal()); - assert!(!t(r"a+").is_alternation_literal()); - assert!(!t(r"foo(a)").is_alternation_literal()); - assert!(!t(r"(a)foo").is_alternation_literal()); - assert!(!t(r"[a]").is_alternation_literal()); - assert!(!t(r"[a]|b").is_alternation_literal()); - assert!(!t(r"a|[b]").is_alternation_literal()); - assert!(!t(r"(a)|b").is_alternation_literal()); - assert!(!t(r"a|(b)").is_alternation_literal()); + assert!(!props(r"").is_alternation_literal()); + assert!(!props(r"^").is_alternation_literal()); + assert!(!props(r"(a)").is_alternation_literal()); + assert!(!props(r"a+").is_alternation_literal()); + assert!(!props(r"foo(a)").is_alternation_literal()); + assert!(!props(r"(a)foo").is_alternation_literal()); + assert!(!props(r"[a]").is_alternation_literal()); + assert!(!props(r"[a]|b").is_alternation_literal()); + assert!(!props(r"a|[b]").is_alternation_literal()); + assert!(!props(r"(a)|b").is_alternation_literal()); + assert!(!props(r"a|(b)").is_alternation_literal()); } } diff --git a/src/compile.rs b/src/compile.rs index 1907bc0ae..da9f1bd06 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -4,7 +4,7 @@ use std::iter; use std::result; use std::sync::Arc; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::hir::{self, Hir, Look}; use regex_syntax::is_word_byte; use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; @@ -142,8 +142,10 @@ impl Compiler { // Other matching engines handle this by baking the logic into the // matching engine itself. let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; - self.compiled.is_anchored_start = expr.is_anchored_start(); - self.compiled.is_anchored_end = expr.is_anchored_end(); + self.compiled.is_anchored_start = + expr.properties().look_set_prefix().contains(Look::Start); + self.compiled.is_anchored_end = + expr.properties().look_set_suffix().contains(Look::End); if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; self.compiled.start = dotstar_patch.entry; @@ -168,10 +170,12 @@ impl Compiler { ) -> result::Result { debug_assert!(exprs.len() > 1); - self.compiled.is_anchored_start = - exprs.iter().all(|e| e.is_anchored_start()); - self.compiled.is_anchored_end = - exprs.iter().all(|e| e.is_anchored_end()); + self.compiled.is_anchored_start = exprs + .iter() + .all(|e| e.properties().look_set_prefix().contains(Look::Start)); + self.compiled.is_anchored_end = exprs + .iter() + .all(|e| e.properties().look_set_suffix().contains(Look::End)); let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; diff --git a/src/exec.rs b/src/exec.rs index 50685bfb5..eafd4e63b 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -6,7 +6,7 @@ use std::sync::Arc; #[cfg(feature = "perf-literal")] use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; use regex_syntax::hir::literal::Literals; -use regex_syntax::hir::Hir; +use regex_syntax::hir::{Hir, Look}; use regex_syntax::ParserBuilder; use crate::backtrack; @@ -248,14 +248,19 @@ impl ExecBuilder { .build(); let expr = parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; - bytes = bytes || !expr.is_always_utf8(); + let props = expr.properties(); + bytes = bytes || !props.is_utf8(); if cfg!(feature = "perf-literal") { - if !expr.is_anchored_start() && expr.is_any_anchored_start() { + if !props.look_set_prefix().contains(Look::Start) + && 
props.look_set().contains(Look::Start) + { // Partial anchors unfortunately make it hard to use // prefixes, so disable them. prefixes = None; - } else if is_set && expr.is_anchored_start() { + } else if is_set + && props.look_set_prefix().contains(Look::Start) + { // Regex sets with anchors do not go well with literal // optimizations. prefixes = None; @@ -268,11 +273,14 @@ impl ExecBuilder { } }); - if !expr.is_anchored_end() && expr.is_any_anchored_end() { + if !props.look_set_suffix().contains(Look::End) + && props.look_set().contains(Look::End) + { // Partial anchors unfortunately make it hard to use // suffixes, so disable them. suffixes = None; - } else if is_set && expr.is_anchored_end() { + } else if is_set && props.look_set_suffix().contains(Look::End) + { // Regex sets with anchors do not go well with literal // optimizations. suffixes = None; @@ -1557,7 +1565,7 @@ fn alternation_literals(expr: &Hir) -> Option>> { // optimization pipeline, because this is a terribly inflexible way to go // about things. - if !expr.is_alternation_literal() { + if !expr.properties().is_alternation_literal() { return None; } let alts = match *expr.kind() { From 2c119ead5bec8fafae05f7818e1e6d3631962fba Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 15 Sep 2022 10:37:44 -0400 Subject: [PATCH 26/79] syntax: rejigger Hir::{dot,any} Instead of using a boolean parameter, we just split them into dot_char, dot_byte, any_char, any_byte. Another path would be to use an enum, but this appeals to me a little more. --- regex-syntax/src/hir/mod.rs | 76 +++++++++++++++++-------------- regex-syntax/src/hir/translate.rs | 12 ++++- src/compile.rs | 20 ++++---- 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ea41028d5..117cd1570 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -310,44 +310,52 @@ impl Hir { /// Build an HIR expression for `.`. /// - /// A `.` expression matches any character except for `\n`. To build an - /// expression that matches any character, including `\n`, use the `any` - /// method. - /// - /// If `bytes` is `true`, then this assumes characters are limited to a - /// single byte. - pub fn dot(bytes: bool) -> Hir { - if bytes { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\x09')); - cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); - Hir::class(Class::Bytes(cls)) - } else { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\x09')); - cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) - } + /// A `.` expression matches any character except for a newline terminator. + /// To build an expression that matches any character, including newline + /// terminators, use the `any_char` method. + pub fn dot_char() -> Hir { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) } - /// Build an HIR expression for `(?s).`. + /// Build an HIR expression for `(?-u:.)`. /// - /// A `(?s).` expression matches any character, including `\n`. To build an - /// expression that matches any character except for `\n`, then use the - /// `dot` method. + /// A non-Unicode `.` expression matches any byte except for a newline + /// terminator. To build an expression that matches any byte, including + /// newline terminators, use the `any_byte` method. 
+ pub fn dot_byte() -> Hir { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + + /// Build an HIR expression for `(?s:.)`. + /// + /// A `(?s:.)` expression matches any character, including newline + /// terminators. To build an expression that matches any character except + /// for newline terminators, use the `dot_char` method. + /// + /// Note that `(?s:.)` is equivalent to `\p{any}`. + pub fn any_char() -> Hir { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + + /// Build an HIR expression for `(?s-u:.)`. + /// + /// A `(?s-u:.)` expression matches any byte, including newline terminators. + /// To build an expression that matches any byte except for newline + /// terminators, use the `dot_byte` method. + /// + /// Note that `(?s-u:.)` is equivalent to `(?-u:[\x00-\xFF])`. + pub fn any_byte() -> Hir { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 831548c2c..5c8c03768 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -861,9 +861,17 @@ impl<'t, 'p> TranslatorI<'t, 'p> { return Err(self.error(span, ErrorKind::InvalidUtf8)); } Ok(if self.flags().dot_matches_new_line() { - Hir::any(!unicode) + if unicode { + Hir::any_char() + } else { + Hir::any_byte() + } } else { - Hir::dot(!unicode) + if unicode { + Hir::dot_char() + } else { + Hir::dot_byte() + } }) } diff --git a/src/compile.rs b/src/compile.rs index da9f1bd06..e3a859b39 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -424,23 +424,19 @@ impl Compiler { } fn c_dotstar(&mut self) -> Result { - Ok(if !self.compiled.only_utf8() { - self.c(&Hir::repetition(hir::Repetition { - min: 0, - max: None, - greedy: false, - hir: Box::new(Hir::any(true)), - }))? - .unwrap() + let hir = if self.compiled.only_utf8() { + Hir::any_char() } else { - self.c(&Hir::repetition(hir::Repetition { + Hir::any_byte() + }; + Ok(self + .c(&Hir::repetition(hir::Repetition { min: 0, max: None, greedy: false, - hir: Box::new(Hir::any(false)), + hir: Box::new(hir), }))? - .unwrap() - }) + .unwrap()) } fn c_char(&mut self, c: char) -> ResultOrEmpty { From 3a8313e5db70fbab84c211bdef2aa792f7df92d1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 15 Sep 2022 11:10:46 -0400 Subject: [PATCH 27/79] syntax: remove non-capturing groups from HIR It turns out they are completely superfluous in the HIR, so we can drop them completely. We only need to explicitly represent capturing groups.
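As a rough illustration of what this means for users of the HIR (this snippet is not part of the patch and simply assumes the regex-syntax Parser API as it stands at this point in the series): a pattern written with a non-capturing group now translates to exactly the same HIR as the pattern without it.

    // Sketch only: `(?:a)(b)` and `a(b)` should produce structurally
    // equal HIRs once non-capturing groups are erased during translation.
    use regex_syntax::Parser;

    fn main() {
        let with_group = Parser::new().parse(r"(?:a)(b)").unwrap();
        let without_group = Parser::new().parse(r"a(b)").unwrap();
        // Hir derives Eq, so the two translations can be compared directly.
        assert_eq!(with_group, without_group);
    }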
--- regex-syntax/src/hir/mod.rs | 39 +++++--------- regex-syntax/src/hir/print.rs | 19 +++---- regex-syntax/src/hir/translate.rs | 85 ++++++++++++------------------- src/compile.rs | 22 ++++---- 4 files changed, 63 insertions(+), 102 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 117cd1570..186acde49 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1118,39 +1118,25 @@ impl Look { } } -/// The high-level intermediate representation for a group. +/// The high-level intermediate representation for a capturing group. /// -/// This represents one of three possible group types: +/// A capturing group always has an index and a child expression. It may +/// also have a name associated with it (e.g., `(?P\w)`), but it's not +/// necessary. /// -/// 1. A non-capturing group (e.g., `(?:expr)`). -/// 2. A capturing group (e.g., `(expr)`). -/// 3. A named capturing group (e.g., `(?Pexpr)`). +/// Note that there is no explicit representation of a non-capturing group +/// in a `Hir`. Instead, non-capturing grouping is handled automatically by +/// the recursive structure of the `Hir` itself. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Group { - /// The kind of this group. If it is a capturing group, then the kind - /// contains the capture group index (and the name, if it is a named - /// group). - pub kind: GroupKind, + /// The capture index of the group. + pub index: u32, + /// The name of the group, if it exists. + pub name: Option>, /// The expression inside the capturing group, which may be empty. pub hir: Box, } -/// The kind of group. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum GroupKind { - /// A non-capturing group. - NonCapturing, - /// A capturing group with an optional name. - /// - /// The value is the capture index of the group. - Capture { - /// The capture index of the group. - index: u32, - /// The name of the group, if it exists. - name: Option, - }, -} - /// The high-level intermediate representation of a repetition operator. /// /// A repetition operator permits the repetition of an arbitrary @@ -2452,7 +2438,8 @@ mod tests { let mut expr = Hir::empty(); for _ in 0..100 { expr = Hir::group(Group { - kind: GroupKind::NonCapturing, + index: 1, + name: None, hir: Box::new(expr), }); expr = Hir::repetition(Repetition { diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 7f861151f..577e06bda 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -176,17 +176,12 @@ impl Visitor for Writer { self.wtr.write_str(r"\B")?; } }, - HirKind::Group(ref x) => match x.kind { - hir::GroupKind::Capture { ref name, .. } => { - self.wtr.write_str("(")?; - if let Some(ref name) = *name { - write!(self.wtr, "?P<{}>", name)?; - } - } - hir::GroupKind::NonCapturing => { - self.wtr.write_str("(?:")?; + HirKind::Group(hir::Group { ref name, .. }) => { + self.wtr.write_str("(")?; + if let Some(ref name) = *name { + write!(self.wtr, "?P<{}>", name)?; } - }, + } // Why do this? Wrapping concats and alts in non-capturing groups // is not *always* necessary, but is sometimes necessary. 
For // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' @@ -415,11 +410,11 @@ mod tests { fn print_group() { roundtrip("()", "()"); roundtrip("(?P)", "(?P)"); - roundtrip("(?:)", "(?:)"); + roundtrip("(?:)", ""); roundtrip("(a)", "(a)"); roundtrip("(?Pa)", "(?Pa)"); - roundtrip("(?:a)", "(?:a)"); + roundtrip("(?:a)", "a"); roundtrip("((((a))))", "((((a))))"); } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 5c8c03768..a6f988e47 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -915,17 +915,16 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { - let kind = match group.kind { - ast::GroupKind::CaptureIndex(index) => { - hir::GroupKind::Capture { index, name: None } - } - ast::GroupKind::CaptureName(ref cap) => hir::GroupKind::Capture { - index: cap.index, - name: Some(cap.name.clone()), - }, - ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, + let (index, name) = match group.kind { + ast::GroupKind::CaptureIndex(index) => (index, None), + ast::GroupKind::CaptureName(ref cap) => { + (cap.index, Some(cap.name.clone().into_boxed_str())) + } + // The HIR doesn't need to use non-capturing groups, since the way + // in which the data type is defined handles this automatically. + ast::GroupKind::NonCapturing(_) => return expr, }; - Hir::group(hir::Group { kind, hir: Box::new(expr) }) + Hir::group(hir::Group { index, name, hir: Box::new(expr) }) } fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { @@ -1349,26 +1348,14 @@ mod tests { Hir::literal(s) } - fn hir_group(i: u32, expr: Hir) -> Hir { - Hir::group(hir::Group { - kind: hir::GroupKind::Capture { index: i, name: None }, - hir: Box::new(expr), - }) + fn hir_group(index: u32, expr: Hir) -> Hir { + Hir::group(hir::Group { index, name: None, hir: Box::new(expr) }) } - fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { + fn hir_group_name(index: u32, name: &str, expr: Hir) -> Hir { Hir::group(hir::Group { - kind: hir::GroupKind::Capture { - index: i, - name: Some(name.to_string()), - }, - hir: Box::new(expr), - }) - } - - fn hir_group_nocap(expr: Hir) -> Hir { - Hir::group(hir::Group { - kind: hir::GroupKind::NonCapturing, + index, + name: Some(name.into()), hir: Box::new(expr), }) } @@ -1519,7 +1506,7 @@ mod tests { assert_eq!(t(""), Hir::empty()); assert_eq!(t("(?i)"), Hir::empty()); assert_eq!(t("()"), hir_group(1, Hir::empty())); - assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); + assert_eq!(t("(?:)"), Hir::empty()); assert_eq!(t("(?P)"), hir_group_name(1, "wat", Hir::empty())); assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); assert_eq!( @@ -1592,10 +1579,7 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); #[cfg(feature = "unicode-case")] - assert_eq!( - t("(?i:a)"), - hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],)) - ); + assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')])); #[cfg(feature = "unicode-case")] assert_eq!( t("a(?i)a(?-i)a"), @@ -1757,20 +1741,17 @@ mod tests { hir_group_name(2, "bar", hir_lit("b")), ]) ); - assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); - assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a"))); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("(?:a)"), hir_lit("a")); assert_eq!( t("(?:a)(b)"), - hir_cat(vec![ - hir_group_nocap(hir_lit("a")), - hir_group(1, hir_lit("b")), - ]) + hir_cat(vec![hir_lit("a"), hir_group(1, 
hir_lit("b")),]) ); assert_eq!( t("(a)(?:b)(c)"), hir_cat(vec![ hir_group(1, hir_lit("a")), - hir_group_nocap(hir_lit("b")), + hir_lit("b"), hir_group(2, hir_lit("c")), ]) ); @@ -1793,22 +1774,21 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)a"), - hir_cat(vec![ - hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])), - hir_lit("a"), - ]) + hir_cat( + vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),] + ) ); assert_eq!( t("(?i-u:a)β"), hir_cat(vec![ - hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("β"), ]) ); assert_eq!( t("(?:(?i-u)a)b"), hir_cat(vec![ - hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("b"), ]) ); @@ -1822,10 +1802,9 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?-i:a)a"), - hir_cat(vec![ - hir_group_nocap(hir_lit("a")), - hir_uclass(&[('A', 'A'), ('a', 'a')]), - ]) + hir_cat( + vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),] + ) ); #[cfg(feature = "unicode-case")] assert_eq!( @@ -1858,10 +1837,10 @@ mod tests { assert_eq!( t("(?:a(?i)a)a"), hir_cat(vec![ - hir_group_nocap(hir_cat(vec![ + hir_cat(vec![ hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]), - ])), + ]), hir_lit("a"), ]) ); @@ -1869,10 +1848,10 @@ mod tests { assert_eq!( t("(?i)(?:a(?-i)a)a"), hir_cat(vec![ - hir_group_nocap(hir_cat(vec![ + hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"), - ])), + ]), hir_uclass(&[('A', 'A'), ('a', 'a')]), ]) ); diff --git a/src/compile.rs b/src/compile.rs index e3a859b39..306de186c 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -368,19 +368,19 @@ impl Compiler { self.c_empty_look(prog::EmptyLook::NotWordBoundary) } }, - Group(ref g) => match g.kind { - hir::GroupKind::NonCapturing => self.c(&g.hir), - hir::GroupKind::Capture { index, ref name } => { - if index as usize >= self.compiled.captures.len() { - self.compiled.captures.push(name.clone()); - if let Some(ref name) = *name { - self.capture_name_idx - .insert(name.clone(), index as usize); - } + Group(hir::Group { index, ref name, ref hir }) => { + if index as usize >= self.compiled.captures.len() { + let name = match *name { + None => None, + Some(ref boxed_str) => Some(boxed_str.to_string()), + }; + self.compiled.captures.push(name.clone()); + if let Some(name) = name { + self.capture_name_idx.insert(name, index as usize); } - self.c_capture(2 * index as usize, &g.hir) } - }, + self.c_capture(2 * index as usize, hir) + } Concat(ref es) => { if self.compiled.is_reverse { self.c_concat(es.iter().rev()) From 05cf8619a9c0d64bb08abc6128a397d28f770d27 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 15 Sep 2022 13:42:54 -0400 Subject: [PATCH 28/79] syntax: small HIR simplifications This makes it so 'a{1}' is rewritten as 'a' and '[a]' is rewritten as 'a'. A lot of the tests expected '[a]' to get preserved as a class in the HIR, so this required a bit of surgery. --- regex-syntax/src/hir/mod.rs | 63 +++++++++++++++++++++++--- regex-syntax/src/hir/print.rs | 22 ++++++--- regex-syntax/src/hir/translate.rs | 75 +++++++++++++++++++++---------- 3 files changed, 124 insertions(+), 36 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 186acde49..fceb8e13d 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -122,16 +122,16 @@ impl core::fmt::Display for ErrorKind { /// A high-level intermediate representation (HIR) for a regular expression. 
/// -/// The HIR of a regular expression represents an intermediate step between its -/// abstract syntax (a structured description of the concrete syntax) and -/// compiled byte codes. The purpose of HIR is to make regular expressions +/// The HIR of a regular expression represents an intermediate step between +/// its abstract syntax (a structured description of the concrete syntax) and +/// an actual regex matcher. The purpose of HIR is to make regular expressions /// easier to analyze. In particular, the AST is much more complex than the /// HIR. For example, while an AST supports arbitrarily nested character /// classes, the HIR will flatten all nested classes into a single set. The HIR /// will also "compile away" every flag present in the concrete syntax. For /// example, users of HIR expressions never need to worry about case folding; -/// it is handled automatically by the translator (e.g., by translating `(?i)A` -/// to `[aA]`). +/// it is handled automatically by the translator (e.g., by translating +/// `(?i:A)` to `[aA]`). /// /// If the HIR was produced by a translator that disallows invalid UTF-8, then /// the HIR is guaranteed to match UTF-8 exclusively. @@ -150,11 +150,13 @@ impl core::fmt::Display for ErrorKind { /// 2. Every HIR expression contains attributes that are defined inductively, /// and can be computed cheaply during the construction process. For /// example, one such attribute is whether the expression must match at the -/// beginning of the text. +/// beginning of the haystack. /// /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular /// expression pattern string, and uses constant stack space and heap space -/// proportional to the size of the `Hir`. +/// proportional to the size of the `Hir`. The regex it prints is guaranteed to +/// be _semantically_ equivalent to the original concrete syntax, but it may +/// look very different. (And potentially not practically readable by a human.) #[derive(Clone, Debug, Eq, PartialEq)] pub struct Hir { /// The underlying HIR kind. @@ -252,6 +254,9 @@ impl Hir { /// Creates a class HIR expression. pub fn class(class: Class) -> Hir { + if let Some(bytes) = class.literal() { + return Hir::literal(bytes); + } let props = Properties::class(&class); Hir { kind: HirKind::Class(class), props } } @@ -267,8 +272,12 @@ impl Hir { // The regex 'a{0}' is always equivalent to the empty regex. This is // true even when 'a' is an expression that never matches anything // (like '\P{any}'). + // + // Additionally, the regex 'a{1}' is always equivalent to 'a'. if rep.min == 0 && rep.max == Some(0) { return Hir::empty(); + } else if rep.min == 1 && rep.max == Some(1) { + return *rep.hir; } let props = Properties::repetition(&rep); Hir { kind: HirKind::Repetition(rep), props } @@ -541,6 +550,18 @@ impl Class { Class::Bytes(ref x) => x.maximum_len(), } } + + /// If this class consists of exactly one element (whether a codepoint or a + /// byte), then return it as a literal byte string. + /// + /// If this class is empty or contains more than one element, then `None` + /// is returned. + pub fn literal(&self) -> Option> { + match *self { + Class::Unicode(ref x) => x.literal(), + Class::Bytes(ref x) => x.literal(), + } + } } /// A set of characters represented by Unicode scalar values. @@ -680,6 +701,20 @@ impl ClassUnicode { // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). 
Some(last.end.len_utf8()) } + + /// If this class consists of exactly one codepoint, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one codepoint, then `None` + /// is returned. + pub fn literal(&self) -> Option> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes()) + } else { + None + } + } } /// An iterator over all ranges in a Unicode character class. @@ -932,6 +967,20 @@ impl ClassBytes { Some(1) } } + + /// If this class consists of exactly one byte, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one byte, then `None` + /// is returned. + pub fn literal(&self) -> Option> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(vec![rs[0].start]) + } else { + None + } + } } /// An iterator over all ranges in a byte character class. diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 577e06bda..357d4e770 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -129,6 +129,11 @@ impl Visitor for Writer { for range in cls.iter() { if range.start() == range.end() { self.write_literal_char(range.start())?; + } else if u32::from(range.start()) + 1 + == u32::from(range.end()) + { + self.write_literal_char(range.start())?; + self.write_literal_char(range.end())?; } else { self.write_literal_char(range.start())?; self.wtr.write_str("-")?; @@ -142,6 +147,9 @@ impl Visitor for Writer { for range in cls.iter() { if range.start() == range.end() { self.write_literal_class_byte(range.start())?; + } else if range.start() + 1 == range.end() { + self.write_literal_class_byte(range.start())?; + self.write_literal_class_byte(range.end())?; } else { self.write_literal_class_byte(range.start())?; self.wtr.write_str("-")?; @@ -327,26 +335,28 @@ mod tests { #[test] fn print_class() { - roundtrip(r"[a]", r"[a]"); + roundtrip(r"[a]", r"a"); + roundtrip(r"[ab]", r"[ab]"); roundtrip(r"[a-z]", r"[a-z]"); roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); - roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]"); - roundtrip(r"[-]", r"[\-]"); + roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}"); + roundtrip(r"[-]", r"\-"); roundtrip(r"[☃-⛄]", r"[☃-⛄]"); - roundtrip(r"(?-u)[a]", r"(?-u:[a])"); + roundtrip(r"(?-u)[a]", r"a"); + roundtrip(r"(?-u)[ab]", r"(?-u:[ab])"); roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); // The following test that the printer escapes meta characters // in character classes. - roundtrip(r"[\[]", r"[\[]"); + roundtrip(r"[\[]", r"\["); roundtrip(r"[Z-_]", r"[Z-_]"); roundtrip(r"[Z-_--Z]", r"[\[-_]"); // The following test that the printer escapes meta characters // in byte oriented character classes. 
- roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])"); + roundtrip_bytes(r"(?-u)[\[]", r"\["); roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index a6f988e47..eca69751b 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1428,19 +1428,11 @@ mod tests { } fn hir_uclass(ranges: &[(char, char)]) -> Hir { - let ranges: Vec = ranges - .iter() - .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) - .collect(); - Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges))) + Hir::class(uclass(ranges)) } fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { - let ranges: Vec = ranges - .iter() - .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) - .collect(); - Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + Hir::class(bclass(ranges)) } fn hir_case_fold(expr: Hir) -> Hir { @@ -1463,6 +1455,33 @@ mod tests { } } + fn uclass(ranges: &[(char, char)]) -> hir::Class { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::Class::Unicode(hir::ClassUnicode::new(ranges)) + } + + fn bclass(ranges: &[(u8, u8)]) -> hir::Class { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + hir::Class::Bytes(hir::ClassBytes::new(ranges)) + } + + #[cfg(feature = "unicode-case")] + fn class_case_fold(mut cls: hir::Class) -> Hir { + cls.case_fold_simple(); + Hir::class(cls) + } + + fn class_negate(mut cls: hir::Class) -> Hir { + cls.negate(); + Hir::class(cls) + } + #[allow(dead_code)] fn hir_union(expr1: Hir, expr2: Hir) -> Hir { use crate::hir::Class::{Bytes, Unicode}; @@ -2522,8 +2541,9 @@ mod tests { #[test] fn class_bracketed() { - assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); - assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t("[a]"), hir_lit("a")); + assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')])); + assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')]))); assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); @@ -2586,11 +2606,11 @@ mod tests { ); assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); - assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')]))); - assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')]))); + assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')]))); assert_eq!( t_bytes("(?-u)[^a]"), - hir_negate(hir_bclass(&[(b'a', b'a')])) + class_negate(bclass(&[(b'a', b'a')])) ); #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!( @@ -2778,8 +2798,8 @@ mod tests { #[test] fn class_bracketed_nested() { - assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); - assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')]))); assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[]))); assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); @@ -2788,12 +2808,12 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a[^c]]"), - hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) ); #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a-b[^c]]"), - 
hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) ); #[cfg(feature = "unicode-case")] @@ -3239,6 +3259,10 @@ mod tests { assert!(props(r"ab").is_literal()); assert!(props(r"abc").is_literal()); assert!(props(r"(?m)abc").is_literal()); + assert!(props(r"(?:a)").is_literal()); + assert!(props(r"foo(?:a)").is_literal()); + assert!(props(r"(?:a)foo").is_literal()); + assert!(props(r"[a]").is_literal()); // Negative examples. assert!(!props(r"").is_literal()); @@ -3248,7 +3272,7 @@ mod tests { assert!(!props(r"a+").is_literal()); assert!(!props(r"foo(a)").is_literal()); assert!(!props(r"(a)foo").is_literal()); - assert!(!props(r"[a]").is_literal()); + assert!(!props(r"[ab]").is_literal()); } #[test] @@ -3262,6 +3286,11 @@ mod tests { assert!(props(r"a|b|c").is_alternation_literal()); assert!(props(r"foo|bar").is_alternation_literal()); assert!(props(r"foo|bar|baz").is_alternation_literal()); + assert!(props(r"[a]").is_alternation_literal()); + assert!(props(r"[a]|b").is_alternation_literal()); + assert!(props(r"a|[b]").is_alternation_literal()); + assert!(props(r"(?:a)|b").is_alternation_literal()); + assert!(props(r"a|(?:b)").is_alternation_literal()); // Negative examples. assert!(!props(r"").is_alternation_literal()); @@ -3270,9 +3299,9 @@ mod tests { assert!(!props(r"a+").is_alternation_literal()); assert!(!props(r"foo(a)").is_alternation_literal()); assert!(!props(r"(a)foo").is_alternation_literal()); - assert!(!props(r"[a]").is_alternation_literal()); - assert!(!props(r"[a]|b").is_alternation_literal()); - assert!(!props(r"a|[b]").is_alternation_literal()); + assert!(!props(r"[ab]").is_alternation_literal()); + assert!(!props(r"[ab]|b").is_alternation_literal()); + assert!(!props(r"a|[ab]").is_alternation_literal()); assert!(!props(r"(a)|b").is_alternation_literal()); assert!(!props(r"a|(b)").is_alternation_literal()); } From 22a361221ce1fae6a0fee495219552ffdba47d9f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 15 Sep 2022 14:24:58 -0400 Subject: [PATCH 29/79] syntax: add 'Hir::dot' method to replace 'Hir::{any,dot}_{char,byte}' In a previous commit, I replaced 'Hir::{any,dot}' a total of four methods. Essentially, I expanded out the boolean parameter to 'Hir::{any,dot}'. I later realized that we'll probably need a "dot except for CR and LF" too. And having four methods all for the same 'dot' construct seemed a bit much. So I've turned it into one method with a new 'Dot' enum. Eventually, that enum should grow two more variants: 'AnyCharExceptCRLF' and 'AnyByteExceptCRLF'. That sort of expansion would have been pretty annoying to do (because of naming) in the prior scheme. --- regex-syntax/src/hir/mod.rs | 108 +++++++++++++++++------------- regex-syntax/src/hir/translate.rs | 33 ++++----- src/compile.rs | 4 +- 3 files changed, 80 insertions(+), 65 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index fceb8e13d..3c21d487e 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -317,54 +317,41 @@ impl Hir { } } - /// Build an HIR expression for `.`. - /// - /// A `.` expression matches any character except for a newline terminator. - /// To build an expression that matches any character, including newline - /// terminators, use the `any_char` method. 
- pub fn dot_char() -> Hir { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\x09')); - cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) - } - - /// Build an HIR expression for `(?-u:.)`. - /// - /// A non-Unicode `.` expression matches any byte except for a newline - /// terminator. To build an expression that matches any byte, including - /// newline terminators, use the `any_byte` method. - pub fn dot_byte() -> Hir { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\x09')); - cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); - Hir::class(Class::Bytes(cls)) - } - - /// Build an HIR expression for `(?s:.)`. - /// - /// A `(?s:.)` expression matches any character, including newline - /// terminators. To build an expression that matches any character except - /// for newline terminators, use the `dot_char` method. - /// - /// Note that `(?s:)` is equivalent to `\p{any}`. - pub fn any_char() -> Hir { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) - } - - /// Build an HIR expression for `(?s-u:.)`. - /// - /// A `(?s-u:.)` expression matches any byte, including newline terminators. - /// To build an expression that matches any byte except for newline - /// terminators, use the `dot_byte` method. - /// - /// Note that `(?s-u:.)` is equivalent to `(?-u:[\x00-\xFF])`. - pub fn any_byte() -> Hir { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\xFF')); - Hir::class(Class::Bytes(cls)) + /// Returns an HIR expression for `.`. + /// + /// * [`Dot::AnyChar`] maps to `(?su:.)`. + /// * [`Dot::AnyByte`] maps to `(?s-u:.)`. + /// * [`Dot::AnyCharExceptNL`] maps to `(?u-s:.)`. + /// * [`Dot::AnyByteExceptNL`] maps to `(?-su:.)`. + /// + /// Note that this is a convenience routine for constructing the correct + /// character class based on the value of `Dot`. There is no explicit "dot" + /// HIR value. It is just an abbreviation for a common character class. + pub fn dot(dot: Dot) -> Hir { + match dot { + Dot::AnyChar => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByte => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyCharExceptNL => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByteExceptNL => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + } } } @@ -1233,6 +1220,31 @@ impl Repetition { } } +/// A type describing the different flavors of `.`. +/// +/// This type is meant to be used with [`Hir::dot`], which is a convenience +/// routine for building HIR values derived from the `.` regex. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Dot { + /// Matches the UTF-8 encoding of any Unicode scalar value. + /// + /// This is equivalent to `(?su:.)` and also `\p{any}`. + AnyChar, + /// Matches any byte value. + /// + /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. + AnyByte, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. 
+ /// + /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. + AnyCharExceptNL, + /// Matches any byte value except for `\n`. + /// + /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. + AnyByteExceptNL, +} + /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack /// space but heap space proportional to the depth of the total `Hir`. impl Drop for Hir { diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index eca69751b..85596b436 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -856,23 +856,10 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } fn hir_dot(&self, span: Span) -> Result { - let unicode = self.flags().unicode(); - if !unicode && !self.trans().allow_invalid_utf8 { + if !self.flags().unicode() && !self.trans().allow_invalid_utf8 { return Err(self.error(span, ErrorKind::InvalidUtf8)); } - Ok(if self.flags().dot_matches_new_line() { - if unicode { - Hir::any_char() - } else { - Hir::any_byte() - } - } else { - if unicode { - Hir::dot_char() - } else { - Hir::dot_byte() - } - }) + Ok(Hir::dot(self.flags().dot())) } fn hir_assertion(&self, asst: &ast::Assertion) -> Result { @@ -1210,6 +1197,22 @@ impl Flags { } } + fn dot(&self) -> hir::Dot { + if self.dot_matches_new_line() { + if self.unicode() { + hir::Dot::AnyChar + } else { + hir::Dot::AnyByte + } + } else { + if self.unicode() { + hir::Dot::AnyCharExceptNL + } else { + hir::Dot::AnyByteExceptNL + } + } + } + fn case_insensitive(&self) -> bool { self.case_insensitive.unwrap_or(false) } diff --git a/src/compile.rs b/src/compile.rs index 306de186c..692533340 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -425,9 +425,9 @@ impl Compiler { fn c_dotstar(&mut self) -> Result { let hir = if self.compiled.only_utf8() { - Hir::any_char() + Hir::dot(hir::Dot::AnyChar) } else { - Hir::any_byte() + Hir::dot(hir::Dot::AnyByte) }; Ok(self .c(&Hir::repetition(hir::Repetition { From a5ee3ccd6bc20085c960c4a7b2987289e058a0f1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 15 Sep 2022 14:49:06 -0400 Subject: [PATCH 30/79] syntax: tweak concat and alternation construction We simplify construction a bit to prepare for bigger simplifications. We also fix a bug in 'Hir::alternation' where it would incorrectly return 'Hir::empty()' when given an empty alternation. That's correct for an empty concatenation, but an alternation with no branches is equivalent to an expression that never matches anything. To fix that, we create a new 'Hir::fail' that canonicalizes the HIR value used to indicate "impossible to match." Thankfully this bug was unlikely to be observed unless one was constructing HIR values manually. Namely, it is impossible to spell "empty alternation" in the concrete syntax of a regex. --- regex-syntax/src/hir/mod.rs | 56 ++++++++++++++++++++++++++----- regex-syntax/src/hir/translate.rs | 2 +- 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 3c21d487e..42f59ba6b 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -236,6 +236,29 @@ impl Hir { Hir { kind: HirKind::Empty, props } } + /// Returns an HIR expression that can never match anything. That is, the + /// set of strings in the language described by the HIR returned is `0`. + /// + /// This is distinct from [`Hir::empty`] in that the empty string matches + /// the HIR returned by `Hir::empty`. 
That is, the set of strings in the + /// language describe described by `Hir::empty` is non-empty. + /// + /// Note that currently, the HIR returned uses an empty character class to + /// indicate that nothing can match. An equivalent expression that cannot + /// match is an empty alternation, but all such "fail" expressions are + /// normalized (via smart constructors) to empty character classes. This is + /// because empty character classes can be spelled in the concrete syntax + /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but + /// empty alternations cannot. + pub fn fail() -> Hir { + let class = Class::Bytes(ClassBytes::empty()); + let props = Properties::class(&class); + // We can't just call Hir::class here because it defers to Hir::fail + // in order to canonicalize the Hir value used to represent "cannot + // match." + Hir { kind: HirKind::Class(class), props } + } + /// Creates a literal HIR expression. /// /// If the given literal has a `Byte` variant with an ASCII byte, then this @@ -254,7 +277,9 @@ impl Hir { /// Creates a class HIR expression. pub fn class(class: Class) -> Hir { - if let Some(bytes) = class.literal() { + if class.is_empty() { + return Hir::fail(); + } else if let Some(bytes) = class.literal() { return Hir::literal(bytes); } let props = Properties::class(&class); @@ -293,20 +318,24 @@ impl Hir { /// /// This flattens the concatenation as appropriate. pub fn concat(mut exprs: Vec) -> Hir { - match exprs.len() { - 0 => Hir::empty(), - 1 => exprs.pop().unwrap(), - _ => { - let props = Properties::concat(&exprs); - Hir { kind: HirKind::Concat(exprs), props } - } + if exprs.is_empty() { + return Hir::empty(); + } else if exprs.len() == 1 { + return exprs.pop().unwrap(); } + let props = Properties::concat(&exprs); + Hir { kind: HirKind::Concat(exprs), props } } /// Returns the alternation of the given expressions. /// /// This flattens the alternation as appropriate. pub fn alternation(mut exprs: Vec) -> Hir { + if exprs.is_empty() { + return Hir::fail(); + } else if exprs.len() == 1 { + return exprs.pop().unwrap(); + } match exprs.len() { 0 => Hir::empty(), 1 => exprs.pop().unwrap(), @@ -538,6 +567,17 @@ impl Class { } } + /// Returns true if and only if this character class is empty. That is, + /// it has no elements. + /// + /// An empty character can never match anything, including an empty string. + pub fn is_empty(&self) -> bool { + match *self { + Class::Unicode(ref x) => x.ranges().is_empty(), + Class::Bytes(ref x) => x.ranges().is_empty(), + } + } + /// If this class consists of exactly one element (whether a codepoint or a /// byte), then return it as a literal byte string. 
/// diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 85596b436..6ce7f84de 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -2803,7 +2803,7 @@ mod tests { fn class_bracketed_nested() { assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')]))); assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')]))); - assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[]))); + assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[]))); assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); From 7e412474312275ebc7d45493066bdd391e19f409 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 16 Sep 2022 11:00:55 -0400 Subject: [PATCH 31/79] syntax: tweak Debug impl for Hir The default derive(Debug) impl for Hir is very noisy because it lists out the properties for every Hir value. We change the default to just print out the actual expressions and omit the properties. But one can opt back into seeing the properties via the "alternate" impl. i.e., {:#?} instead of {:?}. --- regex-syntax/src/hir/mod.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 42f59ba6b..7d3ba5082 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -157,7 +157,14 @@ impl core::fmt::Display for ErrorKind { /// proportional to the size of the `Hir`. The regex it prints is guaranteed to /// be _semantically_ equivalent to the original concrete syntax, but it may /// look very different. (And potentially not practically readable by a human.) -#[derive(Clone, Debug, Eq, PartialEq)] +/// +/// An `Hir`'s `fmt::Debug` implementation currently does not use constant +/// stack space. The default implementation will also suppress some details +/// (such as the `Properties` inlined into every `Hir` value to make it less +/// noisy), but using the "[alternate]" format option will show everything. +/// +/// [alternate]: https://doc.rust-lang.org/std/fmt/struct.Formatter.html#method.alternate +#[derive(Clone, Eq, PartialEq)] pub struct Hir { /// The underlying HIR kind. kind: HirKind, @@ -413,6 +420,19 @@ impl HirKind { } } +impl core::fmt::Debug for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if f.alternate() { + f.debug_struct("Hir") + .field("kind", &self.kind) + .field("props", &self.props) + .finish() + } else { + self.kind.fmt(f) + } + } +} + /// Print a display representation of this Hir. /// /// The result of this is a valid regular expression pattern string. From 73518e9447c30a166109cdcce57c3c41ec9f29e1 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 16 Sep 2022 11:02:15 -0400 Subject: [PATCH 32/79] syntax: flatten concatenations This makes the Hir::concat constructor a bit smarter by combining adjacent literals and flattening child concatenations into the parent concatenation. --- regex-syntax/src/hir/mod.rs | 76 ++++++++++++++++++++++++++++--- regex-syntax/src/hir/translate.rs | 27 +++++++++++ 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 7d3ba5082..d35440ec2 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -221,6 +221,17 @@ impl Hir { pub fn properties(&self) -> &Properties { &self.props } + + /// Splits this HIR into its constituent parts. 
+ /// + /// This is useful because `let Hir { kind, props } = hir;` does not work + /// because of `Hir`'s custom `Drop` implementation. + fn into_parts(mut self) -> (HirKind, Properties) { + ( + core::mem::replace(&mut self.kind, HirKind::Empty), + core::mem::replace(&mut self.props, Properties::empty()), + ) + } } /// Smart constructors for HIR values. @@ -324,14 +335,67 @@ impl Hir { /// Returns the concatenation of the given expressions. /// /// This flattens the concatenation as appropriate. - pub fn concat(mut exprs: Vec) -> Hir { - if exprs.is_empty() { + pub fn concat(hirs: Vec) -> Hir { + // We rebuild the concatenation by simplifying it. Would be nice to do + // it in place, but that seems a little tricky? + let mut new = vec![]; + // This gobbles up any adjacent literals in a concatenation and smushes + // them together. Basically, when we see a literal, we add its bytes + // to 'prior_lit', and whenever we see anything else, we first take + // any bytes in 'prior_lit' and add it to the 'new' concatenation. + let mut prior_lit: Option> = None; + for hir in hirs { + let (kind, props) = hir.into_parts(); + match kind { + HirKind::Literal(Literal(bytes)) => { + if let Some(ref mut prior_bytes) = prior_lit { + prior_bytes.extend_from_slice(&bytes); + } else { + prior_lit = Some(bytes.to_vec()); + } + } + // We also flatten concats that are direct children of another + // concat. We only need to do this one level deep since + // Hir::concat is the only way to build concatenations, and so + // flattening happens inductively. + HirKind::Concat(hirs2) => { + for hir2 in hirs2 { + let (kind2, props2) = hir2.into_parts(); + match kind2 { + HirKind::Literal(Literal(bytes)) => { + if let Some(ref mut prior_bytes) = prior_lit { + prior_bytes.extend_from_slice(&bytes); + } else { + prior_lit = Some(bytes.to_vec()); + } + } + kind2 => { + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + new.push(Hir { kind: kind2, props: props2 }); + } + } + } + } + kind => { + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + new.push(Hir { kind, props }); + } + } + } + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + if new.is_empty() { return Hir::empty(); - } else if exprs.len() == 1 { - return exprs.pop().unwrap(); + } else if new.len() == 1 { + return new.pop().unwrap(); } - let props = Properties::concat(&exprs); - Hir { kind: HirKind::Concat(exprs), props } + let props = Properties::concat(&new); + Hir { kind: HirKind::Concat(new), props } } /// Returns the alternation of the given expressions. diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 6ce7f84de..ae1dcc5b6 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3308,4 +3308,31 @@ mod tests { assert!(!props(r"(a)|b").is_alternation_literal()); assert!(!props(r"a|(b)").is_alternation_literal()); } + + // This tests that the smart Hir::concat constructor simplifies the given + // exprs in a way we expect. 
+ #[test] + fn smart_concat() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("abc"), hir_lit("abc")); + assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar")); + assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz")); + assert_eq!( + t("foo(?:bar^baz)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + assert_eq!( + t("foo(?:ba(?:r^b)az)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + } } From d9922ccb4ff2b6207cf29926b1cbb2fa7a8c00f2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 19 Sep 2022 15:02:18 -0400 Subject: [PATCH 33/79] syntax: tweak Hir's debug impl again Just always strip Properties. It's so annoying to see it when you really just want to see the syntax. --- regex-syntax/src/hir/mod.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index d35440ec2..88cabeffe 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -159,11 +159,8 @@ impl core::fmt::Display for ErrorKind { /// look very different. (And potentially not practically readable by a human.) /// /// An `Hir`'s `fmt::Debug` implementation currently does not use constant -/// stack space. The default implementation will also suppress some details -/// (such as the `Properties` inlined into every `Hir` value to make it less -/// noisy), but using the "[alternate]" format option will show everything. -/// -/// [alternate]: https://doc.rust-lang.org/std/fmt/struct.Formatter.html#method.alternate +/// stack space. The implementation will also suppress some details (such as +/// the `Properties` inlined into every `Hir` value to make it less noisy). #[derive(Clone, Eq, PartialEq)] pub struct Hir { /// The underlying HIR kind. @@ -486,14 +483,7 @@ impl HirKind { impl core::fmt::Debug for Hir { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - if f.alternate() { - f.debug_struct("Hir") - .field("kind", &self.kind) - .field("props", &self.props) - .finish() - } else { - self.kind.fmt(f) - } + self.kind.fmt(f) } } From 232256ed891ecaced6cff6a69bfb2d2a527fc89c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 19 Sep 2022 21:34:32 -0400 Subject: [PATCH 34/79] syntax: simplify alternations This commit simplifies alternations by flattening them, similar to how a recent commit flattened concatenations. Although, this is simpler than concatenations, because we can't do anything with literals. Like concatenations, we only need to look one layer deep, since this is applied inductively. --- regex-syntax/src/hir/mod.rs | 33 ++++++++++++++++++++----------- regex-syntax/src/hir/translate.rs | 20 +++++++++++++++++++ 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 88cabeffe..841f304cf 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -398,20 +398,29 @@ impl Hir { /// Returns the alternation of the given expressions. /// /// This flattens the alternation as appropriate. 
- pub fn alternation(mut exprs: Vec) -> Hir { - if exprs.is_empty() { - return Hir::fail(); - } else if exprs.len() == 1 { - return exprs.pop().unwrap(); - } - match exprs.len() { - 0 => Hir::empty(), - 1 => exprs.pop().unwrap(), - _ => { - let props = Properties::alternation(&exprs); - Hir { kind: HirKind::Alternation(exprs), props } + pub fn alternation(hirs: Vec) -> Hir { + // We rebuild the alternation by simplifying it. We proceed similarly + // as the concatenation case. But in this case, there's no literal + // simplification happening. We're just flattening alternations. + let mut new = vec![]; + for hir in hirs { + let (kind, props) = hir.into_parts(); + match kind { + HirKind::Alternation(hirs2) => { + new.extend(hirs2); + } + kind => { + new.push(Hir { kind, props }); + } } } + if new.is_empty() { + return Hir::fail(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + let props = Properties::alternation(&new); + Hir { kind: HirKind::Alternation(new), props } } /// Returns an HIR expression for `.`. diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index ae1dcc5b6..0e41eb1df 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3335,4 +3335,24 @@ mod tests { ]) ); } + + // This tests that the smart Hir::alternation constructor simplifies the + // given exprs in a way we expect. + #[test] + fn smart_alternation() { + assert_eq!( + t("(?:foo)|(?:bar)"), + hir_alt(vec![hir_lit("foo"), hir_lit("bar")]) + ); + assert_eq!( + t("quux|(?:abc|def|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + } } From 05f38baf22e2fbfd657f7843b1747ad12bf6bc27 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 20 Sep 2022 09:35:41 -0400 Subject: [PATCH 35/79] syntax: simplify single char alternations In short, simplify 'a|b|..|z' to '[a-z]'. --- regex-syntax/src/debug.rs | 25 +------------ regex-syntax/src/hir/mod.rs | 60 +++++++++++++++++++++++++++++++ regex-syntax/src/hir/print.rs | 24 +++++++------ regex-syntax/src/hir/translate.rs | 29 +++++++++++---- 4 files changed, 97 insertions(+), 41 deletions(-) diff --git a/regex-syntax/src/debug.rs b/regex-syntax/src/debug.rs index 846e68156..1b0d527d4 100644 --- a/regex-syntax/src/debug.rs +++ b/regex-syntax/src/debug.rs @@ -74,7 +74,7 @@ impl<'a> core::fmt::Debug for Bytes<'a> { /// byte slice, then the first byte is returned instead. /// /// This returns `None` if and only if `bytes` is empty. -fn utf8_decode(bytes: &[u8]) -> Option> { +pub(crate) fn utf8_decode(bytes: &[u8]) -> Option> { if bytes.is_empty() { return None; } @@ -83,26 +83,3 @@ fn utf8_decode(bytes: &[u8]) -> Option> { Err(_) => Some(Err(bytes[0])), } } - -/* -/// Given a UTF-8 leading byte, this returns the total number of code units -/// in the following encoded codepoint. -/// -/// If the given byte is not a valid UTF-8 leading byte, then this returns -/// `None`. 
-fn len(byte: u8) -> Option { - if byte <= 0x7F { - return Some(1); - } else if byte & 0b1100_0000 == 0b1000_0000 { - return None; - } else if byte <= 0b1101_1111 { - Some(2) - } else if byte <= 0b1110_1111 { - Some(3) - } else if byte <= 0b1111_0111 { - Some(4) - } else { - None - } -} -*/ diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 841f304cf..6aba9f9ea 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -419,6 +419,25 @@ impl Hir { } else if new.len() == 1 { return new.pop().unwrap(); } + // Now that it's completely flattened, look for the special case of + // 'char1|char2|...|charN' and collapse that into a class. Note that we + // look for 'char' first and then bytes. The issue here is that if we + // find both non-ASCII codepoints and non-ASCII singleton bytes, then + // it isn't actually possible to smush them into a single class. So we + // look for all chars and then all bytes, and don't handle anything + // else. + if let Some(singletons) = singleton_chars(&new) { + let it = singletons + .into_iter() + .map(|ch| ClassUnicodeRange { start: ch, end: ch }); + return Hir::class(Class::Unicode(ClassUnicode::new(it))); + } + if let Some(singletons) = singleton_bytes(&new) { + let it = singletons + .into_iter() + .map(|b| ClassBytesRange { start: b, end: b }); + return Hir::class(Class::Bytes(ClassBytes::new(it))); + } let props = Properties::alternation(&new); Hir { kind: HirKind::Alternation(new), props } } @@ -1886,6 +1905,47 @@ impl Iterator for LookSetIter { } } +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single `char`, return that sequence of `char`s. Otherwise return +/// None. No deduplication is done. +fn singleton_chars(hirs: &[Hir]) -> Option> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + let ch = match crate::debug::utf8_decode(literal) { + None => return None, + Some(Err(_)) => return None, + Some(Ok(ch)) => ch, + }; + if literal.len() != ch.len_utf8() { + return None; + } + singletons.push(ch); + } + Some(singletons) +} + +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single byte, return that sequence of bytes. Otherwise return +/// None. No deduplication is done. 
+fn singleton_bytes(hirs: &[Hir]) -> Option> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + if literal.len() != 1 { + return None; + } + singletons.push(literal[0]); + } + Some(singletons) +} + #[cfg(test)] mod tests { use super::*; diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 357d4e770..6ba51f0b8 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -434,8 +434,10 @@ mod tests { roundtrip("|", "(?:|)"); roundtrip("||", "(?:||)"); - roundtrip("a|b", "(?:a|b)"); - roundtrip("a|b|c", "(?:a|b|c)"); + roundtrip("a|b", "[ab]"); + roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); + roundtrip("a|b|c", "[a-c]"); + roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))"); roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); } @@ -494,19 +496,19 @@ mod tests { #[test] fn regression_repetition_alternation() { let expr = Hir::concat(alloc::vec![ - Hir::literal("x".as_bytes()), + Hir::literal("ab".as_bytes()), Hir::repetition(hir::Repetition { min: 1, max: None, greedy: true, hir: Box::new(Hir::alternation(alloc::vec![ - Hir::literal("a".as_bytes()), - Hir::literal("b".as_bytes()), + Hir::literal("cd".as_bytes()), + Hir::literal("ef".as_bytes()), ])), }), - Hir::literal("y".as_bytes()), + Hir::literal("gh".as_bytes()), ]); - assert_eq!(r"(?:x(?:a|b)+y)", expr.to_string()); + assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string()); let expr = Hir::concat(alloc::vec![ Hir::look(hir::Look::Start), @@ -538,13 +540,13 @@ mod tests { #[test] fn regression_alternation_concat() { let expr = Hir::concat(alloc::vec![ - Hir::literal("a".as_bytes()), + Hir::literal("ab".as_bytes()), Hir::alternation(alloc::vec![ - Hir::literal("b".as_bytes()), - Hir::literal("c".as_bytes()), + Hir::literal("mn".as_bytes()), + Hir::literal("xy".as_bytes()), ]), ]); - assert_eq!(r"(?:a(?:b|c))", expr.to_string()); + assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string()); let expr = Hir::concat(alloc::vec![ Hir::look(hir::Look::Start), diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 0e41eb1df..8934139a2 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3285,15 +3285,11 @@ mod tests { assert!(props(r"ab").is_alternation_literal()); assert!(props(r"abc").is_alternation_literal()); assert!(props(r"(?m)abc").is_alternation_literal()); - assert!(props(r"a|b").is_alternation_literal()); - assert!(props(r"a|b|c").is_alternation_literal()); assert!(props(r"foo|bar").is_alternation_literal()); assert!(props(r"foo|bar|baz").is_alternation_literal()); assert!(props(r"[a]").is_alternation_literal()); - assert!(props(r"[a]|b").is_alternation_literal()); - assert!(props(r"a|[b]").is_alternation_literal()); - assert!(props(r"(?:a)|b").is_alternation_literal()); - assert!(props(r"a|(?:b)").is_alternation_literal()); + assert!(props(r"(?:ab)|cd").is_alternation_literal()); + assert!(props(r"ab|(?:cd)").is_alternation_literal()); // Negative examples. 
assert!(!props(r"").is_alternation_literal()); @@ -3307,6 +3303,12 @@ mod tests { assert!(!props(r"a|[ab]").is_alternation_literal()); assert!(!props(r"(a)|b").is_alternation_literal()); assert!(!props(r"a|(b)").is_alternation_literal()); + assert!(!props(r"a|b").is_alternation_literal()); + assert!(!props(r"a|b|c").is_alternation_literal()); + assert!(!props(r"[a]|b").is_alternation_literal()); + assert!(!props(r"a|[b]").is_alternation_literal()); + assert!(!props(r"(?:a)|b").is_alternation_literal()); + assert!(!props(r"a|(?:b)").is_alternation_literal()); } // This tests that the smart Hir::concat constructor simplifies the given @@ -3354,5 +3356,20 @@ mod tests { hir_lit("baz"), ]) ); + assert_eq!( + t("quux|(?:abc|(?:def|mno)|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("mno"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("a|b|c|d|e|f|x|y|z"), + hir_uclass(&[('a', 'f'), ('x', 'z')]), + ); } } From 25d103dbf443e7d3406d030d4c288b6edca6a70d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 21 Sep 2022 11:15:17 -0400 Subject: [PATCH 36/79] syntax: fix empty char class bug in HIR printer When a character class is empty, the HIR printer would emit '[]', which is not a valid regex. (Since if a ']' immediately follows an opening '[', then the ']' is interpreted literally and not a closing bracket.) Instead, we write '[a&&b]'. We could also do things like '(?u:\P{any})' or '(?-u:[\x00-\xFF])', but '[a&&b]' doesn't require any flags and also seems really obvious: the intersection of two distinct characters is obviously empty. --- regex-syntax/src/hir/print.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 6ba51f0b8..d976ff668 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -125,6 +125,9 @@ impl Visitor for Writer { } } HirKind::Class(hir::Class::Unicode(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } self.wtr.write_str("[")?; for range in cls.iter() { if range.start() == range.end() { @@ -143,6 +146,9 @@ impl Visitor for Writer { self.wtr.write_str("]")?; } HirKind::Class(hir::Class::Bytes(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } self.wtr.write_str("(?-u:[")?; for range in cls.iter() { if range.start() == range.end() { @@ -359,6 +365,11 @@ mod tests { roundtrip_bytes(r"(?-u)[\[]", r"\["); roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); + + // This tests that an empty character class is correctly roundtripped. + #[cfg(feature = "unicode-gencat")] + roundtrip(r"\P{any}", r"[a&&b]"); + roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]"); } #[test] From d92cb559d9060293aebf3220e6c9acd6c9088ce2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 24 Sep 2022 12:58:12 -0400 Subject: [PATCH 37/79] syntax: add some 'inline' annotations Since these functions are tiny and not polymorphic, we should permit them to be inlined across crate boundaries. --- regex-syntax/src/hir/mod.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6aba9f9ea..9ae96a2ca 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -246,6 +246,7 @@ impl Hir { /// Returns an empty HIR expression. /// /// An empty HIR expression always matches, including the empty string. 
+ #[inline] pub fn empty() -> Hir { let props = Properties::empty(); Hir { kind: HirKind::Empty, props } @@ -265,6 +266,7 @@ impl Hir { /// because empty character classes can be spelled in the concrete syntax /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but /// empty alternations cannot. + #[inline] pub fn fail() -> Hir { let class = Class::Bytes(ClassBytes::empty()); let props = Properties::class(&class); @@ -279,6 +281,7 @@ impl Hir { /// If the given literal has a `Byte` variant with an ASCII byte, then this /// method panics. This enforces the invariant that `Byte` variants are /// only used to express matching of invalid UTF-8. + #[inline] pub fn literal>>(lit: B) -> Hir { let bytes = lit.into(); if bytes.is_empty() { @@ -291,6 +294,7 @@ impl Hir { } /// Creates a class HIR expression. + #[inline] pub fn class(class: Class) -> Hir { if class.is_empty() { return Hir::fail(); @@ -302,12 +306,14 @@ impl Hir { } /// Creates a look-around assertion HIR expression. + #[inline] pub fn look(look: Look) -> Hir { let props = Properties::look(look); Hir { kind: HirKind::Look(look), props } } /// Creates a repetition HIR expression. + #[inline] pub fn repetition(rep: Repetition) -> Hir { // The regex 'a{0}' is always equivalent to the empty regex. This is // true even when 'a' is an expression that never matches anything @@ -324,6 +330,7 @@ impl Hir { } /// Creates a group HIR expression. + #[inline] pub fn group(group: Group) -> Hir { let props = Properties::group(&group); Hir { kind: HirKind::Group(group), props } @@ -375,6 +382,8 @@ impl Hir { } } } + // We can just skip empty HIRs. + HirKind::Empty => {} kind => { if let Some(prior_bytes) = prior_lit.take() { new.push(Hir::literal(prior_bytes)); @@ -452,6 +461,7 @@ impl Hir { /// Note that this is a convenience routine for constructing the correct /// character class based on the value of `Dot`. There is no explicit "dot" /// HIR value. It is just an abbreviation for a common character class. + #[inline] pub fn dot(dot: Dot) -> Hir { match dot { Dot::AnyChar => { From 561ed40c8c33f1062e08d1cbbfcbfd9810ae98e4 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 3 Oct 2022 12:04:19 -0400 Subject: [PATCH 38/79] syntax: fix utf-8 decoder We need to know the length of the next codepoint we want to debug, otherwise it's possible for a naive 'slice[..4]' to fail if the end of the slice happens to split a codepoint. --- regex-syntax/src/debug.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/debug.rs b/regex-syntax/src/debug.rs index 1b0d527d4..a0b051b44 100644 --- a/regex-syntax/src/debug.rs +++ b/regex-syntax/src/debug.rs @@ -75,10 +75,32 @@ impl<'a> core::fmt::Debug for Bytes<'a> { /// /// This returns `None` if and only if `bytes` is empty. 
pub(crate) fn utf8_decode(bytes: &[u8]) -> Option> { + fn len(byte: u8) -> Option { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } + } + if bytes.is_empty() { return None; } - match core::str::from_utf8(&bytes[..core::cmp::min(4, bytes.len())]) { + let len = match len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(char::from(bytes[0]))), + Some(len) => len, + }; + match core::str::from_utf8(&bytes[..len]) { Ok(s) => Some(Ok(s.chars().next().unwrap())), Err(_) => Some(Err(bytes[0])), } From 781d2644df99746307c6fc91d2e9ddf22e9529c7 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 9 Oct 2022 14:25:46 -0400 Subject: [PATCH 39/79] syntax: add new LookSet::contains_word convenience routine And also add some inline annotations on non-generic but tiny functions. --- regex-syntax/src/hir/mod.rs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 9ae96a2ca..f2a95204a 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1813,6 +1813,7 @@ pub struct LookSet { impl LookSet { /// Create an empty set of look-around assertions. + #[inline] pub fn empty() -> LookSet { LookSet { bits: 0 } } @@ -1820,6 +1821,7 @@ impl LookSet { /// Create a full set of look-around assertions. /// /// This set contains all possible look-around assertions. + #[inline] pub fn full() -> LookSet { LookSet { bits: !0 } } @@ -1828,6 +1830,7 @@ impl LookSet { /// /// This is a convenience routine for creating an empty set and inserting /// one look-around assertions. + #[inline] pub fn singleton(look: Look) -> LookSet { let mut set = LookSet::empty(); set.insert(look); @@ -1835,6 +1838,7 @@ impl LookSet { } /// Returns the total number of look-around assertions in this set. + #[inline] pub fn len(&self) -> usize { // OK because max value always fits in a u8, which in turn always // fits in a usize, regardless of target. @@ -1842,34 +1846,51 @@ impl LookSet { } /// Returns true if and only if this set is empty. + #[inline] pub fn is_empty(&self) -> bool { self.len() == 0 } /// Insert the given look-around assertions into this set. If the assertion /// is already in the set, then this is a no-op. + #[inline] pub fn insert(&mut self, look: Look) { self.bits |= 1 << look.as_repr(); } /// Remove the given look-around assertion from this set. If it wasn't /// previously in the set, then this is a no-op. + #[inline] pub fn remove(&mut self, look: Look) { self.bits &= !(1 << look.as_repr()); } /// Returns true if and only if the given look-around assertion is in this /// set. + #[inline] pub fn contains(&self, look: Look) -> bool { self.bits & (1 << look.as_repr()) != 0 } + /// Returns true if and only if this set contains any word boundary or + /// negated word boundary assertions. This include both Unicode and ASCII + /// word boundaries. + #[inline] + pub fn contains_word(&self) -> bool { + self.contains(Look::WordAscii) + || self.contains(Look::WordAsciiNegate) + || self.contains(Look::WordUnicode) + || self.contains(Look::WordUnicodeNegate) + } + /// Modifies this set to be the union of itself and the set given. 
+ #[inline] pub fn union(&mut self, other: LookSet) { self.bits |= other.bits; } /// Modifies this set to be the intersection of itself and the set given. + #[inline] pub fn intersect(&mut self, other: LookSet) { self.bits &= other.bits; } From c15240b0c81b071b8b2f1f4ed7fa495ab9c24f46 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 7 Oct 2022 18:14:06 -0400 Subject: [PATCH 40/79] syntax: rewrite literal extraction After years of saying "literal extraction needs to be rewritten," I've finally gathered up the courage to do it. While this commit doesn't show it, this is actually now the third time I rewrote it. I rewrote it a second time about a week prior to this and got close to the finish line when I realized I had to throw it away. In that approach, I tried to abandon the "mark each individual literal as exact" idea in the original literal extraction code and instead treat the entire set of literals as "exact" or not. (I also changed the terminology from "complete" to "exact," which I think is maybe a bit better. I also got rid of "cut" and instead use "inexact.") The main problem with not marking each individual literal as exact or not is that it potentially inhibits longer literal extraction. For example, in the regex 'ab*cd', with individual literals marked as exact, we can extract the sequence [inexact(ab), exact(acd)]. But with the entire set being all exact or all inexact, there's no real way to let extraction continue through the empty string produced by the '*' repetition operator. There were some other problems with my second rewrite around short-circuiting concats/alternations when sequences got too big, but I think I could have resolved them. In the end, the third rewrite is quite good. It actually roughly corresponds to the original code, but is cleaned up and much more principled. The original code didn't do these things for example: 1. Didn't care about order and thus didn't correctly produce literals in a sequence for which leftmost-first match semantics were preserved. 2. Didn't differentiate between "empty set" and "infinite set." These are two pretty subtle cases and them not being distinct in the code was really quite messy. 3. The old code tried to carry a literal set throughout extraction and this has the effect of forcing every part of extraction to care about concatenation. But now we just force a stronger separation of responsibility. We might wind up with a few more allocs, but the in-practice small set size limits and short circuiting means that it usually doesn't matter relative to the other costs of parsing, translating and compiling regexes. I ported over pretty much all of the older tests and added more of my own. Overall, I feel much more confident about this new literal extraction than I do the old. We do also insert some heuristics for trimming literal sets in src/exec.rs that didn't exist before. This is because the new extraction code tends to the respect the limits a bit more faithfully and sometimes returned bigger sets than the old code. This is bad because more literals means prefilters are probably less effective. So we write a little bit of code to mitigate this. We also do let a few cases get slower for the time being. The suffix handling is not quite ideal, so many of the easy/medium/hard benchmarks are now a little slower. The name_alt3_nocase benchmark is also slower because the new extraction code notices that the literals blow the limits and only returns an infinite sequence. 
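As a rough sketch of the exact/inexact marking on the 'ab*cd' example above
(this uses the extractor API as it eventually shipped in regex-syntax's
hir::literal module; the names Parser, Extractor and Seq are assumed from the
released crate rather than quoted from this patch):

    use regex_syntax::{hir::literal::Extractor, Parser};

    fn main() {
        // Parse the pattern into an Hir first; extraction works on the Hir.
        let mut parser = Parser::new();
        let hir = parser.parse(r"ab*cd").unwrap();
        // The default extractor gathers prefix literals.
        let seq = Extractor::new().extract(&hir);
        // Expected to be roughly the sequence [inexact(ab), exact(acd)]:
        // "acd" covers the zero-repetition case exactly, while "ab" is only
        // an inexact prefix whenever 'b' occurs one or more times.
        println!("{:?}", seq);
    }
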
The old extraction code had (some in practice and unprincipled) techniques for shrinking its set as it went, and this caused literals to get extracted for it. We can fix this, but it will take a little more effort that I don't want to spend right now. In any case, the hope is to smooth out any issues as we head towards bringing regex-automata in. --- .../10-last-frontier/rust-after-literal.log | 124 + .../10-last-frontier/rust-before-literal.log | 124 + .../rust-bytes-after-literal.log | 112 + .../rust-bytes-before-literal.log | 112 + regex-syntax/src/hir/literal.rs | 2767 +++++++++++++++++ regex-syntax/src/hir/literal/mod.rs | 1672 ---------- regex-syntax/src/lib.rs | 5 +- src/exec.rs | 111 +- src/literal/imp.rs | 65 +- src/literal/mod.rs | 6 +- 10 files changed, 3374 insertions(+), 1724 deletions(-) create mode 100644 bench/log/10-last-frontier/rust-after-literal.log create mode 100644 bench/log/10-last-frontier/rust-before-literal.log create mode 100644 bench/log/10-last-frontier/rust-bytes-after-literal.log create mode 100644 bench/log/10-last-frontier/rust-bytes-before-literal.log create mode 100644 regex-syntax/src/hir/literal.rs delete mode 100644 regex-syntax/src/hir/literal/mod.rs diff --git a/bench/log/10-last-frontier/rust-after-literal.log b/bench/log/10-last-frontier/rust-after-literal.log new file mode 100644 index 000000000..c45b55cac --- /dev/null +++ b/bench/log/10-last-frontier/rust-after-literal.log @@ -0,0 +1,124 @@ + +running 119 tests +test misc::anchored_literal_long_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_long_non_match ... bench: 20 ns/iter (+/- 0) = 19500 MB/s +test misc::anchored_literal_short_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::anchored_literal_short_non_match ... bench: 20 ns/iter (+/- 0) = 1300 MB/s +test misc::easy0_1K ... bench: 51 ns/iter (+/- 2) = 20607 MB/s +test misc::easy0_1MB ... bench: 56 ns/iter (+/- 1) = 18725053 MB/s +test misc::easy0_32 ... bench: 51 ns/iter (+/- 0) = 1156 MB/s +test misc::easy0_32K ... bench: 53 ns/iter (+/- 1) = 618773 MB/s +test misc::easy1_1K ... bench: 41 ns/iter (+/- 0) = 25463 MB/s +test misc::easy1_1MB ... bench: 44 ns/iter (+/- 1) = 23831727 MB/s +test misc::easy1_32 ... bench: 40 ns/iter (+/- 1) = 1300 MB/s +test misc::easy1_32K ... bench: 40 ns/iter (+/- 1) = 819700 MB/s +test misc::hard_1K ... bench: 51 ns/iter (+/- 2) = 20607 MB/s +test misc::hard_1MB ... bench: 56 ns/iter (+/- 1) = 18725053 MB/s +test misc::hard_32 ... bench: 51 ns/iter (+/- 2) = 1156 MB/s +test misc::hard_32K ... bench: 51 ns/iter (+/- 1) = 643039 MB/s +test misc::is_match_set ... bench: 61 ns/iter (+/- 2) = 409 MB/s +test misc::literal ... bench: 13 ns/iter (+/- 0) = 3923 MB/s +test misc::long_needle1 ... bench: 3,242 ns/iter (+/- 79) = 30845 MB/s +test misc::long_needle2 ... bench: 350,572 ns/iter (+/- 6,860) = 285 MB/s +test misc::match_class ... bench: 62 ns/iter (+/- 6) = 1306 MB/s +test misc::match_class_in_range ... bench: 14 ns/iter (+/- 0) = 5785 MB/s +test misc::match_class_unicode ... bench: 259 ns/iter (+/- 15) = 621 MB/s +test misc::matches_set ... bench: 462 ns/iter (+/- 9) = 54 MB/s +test misc::medium_1K ... bench: 53 ns/iter (+/- 0) = 19849 MB/s +test misc::medium_1MB ... bench: 58 ns/iter (+/- 1) = 18079379 MB/s +test misc::medium_32 ... bench: 53 ns/iter (+/- 1) = 1132 MB/s +test misc::medium_32K ... bench: 53 ns/iter (+/- 1) = 618792 MB/s +test misc::no_exponential ... bench: 423 ns/iter (+/- 13) = 236 MB/s +test misc::not_literal ... 
bench: 89 ns/iter (+/- 0) = 573 MB/s +test misc::one_pass_long_prefix ... bench: 52 ns/iter (+/- 0) = 500 MB/s +test misc::one_pass_long_prefix_not ... bench: 52 ns/iter (+/- 1) = 500 MB/s +test misc::one_pass_short ... bench: 38 ns/iter (+/- 1) = 447 MB/s +test misc::one_pass_short_not ... bench: 41 ns/iter (+/- 1) = 414 MB/s +test misc::reallyhard2_1K ... bench: 81 ns/iter (+/- 1) = 12839 MB/s +test misc::reallyhard_1K ... bench: 1,592 ns/iter (+/- 1) = 660 MB/s +test misc::reallyhard_1MB ... bench: 1,575,822 ns/iter (+/- 39,203) = 665 MB/s +test misc::reallyhard_32 ... bench: 102 ns/iter (+/- 0) = 578 MB/s +test misc::reallyhard_32K ... bench: 49,328 ns/iter (+/- 2,598) = 664 MB/s +test misc::replace_all ... bench: 132 ns/iter (+/- 3) +test misc::reverse_suffix_no_quadratic ... bench: 4,171 ns/iter (+/- 134) = 1918 MB/s +test misc::short_haystack_1000000x ... bench: 132,251 ns/iter (+/- 729) = 60491 MB/s +test misc::short_haystack_100000x ... bench: 13,184 ns/iter (+/- 408) = 60680 MB/s +test misc::short_haystack_10000x ... bench: 6,036 ns/iter (+/- 167) = 13255 MB/s +test misc::short_haystack_1000x ... bench: 602 ns/iter (+/- 14) = 13307 MB/s +test misc::short_haystack_100x ... bench: 230 ns/iter (+/- 7) = 3526 MB/s +test misc::short_haystack_10x ... bench: 218 ns/iter (+/- 3) = 417 MB/s +test misc::short_haystack_1x ... bench: 210 ns/iter (+/- 8) = 90 MB/s +test misc::short_haystack_2x ... bench: 225 ns/iter (+/- 6) = 120 MB/s +test misc::short_haystack_3x ... bench: 211 ns/iter (+/- 8) = 165 MB/s +test misc::short_haystack_4x ... bench: 212 ns/iter (+/- 6) = 202 MB/s +test regexdna::find_new_lines ... bench: 12,245,066 ns/iter (+/- 117,141) = 415 MB/s +test regexdna::subst1 ... bench: 786,357 ns/iter (+/- 14,200) = 6464 MB/s +test regexdna::subst10 ... bench: 788,550 ns/iter (+/- 26,456) = 6446 MB/s +test regexdna::subst11 ... bench: 782,161 ns/iter (+/- 15,583) = 6499 MB/s +test regexdna::subst2 ... bench: 784,902 ns/iter (+/- 23,379) = 6476 MB/s +test regexdna::subst3 ... bench: 786,640 ns/iter (+/- 27,063) = 6462 MB/s +test regexdna::subst4 ... bench: 785,591 ns/iter (+/- 20,498) = 6470 MB/s +test regexdna::subst5 ... bench: 787,447 ns/iter (+/- 20,892) = 6455 MB/s +test regexdna::subst6 ... bench: 784,994 ns/iter (+/- 19,687) = 6475 MB/s +test regexdna::subst7 ... bench: 801,921 ns/iter (+/- 15,391) = 6339 MB/s +test regexdna::subst8 ... bench: 785,541 ns/iter (+/- 11,908) = 6471 MB/s +test regexdna::subst9 ... bench: 785,848 ns/iter (+/- 28,020) = 6468 MB/s +test regexdna::variant1 ... bench: 2,195,058 ns/iter (+/- 44,066) = 2315 MB/s +test regexdna::variant2 ... bench: 3,219,968 ns/iter (+/- 59,372) = 1578 MB/s +test regexdna::variant3 ... bench: 3,776,467 ns/iter (+/- 54,326) = 1346 MB/s +test regexdna::variant4 ... bench: 3,803,674 ns/iter (+/- 95,281) = 1336 MB/s +test regexdna::variant5 ... bench: 2,661,333 ns/iter (+/- 46,408) = 1910 MB/s +test regexdna::variant6 ... bench: 2,645,716 ns/iter (+/- 38,659) = 1921 MB/s +test regexdna::variant7 ... bench: 3,228,352 ns/iter (+/- 69,155) = 1574 MB/s +test regexdna::variant8 ... bench: 3,305,563 ns/iter (+/- 59,321) = 1537 MB/s +test regexdna::variant9 ... bench: 3,225,039 ns/iter (+/- 49,720) = 1576 MB/s +test rust_compile::compile_huge ... bench: 100,381 ns/iter (+/- 2,052) +test rust_compile::compile_huge_bytes ... bench: 5,899,989 ns/iter (+/- 114,363) +test rust_compile::compile_huge_full ... bench: 11,650,995 ns/iter (+/- 172,285) +test rust_compile::compile_simple ... 
bench: 4,082 ns/iter (+/- 88) +test rust_compile::compile_simple_bytes ... bench: 4,153 ns/iter (+/- 120) +test rust_compile::compile_simple_full ... bench: 20,414 ns/iter (+/- 1,860) +test rust_compile::compile_small ... bench: 9,114 ns/iter (+/- 216) +test rust_compile::compile_small_bytes ... bench: 183,049 ns/iter (+/- 9,917) +test rust_compile::compile_small_full ... bench: 361,291 ns/iter (+/- 11,045) +test sherlock::before_after_holmes ... bench: 907,103 ns/iter (+/- 12,165) = 655 MB/s +test sherlock::before_holmes ... bench: 62,501 ns/iter (+/- 1,880) = 9518 MB/s +test sherlock::everything_greedy ... bench: 2,062,116 ns/iter (+/- 41,900) = 288 MB/s +test sherlock::everything_greedy_nl ... bench: 894,529 ns/iter (+/- 38,723) = 665 MB/s +test sherlock::holmes_cochar_watson ... bench: 103,305 ns/iter (+/- 3,798) = 5758 MB/s +test sherlock::holmes_coword_watson ... bench: 479,423 ns/iter (+/- 13,924) = 1240 MB/s +test sherlock::ing_suffix ... bench: 318,300 ns/iter (+/- 6,846) = 1869 MB/s +test sherlock::ing_suffix_limited_space ... bench: 1,066,300 ns/iter (+/- 19,375) = 557 MB/s +test sherlock::letters ... bench: 21,777,358 ns/iter (+/- 230,478) = 27 MB/s +test sherlock::letters_lower ... bench: 21,152,019 ns/iter (+/- 203,617) = 28 MB/s +test sherlock::letters_upper ... bench: 1,777,626 ns/iter (+/- 26,243) = 334 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 897,509 ns/iter (+/- 24,983) = 662 MB/s +test sherlock::name_alt1 ... bench: 32,255 ns/iter (+/- 681) = 18444 MB/s +test sherlock::name_alt2 ... bench: 86,369 ns/iter (+/- 2,494) = 6888 MB/s +test sherlock::name_alt3 ... bench: 97,618 ns/iter (+/- 564) = 6094 MB/s +test sherlock::name_alt3_nocase ... bench: 944,848 ns/iter (+/- 31,039) = 629 MB/s +test sherlock::name_alt4 ... bench: 122,029 ns/iter (+/- 2,716) = 4875 MB/s +test sherlock::name_alt4_nocase ... bench: 225,544 ns/iter (+/- 5,783) = 2637 MB/s +test sherlock::name_alt5 ... bench: 91,897 ns/iter (+/- 3,796) = 6473 MB/s +test sherlock::name_alt5_nocase ... bench: 936,420 ns/iter (+/- 15,092) = 635 MB/s +test sherlock::name_holmes ... bench: 33,448 ns/iter (+/- 959) = 17786 MB/s +test sherlock::name_holmes_nocase ... bench: 115,864 ns/iter (+/- 1,645) = 5134 MB/s +test sherlock::name_sherlock ... bench: 22,474 ns/iter (+/- 674) = 26472 MB/s +test sherlock::name_sherlock_holmes ... bench: 22,184 ns/iter (+/- 497) = 26818 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 99,629 ns/iter (+/- 2,398) = 5971 MB/s +test sherlock::name_sherlock_nocase ... bench: 99,523 ns/iter (+/- 2,674) = 5977 MB/s +test sherlock::name_whitespace ... bench: 30,815 ns/iter (+/- 107) = 19306 MB/s +test sherlock::no_match_common ... bench: 19,661 ns/iter (+/- 656) = 30259 MB/s +test sherlock::no_match_really_common ... bench: 27,544 ns/iter (+/- 527) = 21599 MB/s +test sherlock::no_match_uncommon ... bench: 19,553 ns/iter (+/- 31) = 30426 MB/s +test sherlock::quotes ... bench: 369,144 ns/iter (+/- 45,316) = 1611 MB/s +test sherlock::repeated_class_negation ... bench: 68,838,857 ns/iter (+/- 330,544) = 8 MB/s +test sherlock::the_lower ... bench: 321,692 ns/iter (+/- 5,418) = 1849 MB/s +test sherlock::the_nocase ... bench: 507,936 ns/iter (+/- 3,080) = 1171 MB/s +test sherlock::the_upper ... bench: 43,705 ns/iter (+/- 788) = 13612 MB/s +test sherlock::the_whitespace ... bench: 819,179 ns/iter (+/- 20,071) = 726 MB/s +test sherlock::word_ending_n ... bench: 1,700,300 ns/iter (+/- 36,623) = 349 MB/s +test sherlock::words ... 
bench: 8,249,767 ns/iter (+/- 75,015) = 72 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 119 measured; 0 filtered out; finished in 111.55s + diff --git a/bench/log/10-last-frontier/rust-before-literal.log b/bench/log/10-last-frontier/rust-before-literal.log new file mode 100644 index 000000000..98b3496ae --- /dev/null +++ b/bench/log/10-last-frontier/rust-before-literal.log @@ -0,0 +1,124 @@ + +running 119 tests +test misc::anchored_literal_long_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_long_non_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_short_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::anchored_literal_short_non_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::easy0_1K ... bench: 15 ns/iter (+/- 0) = 70066 MB/s +test misc::easy0_1MB ... bench: 21 ns/iter (+/- 0) = 49933476 MB/s +test misc::easy0_32 ... bench: 15 ns/iter (+/- 0) = 3933 MB/s +test misc::easy0_32K ... bench: 14 ns/iter (+/- 0) = 2342500 MB/s +test misc::easy1_1K ... bench: 40 ns/iter (+/- 1) = 26100 MB/s +test misc::easy1_1MB ... bench: 45 ns/iter (+/- 1) = 23302133 MB/s +test misc::easy1_32 ... bench: 40 ns/iter (+/- 5) = 1300 MB/s +test misc::easy1_32K ... bench: 40 ns/iter (+/- 1) = 819700 MB/s +test misc::hard_1K ... bench: 51 ns/iter (+/- 1) = 20607 MB/s +test misc::hard_1MB ... bench: 56 ns/iter (+/- 0) = 18725053 MB/s +test misc::hard_32 ... bench: 51 ns/iter (+/- 3) = 1156 MB/s +test misc::hard_32K ... bench: 51 ns/iter (+/- 1) = 643039 MB/s +test misc::is_match_set ... bench: 61 ns/iter (+/- 2) = 409 MB/s +test misc::literal ... bench: 13 ns/iter (+/- 0) = 3923 MB/s +test misc::long_needle1 ... bench: 3,259 ns/iter (+/- 86) = 30684 MB/s +test misc::long_needle2 ... bench: 350,722 ns/iter (+/- 6,984) = 285 MB/s +test misc::match_class ... bench: 60 ns/iter (+/- 1) = 1350 MB/s +test misc::match_class_in_range ... bench: 14 ns/iter (+/- 0) = 5785 MB/s +test misc::match_class_unicode ... bench: 255 ns/iter (+/- 0) = 631 MB/s +test misc::matches_set ... bench: 481 ns/iter (+/- 11) = 51 MB/s +test misc::medium_1K ... bench: 15 ns/iter (+/- 0) = 70133 MB/s +test misc::medium_1MB ... bench: 22 ns/iter (+/- 0) = 47663818 MB/s +test misc::medium_32 ... bench: 15 ns/iter (+/- 0) = 4000 MB/s +test misc::medium_32K ... bench: 15 ns/iter (+/- 0) = 2186400 MB/s +test misc::no_exponential ... bench: 442 ns/iter (+/- 13) = 226 MB/s +test misc::not_literal ... bench: 89 ns/iter (+/- 1) = 573 MB/s +test misc::one_pass_long_prefix ... bench: 54 ns/iter (+/- 1) = 481 MB/s +test misc::one_pass_long_prefix_not ... bench: 52 ns/iter (+/- 1) = 500 MB/s +test misc::one_pass_short ... bench: 39 ns/iter (+/- 0) = 435 MB/s +test misc::one_pass_short_not ... bench: 42 ns/iter (+/- 0) = 404 MB/s +test misc::reallyhard2_1K ... bench: 83 ns/iter (+/- 6) = 12530 MB/s +test misc::reallyhard_1K ... bench: 1,592 ns/iter (+/- 4) = 660 MB/s +test misc::reallyhard_1MB ... bench: 1,575,691 ns/iter (+/- 29,668) = 665 MB/s +test misc::reallyhard_32 ... bench: 101 ns/iter (+/- 5) = 584 MB/s +test misc::reallyhard_32K ... bench: 49,325 ns/iter (+/- 1,734) = 664 MB/s +test misc::replace_all ... bench: 134 ns/iter (+/- 2) +test misc::reverse_suffix_no_quadratic ... bench: 4,189 ns/iter (+/- 274) = 1909 MB/s +test misc::short_haystack_1000000x ... bench: 132,182 ns/iter (+/- 4,966) = 60522 MB/s +test misc::short_haystack_100000x ... bench: 13,344 ns/iter (+/- 275) = 59952 MB/s +test misc::short_haystack_10000x ... 
bench: 6,119 ns/iter (+/- 285) = 13075 MB/s +test misc::short_haystack_1000x ... bench: 617 ns/iter (+/- 15) = 12983 MB/s +test misc::short_haystack_100x ... bench: 230 ns/iter (+/- 7) = 3526 MB/s +test misc::short_haystack_10x ... bench: 207 ns/iter (+/- 8) = 439 MB/s +test misc::short_haystack_1x ... bench: 213 ns/iter (+/- 7) = 89 MB/s +test misc::short_haystack_2x ... bench: 206 ns/iter (+/- 6) = 131 MB/s +test misc::short_haystack_3x ... bench: 207 ns/iter (+/- 10) = 169 MB/s +test misc::short_haystack_4x ... bench: 208 ns/iter (+/- 7) = 206 MB/s +test regexdna::find_new_lines ... bench: 12,275,804 ns/iter (+/- 145,331) = 414 MB/s +test regexdna::subst1 ... bench: 793,517 ns/iter (+/- 44,203) = 6406 MB/s +test regexdna::subst10 ... bench: 794,922 ns/iter (+/- 23,459) = 6394 MB/s +test regexdna::subst11 ... bench: 790,525 ns/iter (+/- 23,010) = 6430 MB/s +test regexdna::subst2 ... bench: 790,637 ns/iter (+/- 17,962) = 6429 MB/s +test regexdna::subst3 ... bench: 793,559 ns/iter (+/- 17,575) = 6405 MB/s +test regexdna::subst4 ... bench: 792,738 ns/iter (+/- 15,237) = 6412 MB/s +test regexdna::subst5 ... bench: 795,060 ns/iter (+/- 26,172) = 6393 MB/s +test regexdna::subst6 ... bench: 792,357 ns/iter (+/- 15,067) = 6415 MB/s +test regexdna::subst7 ... bench: 797,006 ns/iter (+/- 27,928) = 6378 MB/s +test regexdna::subst8 ... bench: 790,603 ns/iter (+/- 22,754) = 6429 MB/s +test regexdna::subst9 ... bench: 793,055 ns/iter (+/- 13,202) = 6409 MB/s +test regexdna::variant1 ... bench: 2,204,304 ns/iter (+/- 50,669) = 2306 MB/s +test regexdna::variant2 ... bench: 3,224,798 ns/iter (+/- 45,705) = 1576 MB/s +test regexdna::variant3 ... bench: 3,802,774 ns/iter (+/- 86,530) = 1336 MB/s +test regexdna::variant4 ... bench: 3,805,916 ns/iter (+/- 69,737) = 1335 MB/s +test regexdna::variant5 ... bench: 2,662,373 ns/iter (+/- 61,259) = 1909 MB/s +test regexdna::variant6 ... bench: 2,654,072 ns/iter (+/- 51,095) = 1915 MB/s +test regexdna::variant7 ... bench: 3,232,369 ns/iter (+/- 67,147) = 1572 MB/s +test regexdna::variant8 ... bench: 3,311,225 ns/iter (+/- 66,086) = 1535 MB/s +test regexdna::variant9 ... bench: 3,241,601 ns/iter (+/- 68,394) = 1568 MB/s +test rust_compile::compile_huge ... bench: 100,955 ns/iter (+/- 2,466) +test rust_compile::compile_huge_bytes ... bench: 5,936,732 ns/iter (+/- 126,993) +test rust_compile::compile_huge_full ... bench: 11,880,838 ns/iter (+/- 211,387) +test rust_compile::compile_simple ... bench: 4,575 ns/iter (+/- 139) +test rust_compile::compile_simple_bytes ... bench: 4,653 ns/iter (+/- 122) +test rust_compile::compile_simple_full ... bench: 20,656 ns/iter (+/- 535) +test rust_compile::compile_small ... bench: 9,613 ns/iter (+/- 992) +test rust_compile::compile_small_bytes ... bench: 188,349 ns/iter (+/- 4,733) +test rust_compile::compile_small_full ... bench: 341,554 ns/iter (+/- 9,774) +test sherlock::before_after_holmes ... bench: 907,419 ns/iter (+/- 11,645) = 655 MB/s +test sherlock::before_holmes ... bench: 62,036 ns/iter (+/- 1,854) = 9590 MB/s +test sherlock::everything_greedy ... bench: 2,072,694 ns/iter (+/- 45,192) = 287 MB/s +test sherlock::everything_greedy_nl ... bench: 884,483 ns/iter (+/- 25,710) = 672 MB/s +test sherlock::holmes_cochar_watson ... bench: 103,873 ns/iter (+/- 1,310) = 5727 MB/s +test sherlock::holmes_coword_watson ... bench: 481,491 ns/iter (+/- 11,516) = 1235 MB/s +test sherlock::ing_suffix ... bench: 323,119 ns/iter (+/- 7,438) = 1841 MB/s +test sherlock::ing_suffix_limited_space ... 
bench: 1,067,293 ns/iter (+/- 18,661) = 557 MB/s +test sherlock::letters ... bench: 21,732,526 ns/iter (+/- 253,563) = 27 MB/s +test sherlock::letters_lower ... bench: 21,187,465 ns/iter (+/- 191,023) = 28 MB/s +test sherlock::letters_upper ... bench: 1,766,003 ns/iter (+/- 17,494) = 336 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 897,387 ns/iter (+/- 26,674) = 662 MB/s +test sherlock::name_alt1 ... bench: 34,183 ns/iter (+/- 885) = 17404 MB/s +test sherlock::name_alt2 ... bench: 87,151 ns/iter (+/- 2,139) = 6826 MB/s +test sherlock::name_alt3 ... bench: 99,293 ns/iter (+/- 1,938) = 5991 MB/s +test sherlock::name_alt3_nocase ... bench: 379,228 ns/iter (+/- 22,539) = 1568 MB/s +test sherlock::name_alt4 ... bench: 123,040 ns/iter (+/- 2,676) = 4835 MB/s +test sherlock::name_alt4_nocase ... bench: 186,045 ns/iter (+/- 403) = 3197 MB/s +test sherlock::name_alt5 ... bench: 91,679 ns/iter (+/- 2,543) = 6489 MB/s +test sherlock::name_alt5_nocase ... bench: 343,668 ns/iter (+/- 6,807) = 1731 MB/s +test sherlock::name_holmes ... bench: 33,802 ns/iter (+/- 936) = 17600 MB/s +test sherlock::name_holmes_nocase ... bench: 136,208 ns/iter (+/- 4,317) = 4367 MB/s +test sherlock::name_sherlock ... bench: 22,534 ns/iter (+/- 462) = 26401 MB/s +test sherlock::name_sherlock_holmes ... bench: 22,514 ns/iter (+/- 697) = 26425 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 97,796 ns/iter (+/- 2,037) = 6083 MB/s +test sherlock::name_sherlock_nocase ... bench: 95,809 ns/iter (+/- 1,538) = 6209 MB/s +test sherlock::name_whitespace ... bench: 30,959 ns/iter (+/- 968) = 19216 MB/s +test sherlock::no_match_common ... bench: 19,568 ns/iter (+/- 616) = 30403 MB/s +test sherlock::no_match_really_common ... bench: 26,273 ns/iter (+/- 1,143) = 22644 MB/s +test sherlock::no_match_uncommon ... bench: 19,643 ns/iter (+/- 496) = 30287 MB/s +test sherlock::quotes ... bench: 371,876 ns/iter (+/- 2,494) = 1599 MB/s +test sherlock::repeated_class_negation ... bench: 76,963,104 ns/iter (+/- 277,311) = 7 MB/s +test sherlock::the_lower ... bench: 331,250 ns/iter (+/- 8,588) = 1796 MB/s +test sherlock::the_nocase ... bench: 516,528 ns/iter (+/- 40,826) = 1151 MB/s +test sherlock::the_upper ... bench: 44,206 ns/iter (+/- 1,277) = 13458 MB/s +test sherlock::the_whitespace ... bench: 822,577 ns/iter (+/- 23,649) = 723 MB/s +test sherlock::word_ending_n ... bench: 1,685,110 ns/iter (+/- 34,615) = 353 MB/s +test sherlock::words ... bench: 8,333,499 ns/iter (+/- 152,757) = 71 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 119 measured; 0 filtered out; finished in 124.94s + diff --git a/bench/log/10-last-frontier/rust-bytes-after-literal.log b/bench/log/10-last-frontier/rust-bytes-after-literal.log new file mode 100644 index 000000000..470e09b9c --- /dev/null +++ b/bench/log/10-last-frontier/rust-bytes-after-literal.log @@ -0,0 +1,112 @@ + +running 107 tests +test misc::anchored_literal_long_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_long_non_match ... bench: 20 ns/iter (+/- 1) = 19500 MB/s +test misc::anchored_literal_short_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::anchored_literal_short_non_match ... bench: 20 ns/iter (+/- 0) = 1300 MB/s +test misc::easy0_1K ... bench: 54 ns/iter (+/- 2) = 19462 MB/s +test misc::easy0_1MB ... bench: 56 ns/iter (+/- 1) = 18725053 MB/s +test misc::easy0_32 ... bench: 51 ns/iter (+/- 1) = 1156 MB/s +test misc::easy0_32K ... bench: 51 ns/iter (+/- 2) = 643039 MB/s +test misc::easy1_1K ... 
bench: 41 ns/iter (+/- 1) = 25463 MB/s +test misc::easy1_1MB ... bench: 44 ns/iter (+/- 1) = 23831727 MB/s +test misc::easy1_32 ... bench: 40 ns/iter (+/- 2) = 1300 MB/s +test misc::easy1_32K ... bench: 40 ns/iter (+/- 1) = 819700 MB/s +test misc::hard_1K ... bench: 52 ns/iter (+/- 1) = 20211 MB/s +test misc::hard_1MB ... bench: 57 ns/iter (+/- 0) = 18396543 MB/s +test misc::hard_32 ... bench: 51 ns/iter (+/- 0) = 1156 MB/s +test misc::hard_32K ... bench: 51 ns/iter (+/- 3) = 643039 MB/s +test misc::is_match_set ... bench: 61 ns/iter (+/- 2) = 409 MB/s +test misc::literal ... bench: 14 ns/iter (+/- 0) = 3642 MB/s +test misc::long_needle1 ... bench: 3,249 ns/iter (+/- 87) = 30779 MB/s +test misc::long_needle2 ... bench: 350,559 ns/iter (+/- 7,154) = 285 MB/s +test misc::match_class ... bench: 61 ns/iter (+/- 4) = 1327 MB/s +test misc::match_class_in_range ... bench: 14 ns/iter (+/- 0) = 5785 MB/s +test misc::matches_set ... bench: 401 ns/iter (+/- 17) = 62 MB/s +test misc::medium_1K ... bench: 53 ns/iter (+/- 0) = 19849 MB/s +test misc::medium_1MB ... bench: 58 ns/iter (+/- 0) = 18079379 MB/s +test misc::medium_32 ... bench: 53 ns/iter (+/- 0) = 1132 MB/s +test misc::medium_32K ... bench: 53 ns/iter (+/- 2) = 618792 MB/s +test misc::no_exponential ... bench: 421 ns/iter (+/- 8) = 237 MB/s +test misc::not_literal ... bench: 90 ns/iter (+/- 0) = 566 MB/s +test misc::one_pass_long_prefix ... bench: 53 ns/iter (+/- 1) = 490 MB/s +test misc::one_pass_long_prefix_not ... bench: 53 ns/iter (+/- 0) = 490 MB/s +test misc::one_pass_short ... bench: 38 ns/iter (+/- 0) = 447 MB/s +test misc::one_pass_short_not ... bench: 42 ns/iter (+/- 3) = 404 MB/s +test misc::reallyhard2_1K ... bench: 77 ns/iter (+/- 1) = 13506 MB/s +test misc::reallyhard_1K ... bench: 1,592 ns/iter (+/- 1) = 660 MB/s +test misc::reallyhard_1MB ... bench: 1,575,759 ns/iter (+/- 49,997) = 665 MB/s +test misc::reallyhard_32 ... bench: 102 ns/iter (+/- 2) = 578 MB/s +test misc::reallyhard_32K ... bench: 49,326 ns/iter (+/- 1,055) = 664 MB/s +test misc::reverse_suffix_no_quadratic ... bench: 4,161 ns/iter (+/- 94) = 1922 MB/s +test regexdna::find_new_lines ... bench: 12,344,799 ns/iter (+/- 188,054) = 411 MB/s +test regexdna::subst1 ... bench: 780,449 ns/iter (+/- 14,474) = 6513 MB/s +test regexdna::subst10 ... bench: 795,203 ns/iter (+/- 40,742) = 6392 MB/s +test regexdna::subst11 ... bench: 816,444 ns/iter (+/- 23,334) = 6226 MB/s +test regexdna::subst2 ... bench: 777,546 ns/iter (+/- 19,625) = 6537 MB/s +test regexdna::subst3 ... bench: 783,295 ns/iter (+/- 8,266) = 6489 MB/s +test regexdna::subst4 ... bench: 775,154 ns/iter (+/- 21,350) = 6557 MB/s +test regexdna::subst5 ... bench: 781,414 ns/iter (+/- 21,057) = 6505 MB/s +test regexdna::subst6 ... bench: 783,595 ns/iter (+/- 23,835) = 6487 MB/s +test regexdna::subst7 ... bench: 821,620 ns/iter (+/- 46,131) = 6187 MB/s +test regexdna::subst8 ... bench: 818,402 ns/iter (+/- 21,350) = 6211 MB/s +test regexdna::subst9 ... bench: 779,115 ns/iter (+/- 21,335) = 6524 MB/s +test regexdna::variant1 ... bench: 2,189,308 ns/iter (+/- 32,528) = 2321 MB/s +test regexdna::variant2 ... bench: 3,217,478 ns/iter (+/- 36,011) = 1579 MB/s +test regexdna::variant3 ... bench: 3,771,330 ns/iter (+/- 74,944) = 1347 MB/s +test regexdna::variant4 ... bench: 3,787,593 ns/iter (+/- 37,825) = 1342 MB/s +test regexdna::variant5 ... bench: 2,669,799 ns/iter (+/- 69,777) = 1904 MB/s +test regexdna::variant6 ... bench: 2,651,559 ns/iter (+/- 33,895) = 1917 MB/s +test regexdna::variant7 ... 
bench: 3,222,991 ns/iter (+/- 41,014) = 1577 MB/s +test regexdna::variant8 ... bench: 3,298,048 ns/iter (+/- 41,331) = 1541 MB/s +test regexdna::variant9 ... bench: 3,218,486 ns/iter (+/- 50,318) = 1579 MB/s +test rust_compile::compile_huge ... bench: 100,031 ns/iter (+/- 3,464) +test rust_compile::compile_huge_bytes ... bench: 5,885,102 ns/iter (+/- 130,016) +test rust_compile::compile_huge_full ... bench: 11,641,251 ns/iter (+/- 147,700) +test rust_compile::compile_simple ... bench: 4,263 ns/iter (+/- 116) +test rust_compile::compile_simple_bytes ... bench: 4,236 ns/iter (+/- 91) +test rust_compile::compile_simple_full ... bench: 22,349 ns/iter (+/- 2,085) +test rust_compile::compile_small ... bench: 9,537 ns/iter (+/- 298) +test rust_compile::compile_small_bytes ... bench: 178,561 ns/iter (+/- 3,796) +test rust_compile::compile_small_full ... bench: 363,343 ns/iter (+/- 9,481) +test sherlock::before_after_holmes ... bench: 907,022 ns/iter (+/- 19,133) = 655 MB/s +test sherlock::before_holmes ... bench: 63,729 ns/iter (+/- 1,830) = 9335 MB/s +test sherlock::everything_greedy ... bench: 2,181,593 ns/iter (+/- 46,002) = 272 MB/s +test sherlock::everything_greedy_nl ... bench: 884,811 ns/iter (+/- 26,211) = 672 MB/s +test sherlock::holmes_cochar_watson ... bench: 105,610 ns/iter (+/- 3,120) = 5633 MB/s +test sherlock::holmes_coword_watson ... bench: 480,986 ns/iter (+/- 13,228) = 1236 MB/s +test sherlock::ing_suffix ... bench: 322,921 ns/iter (+/- 3,555) = 1842 MB/s +test sherlock::ing_suffix_limited_space ... bench: 1,065,372 ns/iter (+/- 21,242) = 558 MB/s +test sherlock::letters ... bench: 22,109,015 ns/iter (+/- 146,243) = 26 MB/s +test sherlock::letters_lower ... bench: 21,686,153 ns/iter (+/- 206,041) = 27 MB/s +test sherlock::letters_upper ... bench: 1,778,225 ns/iter (+/- 25,935) = 334 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 897,355 ns/iter (+/- 26,781) = 662 MB/s +test sherlock::name_alt1 ... bench: 31,927 ns/iter (+/- 633) = 18634 MB/s +test sherlock::name_alt2 ... bench: 87,040 ns/iter (+/- 1,859) = 6835 MB/s +test sherlock::name_alt3 ... bench: 97,715 ns/iter (+/- 2,109) = 6088 MB/s +test sherlock::name_alt3_nocase ... bench: 944,955 ns/iter (+/- 26,503) = 629 MB/s +test sherlock::name_alt4 ... bench: 120,935 ns/iter (+/- 2,399) = 4919 MB/s +test sherlock::name_alt4_nocase ... bench: 228,597 ns/iter (+/- 7,137) = 2602 MB/s +test sherlock::name_alt5 ... bench: 91,174 ns/iter (+/- 1,096) = 6525 MB/s +test sherlock::name_alt5_nocase ... bench: 937,189 ns/iter (+/- 23,839) = 634 MB/s +test sherlock::name_holmes ... bench: 34,020 ns/iter (+/- 752) = 17487 MB/s +test sherlock::name_holmes_nocase ... bench: 117,194 ns/iter (+/- 3,444) = 5076 MB/s +test sherlock::name_sherlock ... bench: 22,557 ns/iter (+/- 388) = 26374 MB/s +test sherlock::name_sherlock_holmes ... bench: 22,428 ns/iter (+/- 683) = 26526 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 99,637 ns/iter (+/- 636) = 5971 MB/s +test sherlock::name_sherlock_nocase ... bench: 97,895 ns/iter (+/- 1,875) = 6077 MB/s +test sherlock::name_whitespace ... bench: 30,772 ns/iter (+/- 1,591) = 19333 MB/s +test sherlock::no_match_common ... bench: 19,665 ns/iter (+/- 296) = 30253 MB/s +test sherlock::no_match_really_common ... bench: 27,403 ns/iter (+/- 2,507) = 21710 MB/s +test sherlock::no_match_uncommon ... bench: 19,601 ns/iter (+/- 293) = 30352 MB/s +test sherlock::quotes ... bench: 370,323 ns/iter (+/- 1,345) = 1606 MB/s +test sherlock::repeated_class_negation ... 
bench: 68,414,794 ns/iter (+/- 342,428) = 8 MB/s +test sherlock::the_lower ... bench: 327,767 ns/iter (+/- 5,493) = 1815 MB/s +test sherlock::the_nocase ... bench: 507,818 ns/iter (+/- 1,796) = 1171 MB/s +test sherlock::the_upper ... bench: 45,045 ns/iter (+/- 1,400) = 13207 MB/s +test sherlock::the_whitespace ... bench: 822,080 ns/iter (+/- 16,581) = 723 MB/s +test sherlock::word_ending_n ... bench: 1,690,084 ns/iter (+/- 40,361) = 352 MB/s +test sherlock::words ... bench: 8,573,617 ns/iter (+/- 143,313) = 69 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 107 measured; 0 filtered out; finished in 110.03s + diff --git a/bench/log/10-last-frontier/rust-bytes-before-literal.log b/bench/log/10-last-frontier/rust-bytes-before-literal.log new file mode 100644 index 000000000..7016e3c56 --- /dev/null +++ b/bench/log/10-last-frontier/rust-bytes-before-literal.log @@ -0,0 +1,112 @@ + +running 107 tests +test misc::anchored_literal_long_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_long_non_match ... bench: 19 ns/iter (+/- 1) = 20526 MB/s +test misc::anchored_literal_short_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::anchored_literal_short_non_match ... bench: 20 ns/iter (+/- 0) = 1300 MB/s +test misc::easy0_1K ... bench: 14 ns/iter (+/- 0) = 75071 MB/s +test misc::easy0_1MB ... bench: 21 ns/iter (+/- 0) = 49933476 MB/s +test misc::easy0_32 ... bench: 14 ns/iter (+/- 0) = 4214 MB/s +test misc::easy0_32K ... bench: 14 ns/iter (+/- 0) = 2342500 MB/s +test misc::easy1_1K ... bench: 41 ns/iter (+/- 0) = 25463 MB/s +test misc::easy1_1MB ... bench: 48 ns/iter (+/- 0) = 21845750 MB/s +test misc::easy1_32 ... bench: 41 ns/iter (+/- 0) = 1268 MB/s +test misc::easy1_32K ... bench: 41 ns/iter (+/- 1) = 799707 MB/s +test misc::hard_1K ... bench: 51 ns/iter (+/- 1) = 20607 MB/s +test misc::hard_1MB ... bench: 56 ns/iter (+/- 2) = 18725053 MB/s +test misc::hard_32 ... bench: 51 ns/iter (+/- 6) = 1156 MB/s +test misc::hard_32K ... bench: 51 ns/iter (+/- 1) = 643039 MB/s +test misc::is_match_set ... bench: 62 ns/iter (+/- 2) = 403 MB/s +test misc::literal ... bench: 13 ns/iter (+/- 0) = 3923 MB/s +test misc::long_needle1 ... bench: 2,825 ns/iter (+/- 57) = 35398 MB/s +test misc::long_needle2 ... bench: 350,755 ns/iter (+/- 11,905) = 285 MB/s +test misc::match_class ... bench: 64 ns/iter (+/- 1) = 1265 MB/s +test misc::match_class_in_range ... bench: 13 ns/iter (+/- 0) = 6230 MB/s +test misc::matches_set ... bench: 422 ns/iter (+/- 12) = 59 MB/s +test misc::medium_1K ... bench: 15 ns/iter (+/- 0) = 70133 MB/s +test misc::medium_1MB ... bench: 21 ns/iter (+/- 0) = 49933523 MB/s +test misc::medium_32 ... bench: 15 ns/iter (+/- 0) = 4000 MB/s +test misc::medium_32K ... bench: 14 ns/iter (+/- 0) = 2342571 MB/s +test misc::no_exponential ... bench: 443 ns/iter (+/- 12) = 225 MB/s +test misc::not_literal ... bench: 89 ns/iter (+/- 1) = 573 MB/s +test misc::one_pass_long_prefix ... bench: 52 ns/iter (+/- 1) = 500 MB/s +test misc::one_pass_long_prefix_not ... bench: 52 ns/iter (+/- 1) = 500 MB/s +test misc::one_pass_short ... bench: 40 ns/iter (+/- 1) = 425 MB/s +test misc::one_pass_short_not ... bench: 42 ns/iter (+/- 0) = 404 MB/s +test misc::reallyhard2_1K ... bench: 80 ns/iter (+/- 0) = 13000 MB/s +test misc::reallyhard_1K ... bench: 1,592 ns/iter (+/- 1) = 660 MB/s +test misc::reallyhard_1MB ... bench: 1,575,789 ns/iter (+/- 34,236) = 665 MB/s +test misc::reallyhard_32 ... bench: 101 ns/iter (+/- 2) = 584 MB/s +test misc::reallyhard_32K ... 
bench: 49,321 ns/iter (+/- 2,718) = 664 MB/s +test misc::reverse_suffix_no_quadratic ... bench: 4,158 ns/iter (+/- 93) = 1924 MB/s +test regexdna::find_new_lines ... bench: 12,391,732 ns/iter (+/- 180,913) = 410 MB/s +test regexdna::subst1 ... bench: 781,690 ns/iter (+/- 29,637) = 6503 MB/s +test regexdna::subst10 ... bench: 778,306 ns/iter (+/- 22,706) = 6531 MB/s +test regexdna::subst11 ... bench: 777,716 ns/iter (+/- 24,635) = 6536 MB/s +test regexdna::subst2 ... bench: 791,786 ns/iter (+/- 15,778) = 6420 MB/s +test regexdna::subst3 ... bench: 783,470 ns/iter (+/- 25,543) = 6488 MB/s +test regexdna::subst4 ... bench: 814,902 ns/iter (+/- 14,146) = 6238 MB/s +test regexdna::subst5 ... bench: 781,464 ns/iter (+/- 19,532) = 6504 MB/s +test regexdna::subst6 ... bench: 780,116 ns/iter (+/- 16,558) = 6516 MB/s +test regexdna::subst7 ... bench: 795,982 ns/iter (+/- 11,254) = 6386 MB/s +test regexdna::subst8 ... bench: 781,746 ns/iter (+/- 24,996) = 6502 MB/s +test regexdna::subst9 ... bench: 783,793 ns/iter (+/- 14,943) = 6485 MB/s +test regexdna::variant1 ... bench: 2,188,940 ns/iter (+/- 42,308) = 2322 MB/s +test regexdna::variant2 ... bench: 3,218,011 ns/iter (+/- 50,700) = 1579 MB/s +test regexdna::variant3 ... bench: 3,778,907 ns/iter (+/- 90,543) = 1345 MB/s +test regexdna::variant4 ... bench: 3,803,852 ns/iter (+/- 68,319) = 1336 MB/s +test regexdna::variant5 ... bench: 2,660,949 ns/iter (+/- 55,488) = 1910 MB/s +test regexdna::variant6 ... bench: 2,647,131 ns/iter (+/- 26,846) = 1920 MB/s +test regexdna::variant7 ... bench: 3,235,032 ns/iter (+/- 37,599) = 1571 MB/s +test regexdna::variant8 ... bench: 3,305,124 ns/iter (+/- 67,109) = 1538 MB/s +test regexdna::variant9 ... bench: 3,231,033 ns/iter (+/- 55,626) = 1573 MB/s +test rust_compile::compile_huge ... bench: 99,387 ns/iter (+/- 2,366) +test rust_compile::compile_huge_bytes ... bench: 5,865,693 ns/iter (+/- 62,255) +test rust_compile::compile_huge_full ... bench: 11,752,845 ns/iter (+/- 195,440) +test rust_compile::compile_simple ... bench: 4,117 ns/iter (+/- 141) +test rust_compile::compile_simple_bytes ... bench: 4,162 ns/iter (+/- 67) +test rust_compile::compile_simple_full ... bench: 19,955 ns/iter (+/- 622) +test rust_compile::compile_small ... bench: 9,140 ns/iter (+/- 112) +test rust_compile::compile_small_bytes ... bench: 165,990 ns/iter (+/- 5,876) +test rust_compile::compile_small_full ... bench: 342,897 ns/iter (+/- 13,730) +test sherlock::before_after_holmes ... bench: 906,789 ns/iter (+/- 13,931) = 656 MB/s +test sherlock::before_holmes ... bench: 62,319 ns/iter (+/- 790) = 9546 MB/s +test sherlock::everything_greedy ... bench: 2,175,424 ns/iter (+/- 47,720) = 273 MB/s +test sherlock::everything_greedy_nl ... bench: 884,406 ns/iter (+/- 22,679) = 672 MB/s +test sherlock::holmes_cochar_watson ... bench: 105,261 ns/iter (+/- 3,536) = 5651 MB/s +test sherlock::holmes_coword_watson ... bench: 479,524 ns/iter (+/- 7,749) = 1240 MB/s +test sherlock::ing_suffix ... bench: 321,401 ns/iter (+/- 9,123) = 1851 MB/s +test sherlock::ing_suffix_limited_space ... bench: 1,069,722 ns/iter (+/- 16,366) = 556 MB/s +test sherlock::letters ... bench: 21,959,896 ns/iter (+/- 204,695) = 27 MB/s +test sherlock::letters_lower ... bench: 21,462,457 ns/iter (+/- 207,449) = 27 MB/s +test sherlock::letters_upper ... bench: 1,768,026 ns/iter (+/- 41,459) = 336 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 897,197 ns/iter (+/- 14,349) = 663 MB/s +test sherlock::name_alt1 ... 
bench: 34,037 ns/iter (+/- 719) = 17479 MB/s +test sherlock::name_alt2 ... bench: 86,788 ns/iter (+/- 1,203) = 6855 MB/s +test sherlock::name_alt3 ... bench: 98,225 ns/iter (+/- 1,589) = 6056 MB/s +test sherlock::name_alt3_nocase ... bench: 377,597 ns/iter (+/- 14,840) = 1575 MB/s +test sherlock::name_alt4 ... bench: 122,440 ns/iter (+/- 8,123) = 4858 MB/s +test sherlock::name_alt4_nocase ... bench: 187,282 ns/iter (+/- 5,176) = 3176 MB/s +test sherlock::name_alt5 ... bench: 91,429 ns/iter (+/- 1,944) = 6507 MB/s +test sherlock::name_alt5_nocase ... bench: 348,111 ns/iter (+/- 12,721) = 1709 MB/s +test sherlock::name_holmes ... bench: 33,547 ns/iter (+/- 1,119) = 17734 MB/s +test sherlock::name_holmes_nocase ... bench: 132,342 ns/iter (+/- 3,974) = 4495 MB/s +test sherlock::name_sherlock ... bench: 22,562 ns/iter (+/- 364) = 26368 MB/s +test sherlock::name_sherlock_holmes ... bench: 22,313 ns/iter (+/- 579) = 26663 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 97,556 ns/iter (+/- 2,092) = 6098 MB/s +test sherlock::name_sherlock_nocase ... bench: 95,917 ns/iter (+/- 4,054) = 6202 MB/s +test sherlock::name_whitespace ... bench: 30,997 ns/iter (+/- 1,039) = 19193 MB/s +test sherlock::no_match_common ... bench: 19,690 ns/iter (+/- 378) = 30214 MB/s +test sherlock::no_match_really_common ... bench: 27,629 ns/iter (+/- 465) = 21532 MB/s +test sherlock::no_match_uncommon ... bench: 19,681 ns/iter (+/- 291) = 30228 MB/s +test sherlock::quotes ... bench: 368,290 ns/iter (+/- 1,508) = 1615 MB/s +test sherlock::repeated_class_negation ... bench: 73,004,024 ns/iter (+/- 1,040,743) = 8 MB/s +test sherlock::the_lower ... bench: 320,929 ns/iter (+/- 12,287) = 1853 MB/s +test sherlock::the_nocase ... bench: 514,946 ns/iter (+/- 11,241) = 1155 MB/s +test sherlock::the_upper ... bench: 43,816 ns/iter (+/- 1,719) = 13577 MB/s +test sherlock::the_whitespace ... bench: 825,245 ns/iter (+/- 20,797) = 720 MB/s +test sherlock::word_ending_n ... bench: 1,676,908 ns/iter (+/- 40,650) = 354 MB/s +test sherlock::words ... bench: 8,449,099 ns/iter (+/- 123,842) = 70 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 107 measured; 0 filtered out; finished in 128.47s + diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs new file mode 100644 index 000000000..de46db291 --- /dev/null +++ b/regex-syntax/src/hir/literal.rs @@ -0,0 +1,2767 @@ +/*! +Provides literal extraction from `Hir` expressions. + +An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a +[`Seq`] of [`Literal`]s. + +The purpose of literal extraction is generally to provide avenues for +optimizing regex searches. The main idea is that substring searches can be an +order of magnitude faster than a regex search. Therefore, if one can execute +a substring search to find candidate match locations and only run the regex +search at those locations, then it is possible for huge improvements in +performance to be realized. + +With that said, literal optimizations are generally a black art because even +though substring search is generally faster, if the number of candidates +produced is high, then it can create a lot of overhead by ping-ponging between +the substring search and the regex search. + +Here are some heuristics that might be used to help increase the chances of +effective literal optimizations: + +* Stick to small [`Seq`]s. 
If you search for too many literals, it's likely +to lead to substring search that is only a little faster than a regex search, +and thus the overhead of using literal optimizations in the first place might +make things slower overall. +* The literals in your [`Seq`] shouldn't be too short. In general, longer is +better. A sequence corresponding to single bytes that occur frequently in the +haystack, for example, is probably a bad literal optimization because it's +likely to produce many false positive candidates. Longer literals are less +likely to match, and thus probably produce fewer false positives. +* If it's possible to estimate the approximate frequency of each byte according +to some pre-computed background distribution, it is possible to compute a score +of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider +skipping the literal optimization and just using the regex engine. + +(It should be noted that there are always pathological cases that can make +any kind of literal optimization be a net slower result. This is why it +might be a good idea to be conservative, or to even provide a means for +literal optimizations to be dynamically disabled if they are determined to be +ineffective according to some measure.) + +You're encouraged to explore the methods on [`Seq`], which permit shrinking +the size of sequences in a preference-order preserving fashion. + +Finally, note that it isn't strictly necessary to use an [`Extractor`]. Namely, +an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types, +so it is possible to implement your own extractor. For example, for n-grams +or "inner" literals (i.e., not prefix or suffix literals). The `Extractor` +is mostly responsible for the case analysis over `Hir` expressions. Much of +the "trickier" parts are how to combine literal sequences, and that is all +implemented on [`Seq`]. +*/ + +use core::{cmp, mem}; + +use alloc::{vec, vec::Vec}; + +use crate::hir::{self, Hir}; + +/// Extracts prefix or suffix literal sequences from [`Hir`] expressions. +/// +/// Literal extraction is based on the following observations: +/// +/// * Many regexes start with one or a small number of literals. +/// * Substring search for literals is often much faster (sometimes by an order +/// of magnitude) than a regex search. +/// +/// Thus, in many cases, one can search for literals to find candidate starting +/// locations of a match, and then only run the full regex engine at each such +/// location instead of over the full haystack. +/// +/// The main downside of literal extraction is that it can wind up causing a +/// search to be slower overall. For example, if there are many matches or if +/// there are many candidates that don't ultimately lead to a match, then a +/// lot of overhead will be spent in shuffling back-and-forth between substring +/// search and the regex engine. This is the fundamental reason why literal +/// optimizations for regex patterns are sometimes considered a "black art." +/// +/// # Look-around assertions +/// +/// Literal extraction treats all look-around assertions as-if they match every +/// empty string. So for example, the regex `\bquux\b` will yield a sequence +/// containing a single exact literal `quux`. However, not all occurrences +/// of `quux` correspond to a match of the regex. For example, `\bquux\b` +/// does not match `ZquuxZ` anywhere because `quux` does not fall on a word +/// boundary.
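+/// For example, here is a small illustrative doc sketch of the behavior just
+/// described; it relies only on the public API shown in the other examples in
+/// these docs:
+///
+/// ```
+/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser};
+///
+/// let hir = Parser::new().parse(r"\bquux\b")?;
+///
+/// // The word boundaries are treated as if they match the empty string,
+/// // so the extracted sequence is the single exact literal 'quux'.
+/// let got = Extractor::new().extract(&hir);
+/// let expected = Seq::from_iter([Literal::exact("quux")]);
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```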
+/// +/// In effect, if your regex contains look-around assertions, then a match of +/// an exact literal does not necessarily mean the regex overall matches. So +/// you may still need to run the regex engine in such cases to confirm the +/// match. +/// +/// The precise guarantee you get from a literal sequence is: if every literal +/// in the sequence is exact and the original regex contains zero look-around +/// assertions, then a preference-order multi-substring search of those +/// literals will precisely match a preference-order search of the original +/// regex. +/// +/// # Example +/// +/// This shows how to extract prefixes: +/// +/// ``` +/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser}; +/// +/// let hir = Parser::new().parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?; +/// +/// let got = Extractor::new().extract(&hir); +/// // All literals returned are "inexact" because none of them reach the +/// // match state. +/// let expected = Seq::from_iter([ +/// Literal::inexact("ax"), +/// Literal::inexact("ay"), +/// Literal::inexact("az"), +/// Literal::inexact("bx"), +/// Literal::inexact("by"), +/// Literal::inexact("bz"), +/// Literal::inexact("cx"), +/// Literal::inexact("cy"), +/// Literal::inexact("cz"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box>(()) +/// ``` +/// +/// This shows how to extract suffixes: +/// +/// ``` +/// use regex_syntax::{ +/// hir::literal::{Extractor, ExtractKind, Literal, Seq}, +/// Parser, +/// }; +/// +/// let hir = Parser::new().parse(r"foo|[A-Z]+bar")?; +/// +/// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir); +/// // Since 'foo' gets to a match state, it is considered exact. But 'bar' +/// // does not because of the '[A-Z]+', and thus is marked inexact. +/// let expected = Seq::from_iter([ +/// Literal::exact("foo"), +/// Literal::inexact("bar"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Extractor { + kind: ExtractKind, + limit_class: usize, + limit_repeat: usize, + limit_literal_len: usize, + limit_total: usize, +} + +impl Extractor { + /// Create a new extractor with a default configuration. + /// + /// The extractor can be optionally configured before calling + /// [`Extractor::extract`] to get a literal sequence. + pub fn new() -> Extractor { + Extractor { + kind: ExtractKind::Prefix, + limit_class: 10, + limit_repeat: 10, + limit_literal_len: 100, + limit_total: 250, + } + } + + /// Set the kind of literal sequence to extract from an [`Hir`] expression. + /// + /// The default is to extract prefixes, but suffixes can be selected + /// instead. The contract for prefixes is that every match of the + /// corresponding `Hir` must start with one of the literals in the sequence + /// returned. Moreover, the _order_ of the sequence returned corresponds to + /// the preference order. + /// + /// Suffixes satisfy a similar contract in that every match of the + /// corresponding `Hir` must end with one of the literals in the sequence + /// returned. However, there is no guarantee that the literals are in + /// preference order. + /// + /// Remember that a sequence can be infinite. For example, unless the + /// limits are configured to be impractically large, attempting to extract + /// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite + /// sequence. Generally speaking, if the sequence returned is infinite, + /// then it is presumed to be unwise to do prefix (or suffix) optimizations + /// for the pattern. 
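+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch (not exhaustive); the pattern and expected
+    /// sequence below mirror the suffix example in the type-level docs:
+    ///
+    /// ```
+    /// use regex_syntax::{
+    ///     hir::literal::{ExtractKind, Extractor, Literal, Seq},
+    ///     Parser,
+    /// };
+    ///
+    /// let hir = Parser::new().parse(r"[A-Z]+end")?;
+    ///
+    /// // Prefix extraction (the default) can't say much about how this
+    /// // pattern starts, but suffix extraction can still report 'end',
+    /// // marked inexact because of the preceding '[A-Z]+'.
+    /// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir);
+    /// let expected = Seq::from_iter([Literal::inexact("end")]);
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```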
+ pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor { + self.kind = kind; + self + } + + /// Configure a limit on the length of the sequence that is permitted for + /// a character class. If a character class exceeds this limit, then the + /// sequence returned for it is infinite. + /// + /// This prevents classes like `[A-Z]` or `\pL` from getting turned into + /// huge and likely unproductive sequences of literals. + /// + /// # Example + /// + /// This example shows how this limit can be lowered to decrease the tolerance + /// for character classes being turned into literal sequences. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Seq}, Parser}; + /// + /// let hir = Parser::new().parse(r"[0-9]")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::from_iter([ + /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + /// ]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_class(4).extract(&hir); + /// let expected = Seq::infinite(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_class(&mut self, limit: usize) -> &mut Extractor { + self.limit_class = limit; + self + } + + /// Configure a limit on the total number of repetitions that is permitted + /// before literal extraction is stopped. + /// + /// This is useful for limiting things like `(abcde){50}`, or more + /// insidiously, `(?:){1000000000}`. This limit prevents any one single + /// repetition from adding too much to a literal sequence. + /// + /// With this limit set, repetitions that exceed it will be stopped and any + /// literals extracted up to that point will be made inexact. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser}; + /// + /// let hir = Parser::new().parse(r"(abc){8}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::from_iter(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_repeat(4).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabc"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor { + self.limit_repeat = limit; + self + } + + /// Configure a limit on the maximum length of any literal in a sequence. + /// + /// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While + /// each repetition or literal in that regex is small, when all the + /// repetitions are applied, one ends up with a literal of length `5^4 = + /// 625`. + /// + /// With this limit set, literals that exceed it will be made inexact and + /// thus prevented from growing. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser}; + /// + /// let hir = Parser::new().parse(r"(abc){2}{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::from_iter(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. 
+ /// let got = Extractor::new().limit_literal_len(14).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabcab"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor { + self.limit_literal_len = limit; + self + } + + /// Configure a limit on the total number of literals that will be + /// returned. + /// + /// This is useful as a practical measure for avoiding the creation of + /// large sequences of literals. While the extractor will automatically + /// handle local creations of large sequences (for example, `[A-Z]` yields + /// an infinite sequence by default), large sequences can be created + /// through non-local means as well. + /// + /// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9` + /// despite each of the repetitions being small on their own. This limit + /// thus represents a "catch all" for avoiding locally small sequences from + /// combining into large sequences. + /// + /// # Example + /// + /// This example shows how reducing the limit will change the literal + /// sequence returned. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser}; + /// + /// let hir = Parser::new().parse(r"[ab]{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::from_iter([ + /// "aaaa", "aaab", "aaba", "aabb", + /// "abaa", "abab", "abba", "abbb", + /// "baaa", "baab", "baba", "babb", + /// "bbaa", "bbab", "bbba", "bbbb", + /// ]); + /// assert_eq!(expected, got); + /// + /// // The default limit is not too big, but big enough to extract all + /// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16, + /// // then we'll get a truncated set. Notice that it returns a sequence of + /// // length 4 even though our limit was 10. This is because the sequence + /// // is difficult to increase without blowing the limit. Notice also + /// // that every literal in the sequence is now inexact because they were + /// // stripped of some suffix. + /// let got = Extractor::new().limit_total(10).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("aa"), + /// Literal::inexact("ab"), + /// Literal::inexact("ba"), + /// Literal::inexact("bb"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_total(&mut self, limit: usize) -> &mut Extractor { + self.limit_total = limit; + self + } + + /// Execute the extractor and return a sequence of literals. + pub fn extract(&self, hir: &Hir) -> Seq { + use crate::hir::HirKind::*; + + match *hir.kind() { + Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Literal(hir::Literal(ref bytes)) => { + let mut seq = + Seq::singleton(self::Literal::exact(bytes.to_vec())); + self.enforce_literal_len(&mut seq); + seq + } + Class(hir::Class::Unicode(ref cls)) => { + self.extract_class_unicode(cls) + } + Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), + Repetition(ref rep) => self.extract_repetition(rep), + Group(hir::Group { ref hir, .. 
}) => self.extract(hir), + Concat(ref hirs) => match self.kind { + ExtractKind::Prefix => self.extract_concat(hirs.iter()), + ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()), + }, + Alternation(ref hirs) => { + // Unlike concat, we always union starting from the beginning, + // since the beginning corresponds to the highest preference, + // which doesn't change based on forwards vs reverse. + self.extract_alternation(hirs.iter()) + } + } + } + + /// Extract a sequence from the given concatenation. Sequences from each of + /// the child HIR expressions are combined via cross product. + /// + /// This short circuits once the cross product turns into a sequence + /// containing only inexact literals. + fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> Seq { + let mut seq = Seq::singleton(self::Literal::exact(vec![])); + for hir in it { + // If every element in the sequence is inexact, then a cross + // product will always be a no-op. Thus, there is nothing else we + // can add to it and can quit early. Note that this also includes + // infinite sequences. + if seq.is_inexact() { + break; + } + // Note that 'cross' also dispatches based on whether we're + // extracting prefixes or suffixes. + seq = self.cross(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence from the given alternation. + /// + /// This short circuits once the union turns into an infinite sequence. + fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>( + &self, + it: I, + ) -> Seq { + let mut seq = Seq::empty(); + for hir in it { + // Once our 'seq' is infinite, every subsequent union + // operation on it will itself always result in an + // infinite sequence. Thus, it can never change and we can + // short-circuit. + if !seq.is_finite() { + break; + } + seq = self.union(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence of literals from the given repetition. We do our + /// best. Some examples: + /// + /// 'a*' => [inexact(a), exact("")] + /// 'a*?' => [exact(""), inexact(a)] + /// 'a+' => [inexact(a)] + /// 'a{3}' => [exact(aaa)] + /// 'a{3,5}' => [inexact(aaa)] + /// + /// The key here really is making sure we get the 'inexact' vs 'exact' + /// attributes correct on each of the literals we add. For example, the + /// fact that 'a*' gives us an inexact 'a' and an exact empty string means + /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)] + /// literals being extracted, which might actually be a better prefilter + /// than just 'a'. + fn extract_repetition(&self, rep: &hir::Repetition) -> Seq { + let mut subseq = self.extract(&rep.hir); + match *rep { + hir::Repetition { min: 0, max, greedy, .. } => { + // When 'max=1', we can retain exactness, since 'a?' is + // equivalent to 'a|'. Similarly below, 'a??' is equivalent to + // '|a'. + if max != Some(1) { + subseq.make_inexact(); + } + let mut empty = Seq::singleton(Literal::exact(vec![])); + if !greedy { + mem::swap(&mut subseq, &mut empty); + } + self.union(subseq, &mut empty) + } + hir::Repetition { min, max: Some(max), .. } if min == max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + if usize::try_from(min).is_err() || min > limit { + seq.make_inexact(); + } + seq + } + hir::Repetition { min, max: Some(max), .. 
} if min < max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + seq.make_inexact(); + seq + } + hir::Repetition { .. } => { + subseq.make_inexact(); + subseq + } + } + } + + /// Convert the given Unicode class into a sequence of literals if the + /// class is small enough. If the class is too big, return an infinite + /// sequence. + fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq { + if self.class_over_limit_unicode(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for ch in r.start()..=r.end() { + seq.push(Literal::from(ch)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Convert the given byte class into a sequence of literals if the class + /// is small enough. If the class is too big, return an infinite sequence. + fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq { + if self.class_over_limit_bytes(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for b in r.start()..=r.end() { + seq.push(Literal::from(b)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Returns true if the given Unicode class exceeds the configured limits + /// on this extractor. + fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Returns true if the given byte class exceeds the configured limits on + /// this extractor. + fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool { + let mut count = 0; + for r in cls.iter() { + if count > self.limit_class { + return true; + } + count += r.len(); + } + count > self.limit_class + } + + /// Compute the cross product of the two sequences if the result would be + /// within configured limits. Otherwise, make `seq2` infinite and cross the + /// infinite sequence with `seq1`. + fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { + if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + if let ExtractKind::Suffix = self.kind { + seq1.cross_reverse(seq2); + } else { + seq1.cross_forward(seq2); + } + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + self.enforce_literal_len(&mut seq1); + seq1 + } + + /// Union the two sequences if the result would be within configured + /// limits. Otherwise, make `seq2` infinite and union the infinite sequence + /// with `seq1`. + fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { + if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + seq1.union(seq2); + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + seq1 + } + + /// Applies the literal length limit to the given sequence. If none of the + /// literals in the sequence exceed the limit, then this is a no-op. + fn enforce_literal_len(&self, seq: &mut Seq) { + let len = self.limit_literal_len; + match self.kind { + ExtractKind::Prefix => seq.keep_first_bytes(len), + ExtractKind::Suffix => seq.keep_last_bytes(len), + } + } +} + +impl Default for Extractor { + fn default() -> Extractor { + Extractor::new() + } +} + +/// The kind of literals to extract from an [`Hir`] expression. 
+/// +/// The default extraction kind is `Prefix`. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum ExtractKind { + /// Extracts only prefix literals from a regex. + Prefix, + /// Extracts only suffix literals from a regex. + /// + /// Note that the sequence returned by suffix literals currently may + /// not correctly represent leftmost-first or "preference" order match + /// semantics. + Suffix, +} + +impl ExtractKind { + /// Returns true if this kind is the `Prefix` variant. + pub fn is_prefix(&self) -> bool { + matches!(*self, ExtractKind::Prefix) + } + + /// Returns true if this kind is the `Suffix` variant. + pub fn is_suffix(&self) -> bool { + matches!(*self, ExtractKind::Suffix) + } +} + +impl Default for ExtractKind { + fn default() -> ExtractKind { + ExtractKind::Prefix + } +} + +/// A sequence of literals. +/// +/// A `Seq` is very much like a set in that it represents a union of its +/// members. That is, it corresponds to a set of literals where at least one +/// must match in order for a particular [`Hir`] expression to match. (Whether +/// this corresponds to the entire `Hir` expression, a prefix of it or a suffix +/// of it depends on how the `Seq` was extracted from the `Hir`.) +/// +/// It is also unlike a set in that multiple identical literals may appear, +/// and that the order of the literals in the `Seq` matters. For example, if +/// the sequence is `[sam, samwise]` and leftmost-first matching is used, then +/// `samwise` can never match and the sequence is equivalent to `[sam]`. +/// +/// # States of a sequence +/// +/// A `Seq` has a few different logical states to consider: +/// +/// * The sequence can represent "any" literal. When this happens, the set does +/// not have a finite size. The purpose of this state is to inhibit callers +/// from making assumptions about what literals are required in order to match +/// a particular [`Hir`] expression. Generally speaking, when a set is in this +/// state, literal optimizations are inhibited. A good example of a regex that +/// will cause this sort of set to apppear is `[A-Za-z]`. The character class +/// is just too big (and also too narrow) to be usefully expanded into 52 +/// different literals. (Note that the decision for when a seq should become +/// infinite is determined by the caller. A seq itself has no hard-coded +/// limits.) +/// * The sequence can be empty, in which case, it is an affirmative statement +/// that there are no literals that can match the corresponding `Hir`. +/// Consequently, the `Hir` never matches any input. For example, `[a&&b]`. +/// * The sequence can be non-empty, in which case, at least one of the +/// literals must match in order for the corresponding `Hir` to match. +/// +/// # Example +/// +/// This example shows how literal sequences can be simplified by stripping +/// suffixes and minimizing while maintaining preference order. +/// +/// ``` +/// use regex_syntax::hir::literal::{Literal, Seq}; +/// +/// let mut seq = Seq::from_iter(&[ +/// "farm", +/// "appliance", +/// "faraway", +/// "apple", +/// "fare", +/// "gap", +/// "applicant", +/// "applaud", +/// ]); +/// seq.keep_first_bytes(3); +/// seq.minimize_by_preference(); +/// // Notice that 'far' comes before 'app', which matches the order in the +/// // original sequence. This guarantees that leftmost-first semantics are +/// // not altered by simplifying the set. 
+/// let expected = Seq::from_iter([ +/// Literal::inexact("far"), +/// Literal::inexact("app"), +/// Literal::exact("gap"), +/// ]); +/// assert_eq!(expected, seq); +/// ``` +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Seq { + /// The members of this seq. + /// + /// When `None`, the seq represents all possible literals. That is, it + /// prevents one from making assumptions about specific literals in the + /// seq, and forces one to treat it as if any literal might be in the seq. + /// + /// Note that `Some(vec![])` is valid and corresponds to the empty seq of + /// literals, i.e., a regex that can never match. For example, `[a&&b]`. + /// It is distinct from `Some(vec![""])`, which corresponds to the seq + /// containing an empty string, which matches at every position. + literals: Option<Vec<Literal>>, +} + +impl Seq { + /// Returns an empty sequence. + /// + /// An empty sequence matches zero literals, and thus corresponds to a + /// regex that itself can never match. + #[inline] + pub fn empty() -> Seq { + Seq { literals: Some(vec![]) } + } + + /// Returns a sequence containing a single literal. + #[inline] + pub fn singleton(lit: Literal) -> Seq { + Seq { literals: Some(vec![lit]) } + } + + /// Returns a sequence of literals without a finite size and may contain + /// any literal. + /// + /// A sequence without finite size does not reveal anything about the + /// characteristics of the literals in its set. There are no fixed prefixes + /// or suffixes, nor are lower or upper bounds on the length of the literals + /// in the set known. + /// + /// This is useful to represent constructs in a regex that are "too big" + /// to usefully represent as a sequence of literals. For example, `[A-Za-z]`. + /// When sequences get too big, they lose their discriminating nature and + /// are more likely to produce false positives, which in turn makes them + /// less likely to speed up searches. + /// + /// More pragmatically, for many regexes, enumerating all possible literals + /// is itself not possible or might otherwise use too many resources. So + /// constraining the size of sets during extraction is a practical trade + /// off to make. + #[inline] + pub fn infinite() -> Seq { + Seq { literals: None } + } + + /// If this is a finite sequence, return its members as a slice of + /// literals. + /// + /// The slice returned may be empty, in which case, there are no literals + /// that can match this sequence. + #[inline] + pub fn literals(&self) -> Option<&[Literal]> { + self.literals.as_deref() + } + + /// Push a literal to the end of this sequence. + /// + /// If this sequence is not finite, then this is a no-op. + /// + /// Similarly, if the most recently added item of this sequence is + /// equivalent to the literal given, then it is not added. This reflects + /// a `Seq`'s "set like" behavior, and represents a practical trade off. + /// Namely, there is never any need to have two adjacent and equivalent + /// literals in the same sequence, _and_ it is easy to detect in some + /// cases. + #[inline] + pub fn push(&mut self, lit: Literal) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + if lits.last().map_or(false, |m| m == &lit) { + return; + } + lits.push(lit); + } + + /// Make all of the literals in this sequence inexact. + /// + /// This is a no-op if this sequence is not finite.
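+    ///
+    /// # Example
+    ///
+    /// A minimal illustrative sketch (the literals here are arbitrary):
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::{Literal, Seq};
+    ///
+    /// let mut seq = Seq::from_iter([
+    ///     Literal::exact("foo"),
+    ///     Literal::exact("bar"),
+    /// ]);
+    /// seq.make_inexact();
+    ///
+    /// let expected = Seq::from_iter([
+    ///     Literal::inexact("foo"),
+    ///     Literal::inexact("bar"),
+    /// ]);
+    /// assert_eq!(expected, seq);
+    /// ```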
+ #[inline] + pub fn make_inexact(&mut self) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + for lit in lits.iter_mut() { + lit.make_inexact(); + } + } + + /// Converts this sequence to an infinite sequence. + /// + /// This is a no-op if the sequence is already infinite. + #[inline] + pub fn make_infinite(&mut self) { + self.literals = None; + } + + /// Modify this sequence to contain the cross product between it and the + /// sequence given. + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("fooquux"), + /// Literal::exact("foobaz"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite prefixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior of this sequence is infinite. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_forward(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + let newcap = lits1.len().saturating_mul(lits2.len()); + for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) { + if !selflit.is_exact() { + lits1.push(selflit); + continue; + } + for otherlit in lits2.iter() { + let mut newlit = Literal::exact(Vec::with_capacity( + selflit.len() + otherlit.len(), + )); + newlit.extend(&selflit); + newlit.extend(&otherlit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + lits2.drain(..); + self.dedup(); + } + + /// Modify this sequence to contain the cross product between it and + /// the sequence given, where the sequences are treated as suffixes + /// instead of prefixes. Namely, the sequence `other` is *prepended* + /// to `self` (as opposed to `other` being *appended* to `self` in + /// [`Seq::cross_forward`]). + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("quuxfoo"), + /// Literal::inexact("bar"), + /// Literal::exact("bazfoo"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite suffixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior when this sequence is infinite. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_reverse(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + // We basically proceed as we do in 'cross_forward' at this point, + // except that the outer loop is now 'other' and the inner loop is now + // 'self'. That's because 'self' corresponds to suffixes and 'other' + // corresponds to the sequence we want to *prepend* to the suffixes. + let newcap = lits1.len().saturating_mul(lits2.len()); + let selflits = mem::replace(lits1, Vec::with_capacity(newcap)); + for (i, otherlit) in lits2.drain(..).enumerate() { + for selflit in selflits.iter() { + if !selflit.is_exact() { + // If the suffix isn't exact, then we can't prepend + // anything to it. However, we still want to keep it. But + // we only want to keep one of them, to avoid duplication. + // (The duplication is okay from a correctness perspective, + // but wasteful.) + if i == 0 { + lits1.push(selflit.clone()); + } + continue; + } + let mut newlit = Literal::exact(Vec::with_capacity( + otherlit.len() + selflit.len(), + )); + newlit.extend(&otherlit); + newlit.extend(&selflit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + self.dedup(); + } + + /// A helper function the corresponds to the subtle preamble for both + /// `cross_forward` and `cross_reverse`. In effect, it handles the cases + /// of infinite sequences for both `self` and `other`, as well as ensuring + /// that literals from `other` are drained even if they aren't used. 
+    fn cross_preamble<'a>(
+        &'a mut self,
+        other: &'a mut Seq,
+    ) -> Option<(&'a mut Vec<Literal>, &'a mut Vec<Literal>)> {
+        let lits2 = match other.literals {
+            None => {
+                // If our current seq contains the empty string and the seq
+                // we're adding matches any literal, then it follows that the
+                // current seq must now also match any literal.
+                //
+                // Otherwise, we just have to make sure everything in this
+                // sequence is inexact.
+                if self.min_literal_len() == Some(0) {
+                    *self = Seq::infinite();
+                } else {
+                    self.make_inexact();
+                }
+                return None;
+            }
+            Some(ref mut lits) => lits,
+        };
+        let lits1 = match self.literals {
+            None => {
+                // If we aren't going to make it to the end of this routine
+                // where lits2 is drained, then we need to do it now.
+                lits2.drain(..);
+                return None;
+            }
+            Some(ref mut lits) => lits,
+        };
+        Some((lits1, lits2))
+    }
+
+    /// Unions the `other` sequence into this one.
+    ///
+    /// The literals are always drained out of the given `other` sequence,
+    /// even if they are being unioned into an infinite sequence. This permits
+    /// the caller to reuse the `other` sequence in another context.
+    ///
+    /// Some literal deduping may be performed. If any deduping happens,
+    /// any leftmost-first or "preference" order match semantics will be
+    /// preserved.
+    ///
+    /// # Example
+    ///
+    /// This example shows basic usage.
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::Seq;
+    ///
+    /// let mut seq1 = Seq::from_iter(&["foo", "bar"]);
+    /// let mut seq2 = Seq::from_iter(&["bar", "quux", "foo"]);
+    /// seq1.union(&mut seq2);
+    ///
+    /// // The literals are pulled out of seq2.
+    /// assert_eq!(Some(0), seq2.len());
+    ///
+    /// // Adjacent literals are deduped, but non-adjacent literals may not be.
+    /// assert_eq!(Seq::from_iter(&["foo", "bar", "quux", "foo"]), seq1);
+    /// ```
+    ///
+    /// This example shows that literals are drained from `other` even when
+    /// they aren't necessarily used.
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::Seq;
+    ///
+    /// let mut seq1 = Seq::infinite();
+    /// // Infinite sequences have no finite length.
+    /// assert_eq!(None, seq1.len());
+    ///
+    /// let mut seq2 = Seq::from_iter(&["bar", "quux", "foo"]);
+    /// seq1.union(&mut seq2);
+    ///
+    /// // seq1 is still infinite and seq2 has been drained.
+    /// assert_eq!(None, seq1.len());
+    /// assert_eq!(Some(0), seq2.len());
+    /// ```
+    #[inline]
+    pub fn union(&mut self, other: &mut Seq) {
+        let lits2 = match other.literals {
+            None => {
+                // Unioning with an infinite sequence always results in an
+                // infinite sequence.
+                self.make_infinite();
+                return;
+            }
+            Some(ref mut lits) => lits.drain(..),
+        };
+        let lits1 = match self.literals {
+            None => return,
+            Some(ref mut lits) => lits,
+        };
+        lits1.extend(lits2);
+        self.dedup();
+    }
+
+    /// Unions the `other` sequence into this one by splicing the `other`
+    /// sequence at the position of the first zero-length literal.
+    ///
+    /// This is useful for preserving preference order semantics when combining
+    /// two literal sequences. For example, in the regex `(a||f)+foo`, the
+    /// correct preference order prefix sequence is `[a, foo, f]`.
+    ///
+    /// The literals are always drained out of the given `other` sequence,
+    /// even if they are being unioned into an infinite sequence. This permits
+    /// the caller to reuse the `other` sequence in another context. Note that
+    /// the literals are drained even if no union is performed, i.e.,
+    /// when this sequence does not contain a zero-length literal.
+ /// + /// Some literal deduping may be performed. If any deduping happens, + /// any leftmost-first or "preference" order match semantics will be + /// preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::from_iter(&["a", "", "f", ""]); + /// let mut seq2 = Seq::from_iter(&["foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// // 'foo' gets spliced into seq1 where the first empty string occurs. + /// assert_eq!(Seq::from_iter(&["a", "foo", "f"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::from_iter(&["foo", "bar"]); + /// let mut seq2 = Seq::from_iter(&["bar", "quux", "foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // seq1 has no zero length literals, so no splicing happens. + /// assert_eq!(Seq::from_iter(&["foo", "bar"]), seq1); + /// // Even though no splicing happens, seq2 is still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union_into_empty(&mut self, other: &mut Seq) { + let lits2 = other.literals.as_mut().map(|lits| lits.drain(..)); + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + let first_empty = match lits1.iter().position(|m| m.is_empty()) { + None => return, + Some(i) => i, + }; + let lits2 = match lits2 { + None => { + // Note that we are only here if we've found an empty literal, + // which implies that an infinite sequence infects this seq and + // also turns it into an infinite sequence. + self.literals = None; + return; + } + Some(lits) => lits, + }; + // Clearing out the empties needs to come before the splice because + // the splice might add more empties that we don't want to get rid + // of. Since we're splicing into the position of the first empty, the + // 'first_empty' position computed above is still correct. + lits1.retain(|m| !m.is_empty()); + lits1.splice(first_empty..first_empty, lits2); + self.dedup(); + } + + /// Deduplicate adjacent equivalent literals in this sequence. + /// + /// If adjacent literals are equivalent strings but one is exact and the + /// other inexact, the inexact literal is kept and the exact one is + /// removed. + /// + /// Deduping an infinite sequence is a no-op. + /// + /// # Example + /// + /// This example shows how literals that are duplicate byte strings but + /// are not equivalent with respect to exactness are resolved. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("foo"), + /// ]); + /// seq.dedup(); + /// + /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq); + /// ``` + #[inline] + pub fn dedup(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.dedup_by(|lit1, lit2| { + if lit1.as_bytes() != lit2.as_bytes() { + return false; + } + if lit1.is_exact() != lit2.is_exact() { + lit1.make_inexact(); + lit2.make_inexact(); + } + true + }); + } + } + + /// Sorts this sequence of literals lexicographically. + /// + /// Note that if, before sorting, if a literal that is a prefix of another + /// literal appears after it, then after sorting, the sequence will not + /// represent the same preference order match semantics. 
For example, + /// sorting the sequence `[samwise, sam]` yields the sequence `[sam, + /// samwise]`. Under preference order semantics, the latter sequence will + /// never match `samwise` where as the first sequence can. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::from_iter(&["foo", "quux", "bar"]); + /// seq.sort(); + /// + /// assert_eq!(Seq::from_iter(&["bar", "foo", "quux"]), seq); + /// ``` + #[inline] + pub fn sort(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.sort(); + } + } + + /// Reverses all of the literals in this sequence. + /// + /// The order of the sequence itself is preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::from_iter(&["oof", "rab"]); + /// seq.reverse_literals(); + /// assert_eq!(Seq::from_iter(&["foo", "bar"]), seq); + /// ``` + #[inline] + pub fn reverse_literals(&mut self) { + if let Some(ref mut lits) = self.literals { + for lit in lits.iter_mut() { + lit.reverse(); + } + } + } + + /// Shrinks this seq to its minimal size while respecting the preference + /// order of its literals. + /// + /// While this routine will remove duplicate literals from this seq, it + /// will also remove literals that can never match in a leftmost-first or + /// "preference order" search. Similar to [`Seq::dedup`], if a literal is + /// deduped, then the one that remains is made inexact. + /// + /// This is a no-op on seqs that are empty or not finite. + /// + /// # Example + /// + /// This example shows the difference between `{sam, samwise}` and + /// `{samwise, sam}`. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // If 'sam' comes before 'samwise' and a preference order search is + /// // executed, then 'samwise' can never match. + /// let mut seq = Seq::from_iter(&["sam", "samwise"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq); + /// + /// // But if they are reversed, then it's possible for 'samwise' to match + /// // since it is given higher preference. + /// let mut seq = Seq::from_iter(&["samwise", "sam"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter(&["samwise", "sam"]), seq); + /// ``` + /// + /// This example shows that if an empty string is in this seq, then + /// anything that comes after it can never match. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // An empty string is a prefix of all strings, so it automatically + /// // inhibits any subsequent strings from matching. + /// let mut seq = Seq::from_iter(&["foo", "bar", "", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// let expected = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact("bar"), + /// Literal::inexact(""), + /// ]); + /// assert_eq!(expected, seq); + /// + /// // And of course, if it's at the beginning, then it makes it impossible + /// // for anything else to match. + /// let mut seq = Seq::from_iter(&["", "foo", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq); + /// ``` + #[inline] + pub fn minimize_by_preference(&mut self) { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits); + } + } + + /// Trims all literals in this seq such that only the first `len` bytes + /// remain. 
If a literal has less than or equal to `len` bytes, then it
+    /// remains unchanged. Otherwise, it is trimmed and made inexact.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::{Literal, Seq};
+    ///
+    /// let mut seq = Seq::from_iter(&["a", "foo", "quux"]);
+    /// seq.keep_first_bytes(2);
+    ///
+    /// let expected = Seq::from_iter([
+    ///     Literal::exact("a"),
+    ///     Literal::inexact("fo"),
+    ///     Literal::inexact("qu"),
+    /// ]);
+    /// assert_eq!(expected, seq);
+    /// ```
+    #[inline]
+    pub fn keep_first_bytes(&mut self, len: usize) {
+        if let Some(ref mut lits) = self.literals {
+            for m in lits.iter_mut() {
+                m.keep_first_bytes(len);
+            }
+        }
+    }
+
+    /// Trims all literals in this seq such that only the last `len` bytes
+    /// remain. If a literal has less than or equal to `len` bytes, then it
+    /// remains unchanged. Otherwise, it is trimmed and made inexact.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::{Literal, Seq};
+    ///
+    /// let mut seq = Seq::from_iter(&["a", "foo", "quux"]);
+    /// seq.keep_last_bytes(2);
+    ///
+    /// let expected = Seq::from_iter([
+    ///     Literal::exact("a"),
+    ///     Literal::inexact("oo"),
+    ///     Literal::inexact("ux"),
+    /// ]);
+    /// assert_eq!(expected, seq);
+    /// ```
+    #[inline]
+    pub fn keep_last_bytes(&mut self, len: usize) {
+        if let Some(ref mut lits) = self.literals {
+            for m in lits.iter_mut() {
+                m.keep_last_bytes(len);
+            }
+        }
+    }
+
+    /// Returns true if this sequence is finite.
+    ///
+    /// When false, this sequence is infinite and must be treated as if it
+    /// contains every possible literal.
+    #[inline]
+    pub fn is_finite(&self) -> bool {
+        self.literals.is_some()
+    }
+
+    /// Returns true if and only if this sequence is finite and empty.
+    ///
+    /// An empty sequence never matches anything. It can only be produced by
+    /// literal extraction when the corresponding regex itself cannot match.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len() == Some(0)
+    }
+
+    /// Returns the number of literals in this sequence if the sequence is
+    /// finite. If the sequence is infinite, then `None` is returned.
+    #[inline]
+    pub fn len(&self) -> Option<usize> {
+        self.literals.as_ref().map(|lits| lits.len())
+    }
+
+    /// Returns true if and only if all literals in this sequence are exact.
+    ///
+    /// This returns false if the sequence is infinite.
+    #[inline]
+    pub fn is_exact(&self) -> bool {
+        self.literals().map_or(false, |lits| lits.iter().all(|x| x.is_exact()))
+    }
+
+    /// Returns true if and only if all literals in this sequence are inexact.
+    ///
+    /// This returns true if the sequence is infinite.
+    #[inline]
+    pub fn is_inexact(&self) -> bool {
+        self.literals().map_or(true, |lits| lits.iter().all(|x| !x.is_exact()))
+    }
+
+    /// Return the maximum length of the sequence that would result from
+    /// unioning `self` with `other`. If either set is infinite, then this
+    /// returns `None`.
+    #[inline]
+    fn max_union_len(&self, other: &Seq) -> Option<usize> {
+        let len1 = self.len()?;
+        let len2 = other.len()?;
+        Some(len1.saturating_add(len2))
+    }
+
+    /// Return the maximum length of the sequence that would result from the
+    /// cross product of `self` with `other`. If either set is infinite, then
+    /// this returns `None`.
+    #[inline]
+    fn max_cross_len(&self, other: &Seq) -> Option<usize> {
+        let len1 = self.len()?;
+        let len2 = other.len()?;
+        Some(len1.saturating_mul(len2))
+    }
+
+    /// Returns the length of the shortest literal in this sequence.
+    ///
+    /// If the sequence is infinite or empty, then this returns `None`.
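+    ///
+    /// # Example
+    ///
+    /// A brief sketch of how the shortest length is reported; the literals
+    /// here are chosen purely for illustration.
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::Seq;
+    ///
+    /// let seq = Seq::from_iter(&["a", "foo", "quux"]);
+    /// assert_eq!(Some(1), seq.min_literal_len());
+    ///
+    /// // An infinite sequence has no knowable shortest literal.
+    /// assert_eq!(None, Seq::infinite().min_literal_len());
+    /// ```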
+    #[inline]
+    pub fn min_literal_len(&self) -> Option<usize> {
+        self.literals.as_ref()?.iter().map(|x| x.len()).min()
+    }
+
+    /// Returns the length of the longest literal in this sequence.
+    ///
+    /// If the sequence is infinite or empty, then this returns `None`.
+    #[inline]
+    pub fn max_literal_len(&self) -> Option<usize> {
+        self.literals.as_ref()?.iter().map(|x| x.len()).max()
+    }
+
+    /// Returns the longest common prefix from this seq.
+    ///
+    /// If the seq matches any literal or otherwise contains no literals, then
+    /// there is no meaningful prefix and this returns `None`.
+    ///
+    /// # Example
+    ///
+    /// This shows some example seqs and their longest common prefix.
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::Seq;
+    ///
+    /// let seq = Seq::from_iter(&["foo", "foobar", "fo"]);
+    /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix());
+    /// let seq = Seq::from_iter(&["foo", "foo"]);
+    /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix());
+    /// let seq = Seq::from_iter(&["foo", "bar"]);
+    /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix());
+    /// let seq = Seq::from_iter(&[""]);
+    /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix());
+    ///
+    /// let seq = Seq::infinite();
+    /// assert_eq!(None, seq.longest_common_prefix());
+    /// let seq = Seq::empty();
+    /// assert_eq!(None, seq.longest_common_prefix());
+    /// ```
+    #[inline]
+    pub fn longest_common_prefix(&self) -> Option<&[u8]> {
+        // If we match everything or match nothing, then there's no meaningful
+        // longest common prefix.
+        let lits = match self.literals {
+            None => return None,
+            Some(ref lits) => lits,
+        };
+        if lits.len() == 0 {
+            return None;
+        }
+        let base = lits[0].as_bytes();
+        let mut len = base.len();
+        for m in lits.iter().skip(1) {
+            len = m
+                .as_bytes()
+                .iter()
+                .zip(base[..len].iter())
+                .take_while(|&(a, b)| a == b)
+                .count();
+            if len == 0 {
+                return Some(&[]);
+            }
+        }
+        Some(&base[..len])
+    }
+
+    /// Returns the longest common suffix from this seq.
+    ///
+    /// If the seq matches any literal or otherwise contains no literals, then
+    /// there is no meaningful suffix and this returns `None`.
+    ///
+    /// # Example
+    ///
+    /// This shows some example seqs and their longest common suffix.
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::Seq;
+    ///
+    /// let seq = Seq::from_iter(&["oof", "raboof", "of"]);
+    /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix());
+    /// let seq = Seq::from_iter(&["foo", "foo"]);
+    /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix());
+    /// let seq = Seq::from_iter(&["foo", "bar"]);
+    /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix());
+    /// let seq = Seq::from_iter(&[""]);
+    /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix());
+    ///
+    /// let seq = Seq::infinite();
+    /// assert_eq!(None, seq.longest_common_suffix());
+    /// let seq = Seq::empty();
+    /// assert_eq!(None, seq.longest_common_suffix());
+    /// ```
+    #[inline]
+    pub fn longest_common_suffix(&self) -> Option<&[u8]> {
+        // If we match everything or match nothing, then there's no meaningful
+        // longest common suffix.
+        let lits = match self.literals {
+            None => return None,
+            Some(ref lits) => lits,
+        };
+        if lits.len() == 0 {
+            return None;
+        }
+        let base = lits[0].as_bytes();
+        let mut len = base.len();
+        for m in lits.iter().skip(1) {
+            len = m
+                .as_bytes()
+                .iter()
+                .rev()
+                .zip(base[base.len() - len..].iter().rev())
+                .take_while(|&(a, b)| a == b)
+                .count();
+            if len == 0 {
+                return Some(&[]);
+            }
+        }
+        Some(&base[base.len() - len..])
+    }
+}
+
+impl FromIterator<Literal> for Seq {
+    fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> Seq {
+        let mut seq = Seq::empty();
+        for literal in it {
+            seq.push(literal);
+        }
+        seq
+    }
+}
+
+/// Creates a sequence of exact literals from an iterator of byte strings.
+impl<B: AsRef<[u8]>> FromIterator<B> for Seq {
+    fn from_iter<T: IntoIterator<Item = B>>(it: T) -> Seq {
+        let mut seq = Seq::empty();
+        for literal in it {
+            seq.push(Literal::exact(literal.as_ref()));
+        }
+        seq
+    }
+}
+
+/// A single literal extracted from an [`Hir`] expression.
+///
+/// A literal is composed of two things:
+///
+/// * A sequence of bytes. No guarantees with respect to UTF-8 are provided.
+/// In particular, even if the regex a literal is extracted from is UTF-8, the
+/// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`]
+/// limit resulted in trimming a literal in a way that splits a codepoint.)
+/// * Whether the literal is "exact" or not. An "exact" literal means that it
+/// has not been trimmed, and may continue to be extended. If a literal is
+/// "exact" after visiting the entire `Hir` expression, then this implies that
+/// the literal leads to a match state. (Although it doesn't necessarily imply
+/// all occurrences of the literal correspond to a match of the regex, since
+/// literal extraction ignores look-around assertions.)
+#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)]
+pub struct Literal {
+    bytes: Vec<u8>,
+    exact: bool,
+}
+
+impl Literal {
+    /// Returns a new exact literal containing the bytes given.
+    #[inline]
+    pub fn exact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
+        Literal { bytes: bytes.into(), exact: true }
+    }
+
+    /// Returns a new inexact literal containing the bytes given.
+    #[inline]
+    pub fn inexact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
+        Literal { bytes: bytes.into(), exact: false }
+    }
+
+    /// Returns the bytes in this literal.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.bytes
+    }
+
+    /// Yields ownership of the bytes inside this literal.
+    ///
+    /// Note that this throws away whether the literal is "exact" or not.
+    #[inline]
+    pub fn into_bytes(self) -> Vec<u8> {
+        self.bytes
+    }
+
+    /// Returns the length of this literal in bytes.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.as_bytes().len()
+    }
+
+    /// Returns true if and only if this literal has zero bytes.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns true if and only if this literal is exact.
+    #[inline]
+    pub fn is_exact(&self) -> bool {
+        self.exact
+    }
+
+    /// Marks this literal as inexact.
+    ///
+    /// Inexact literals can never be extended. For example,
+    /// [`Seq::cross_forward`] will not extend inexact literals.
+    #[inline]
+    pub fn make_inexact(&mut self) {
+        self.exact = false;
+    }
+
+    /// Reverse the bytes in this literal.
+    #[inline]
+    pub fn reverse(&mut self) {
+        self.bytes.reverse();
+    }
+
+    /// Extend this literal with the literal given.
+    ///
+    /// If this literal is inexact, then this is a no-op.
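+    ///
+    /// # Example
+    ///
+    /// A small sketch of how extension interacts with exactness; the values
+    /// are chosen only for illustration.
+    ///
+    /// ```
+    /// use regex_syntax::hir::literal::Literal;
+    ///
+    /// let mut lit = Literal::exact("foo");
+    /// lit.extend(&Literal::exact("bar"));
+    /// assert_eq!(&b"foobar"[..], lit.as_bytes());
+    ///
+    /// // An inexact literal is never extended.
+    /// let mut lit = Literal::inexact("foo");
+    /// lit.extend(&Literal::exact("bar"));
+    /// assert_eq!(&b"foo"[..], lit.as_bytes());
+    /// ```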
+    #[inline]
+    pub fn extend(&mut self, lit: &Literal) {
+        if !self.is_exact() {
+            return;
+        }
+        self.bytes.extend_from_slice(&lit.bytes);
+    }
+
+    /// Trims this literal such that only the first `len` bytes remain. If
+    /// this literal has fewer than `len` bytes, then it remains unchanged.
+    /// Otherwise, the literal is marked as inexact.
+    #[inline]
+    pub fn keep_first_bytes(&mut self, len: usize) {
+        if len >= self.len() {
+            return;
+        }
+        self.make_inexact();
+        self.bytes.truncate(len);
+    }
+
+    /// Trims this literal such that only the last `len` bytes remain. If this
+    /// literal has fewer than `len` bytes, then it remains unchanged.
+    /// Otherwise, the literal is marked as inexact.
+    #[inline]
+    pub fn keep_last_bytes(&mut self, len: usize) {
+        if len >= self.len() {
+            return;
+        }
+        self.make_inexact();
+        self.bytes.drain(..self.len() - len);
+    }
+}
+
+impl From<u8> for Literal {
+    fn from(byte: u8) -> Literal {
+        Literal::exact(vec![byte])
+    }
+}
+
+impl From<char> for Literal {
+    fn from(ch: char) -> Literal {
+        use alloc::string::ToString;
+        Literal::exact(ch.encode_utf8(&mut [0; 4]).to_string())
+    }
+}
+
+impl core::fmt::Debug for Literal {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let tag = if self.exact { "E" } else { "I" };
+        f.debug_tuple(tag)
+            .field(&crate::debug::Bytes(self.as_bytes()))
+            .finish()
+    }
+}
+
+/// A "preference" trie that rejects literals that will never match when
+/// executing a leftmost first or "preference" search.
+///
+/// For example, if 'sam' is inserted, then trying to insert 'samwise' will be
+/// rejected because 'samwise' can never match since 'sam' will always take
+/// priority. However, if 'samwise' is inserted first, then inserting 'sam'
+/// after it is accepted. In this case, either 'samwise' or 'sam' can match in
+/// a "preference" search.
+///
+/// Note that we only use this trie as a "set." That is, given a sequence of
+/// literals, we insert each one in order. An `insert` will reject a literal
+/// if a prefix of that literal already exists in the trie. Thus, to rebuild
+/// the "minimal" sequence, we simply only keep literals that were successfully
+/// inserted. (Since we don't need traversal, one wonders whether we can make
+/// some simplifications here, but I haven't given it a ton of thought and I've
+/// never seen this show up on a profile.)
+#[derive(Debug, Default)]
+struct PreferenceTrie {
+    /// The states in this trie. The index of a state in this vector is its ID.
+    states: Vec<State>,
+    /// The index to allocate to the next literal added to this trie. Starts at
+    /// 0 and increments by 1 for every literal successfully added to the trie.
+    next_literal_index: usize,
+}
+
+/// A single state in a trie. Uses a sparse representation for its transitions.
+#[derive(Debug, Default)]
+struct State {
+    /// Sparse representation of the transitions out of this state. Transitions
+    /// are sorted by byte. There is at most one such transition for any
+    /// particular byte.
+    trans: Vec<(u8, usize)>,
+    /// Whether this is a matching state or not. If it is, then it contains the
+    /// index to the matching literal.
+    literal_index: Option<usize>,
+}
+
+impl PreferenceTrie {
+    /// Minimizes the given sequence of literals while preserving preference
+    /// order semantics.
+    fn minimize(literals: &mut Vec<Literal>) {
+        use core::cell::RefCell;
+
+        // MSRV(1.61): Use retain_mut here to avoid interior mutability.
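+        //
+        // In short: any literal rejected by the trie is dropped from the
+        // sequence, and the index carried by the `Err` names the previously
+        // accepted literal (a prefix of the rejected one). That surviving
+        // literal is marked inexact once the retain pass below has finished,
+        // since it now stands in for the longer literal it blocked.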
+        let trie = RefCell::new(PreferenceTrie::default());
+        let mut make_inexact = vec![];
+        literals.retain(|lit| {
+            match trie.borrow_mut().insert(lit.as_bytes()) {
+                Ok(_) => true,
+                Err(i) => {
+                    make_inexact.push(i);
+                    false
+                }
+            }
+        });
+        for i in make_inexact {
+            literals[i].make_inexact();
+        }
+    }
+
+    /// Returns `Ok` if the given byte string is accepted into this trie and
+    /// `Err` otherwise. The index for the success case corresponds to the
+    /// index of the literal added. The index for the error case corresponds to
+    /// the index of the literal already in the trie that prevented the given
+    /// byte string from being added. (Which implies it is a prefix of the one
+    /// given.)
+    ///
+    /// In short, the byte string given is accepted into the trie if and only
+    /// if it is possible for it to match when executing a preference order
+    /// search.
+    fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
+        let mut prev = self.root();
+        if let Some(idx) = self.states[prev].literal_index {
+            return Err(idx);
+        }
+        for &b in bytes.iter() {
+            match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
+                Ok(i) => {
+                    prev = self.states[prev].trans[i].1;
+                    if let Some(idx) = self.states[prev].literal_index {
+                        return Err(idx);
+                    }
+                }
+                Err(i) => {
+                    let next = self.create_state();
+                    self.states[prev].trans.insert(i, (b, next));
+                    prev = next;
+                }
+            }
+        }
+        let idx = self.next_literal_index;
+        self.next_literal_index += 1;
+        self.states[prev].literal_index = Some(idx);
+        Ok(idx)
+    }
+
+    /// Returns the root state ID, and if it doesn't exist, creates it.
+    fn root(&mut self) -> usize {
+        if !self.states.is_empty() {
+            0
+        } else {
+            self.create_state()
+        }
+    }
+
+    /// Creates a new empty state and returns its ID.
+    fn create_state(&mut self) -> usize {
+        let id = self.states.len();
+        self.states.push(State::default());
+        id
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn parse(pattern: &str) -> Hir {
+        crate::ParserBuilder::new()
+            .allow_invalid_utf8(true)
+            .build()
+            .parse(pattern)
+            .unwrap()
+    }
+
+    fn prefixes(pattern: &str) -> Seq {
+        Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern))
+    }
+
+    fn suffixes(pattern: &str) -> Seq {
+        Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern))
+    }
+
+    fn e(pattern: &str) -> (Seq, Seq) {
+        (prefixes(pattern), suffixes(pattern))
+    }
+
+    #[allow(non_snake_case)]
+    fn E(x: &str) -> Literal {
+        Literal::exact(x.as_bytes())
+    }
+
+    #[allow(non_snake_case)]
+    fn I(x: &str) -> Literal {
+        Literal::inexact(x.as_bytes())
+    }
+
+    fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq {
+        Seq::from_iter(it)
+    }
+
+    fn infinite() -> (Seq, Seq) {
+        (Seq::infinite(), Seq::infinite())
+    }
+
+    fn inexact<I1, I2>(it1: I1, it2: I2) -> (Seq, Seq)
+    where
+        I1: IntoIterator<Item = Literal>,
+        I2: IntoIterator<Item = Literal>,
+    {
+        (Seq::from_iter(it1), Seq::from_iter(it2))
+    }
+
+    fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> (Seq, Seq) {
+        let s1 = Seq::from_iter(it);
+        let s2 = s1.clone();
+        (s1, s2)
+    }
+
+    #[test]
+    fn literal() {
+        assert_eq!(exact(["a"]), e("a"));
+        assert_eq!(exact(["aaaaa"]), e("aaaaa"));
+        assert_eq!(exact(["A", "a"]), e("(?i-u)a"));
+        assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab"));
+        assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c"));
+
+        assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)"));
+
+        #[cfg(feature = "unicode-case")]
+        {
+            assert_eq!(exact(["☃"]), e("☃"));
+            assert_eq!(exact(["☃"]), e("(?i)☃"));
+            assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃"));
+
+            assert_eq!(exact(["Δ"]), e("Δ"));
+            assert_eq!(exact(["δ"]), e("δ"));
+            assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ"));
+
assert_eq!(exact(["Δ", "δ"]), e("(?i)δ")); + + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ")); + } + + let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ"; + assert_eq!(exact([letters]), e(letters)); + } + + #[test] + fn class() { + assert_eq!(exact(["a", "b", "c"]), e("[abc]")); + assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b")); + assert_eq!(exact(["δ", "ε"]), e("[εδ]")); + #[cfg(feature = "unicode-case")] + { + assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]")); + } + } + + #[test] + fn look() { + assert_eq!(exact(["ab"]), e(r"a\Ab")); + assert_eq!(exact(["ab"]), e(r"a\zb")); + assert_eq!(exact(["ab"]), e(r"a(?m:^)b")); + assert_eq!(exact(["ab"]), e(r"a(?m:$)b")); + assert_eq!(exact(["ab"]), e(r"a\bb")); + assert_eq!(exact(["ab"]), e(r"a\Bb")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b")); + + assert_eq!(exact(["ab"]), e(r"^ab")); + assert_eq!(exact(["ab"]), e(r"$ab")); + assert_eq!(exact(["ab"]), e(r"(?m:^)ab")); + assert_eq!(exact(["ab"]), e(r"(?m:$)ab")); + assert_eq!(exact(["ab"]), e(r"\bab")); + assert_eq!(exact(["ab"]), e(r"\Bab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab")); + + assert_eq!(exact(["ab"]), e(r"ab^")); + assert_eq!(exact(["ab"]), e(r"ab$")); + assert_eq!(exact(["ab"]), e(r"ab(?m:^)")); + assert_eq!(exact(["ab"]), e(r"ab(?m:$)")); + assert_eq!(exact(["ab"]), e(r"ab\b")); + assert_eq!(exact(["ab"]), e(r"ab\B")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)")); + + let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")])); + assert_eq!(expected, e(r"^aZ*b")); + } + + #[test] + fn repetition() { + assert_eq!(exact(["a", ""]), e(r"a?")); + assert_eq!(exact(["", "a"]), e(r"a??")); + assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*")); + assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"a+")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+")); + + assert_eq!(exact(["ab"]), e(r"aZ{0}b")); + assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b")); + assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b")); + assert_eq!( + inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]), + e(r"aZ*b") + ); + assert_eq!( + inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]), + e(r"aZ*?b") + ); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b")); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b")); + + assert_eq!(exact(["aZZb"]), e(r"aZ{2}b")); + assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b")); + + assert_eq!(exact(["abc", ""]), e(r"(abc)?")); + assert_eq!(exact(["", "abc"]), e(r"(abc)??")); + + assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b")); + assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b")); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+")); + + // FIXME: The suffixes for this don't look quite right to me. I think + // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I + // think is that suffixes are computed by iterating over concatenations + // in reverse, and then [bc, ac, c] ordering is indeed correct from + // that perspective. We also test a few more equivalent regexes, and + // we get the same result, so it is consistent at least I suppose. 
+ assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"a*b*c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+)?(b+)?c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+|)(b+|)c") + ); + // A few more similarish but not identical regexes. These may have a + // similar problem as above. + assert_eq!( + inexact( + [I("a"), I("b"), I("c"), E("")], + [I("c"), I("b"), I("a"), E("")] + ), + e(r"a*b*c*") + ); + assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+")); + assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c")); + assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*")); + assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*")); + assert_eq!( + inexact([I("ab"), E("ac")], [I("bc"), E("ac")]), + e(r"ab*c") + ); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c")); + + assert_eq!( + inexact([I("z"), E("azb")], [I("zazb"), E("azb")]), + e(r"z*azb") + ); + + let expected = + exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]); + assert_eq!(expected, e(r"[ab]{3}")); + let expected = inexact( + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + ); + assert_eq!(expected, e(r"[ab]{3,4}")); + } + + #[test] + fn concat() { + let empty: [&str; 0] = []; + + assert_eq!(exact(["abcxyz"]), e(r"abc()xyz")); + assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)")); + assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz")); + assert_eq!(exact(empty), e(r"abc[a&&b]xyz")); + assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz")); + } + + #[test] + fn alternation() { + assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz")); + assert_eq!( + inexact( + [E("abc"), I("mZ"), E("mo"), E("xyz")], + [E("abc"), I("Zo"), E("mo"), E("xyz")] + ), + e(r"abc|mZ*o|xyz") + ); + assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz")); + assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz")); + + assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa")); + assert_eq!( + inexact( + [I("aaa"), E(""), I("aaaaa"), E("aa")], + [I("aaa"), E(""), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*") + ); + assert_eq!( + inexact( + [E(""), I("aaa"), E("aa"), I("aaaaa")], + [E(""), I("aaa"), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*?") + ); + + assert_eq!( + inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]), + e(r"a|b*") + ); + assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+")); + + assert_eq!( + inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]), + e(r"a*b|c") + ); + + assert_eq!( + inexact( + [E("a"), E("b"), I("c"), E("")], + [E("a"), E("b"), I("c"), E("")] + ), + e(r"a|(?:b|c*)") + ); + + assert_eq!( + inexact( + [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")], + [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")], + ), + e(r"(a|b)*c|(a|ab)*c") + ); + + assert_eq!( + exact(["abef", "abgh", "cdef", "cdgh"]), + e(r"(ab|cd)(ef|gh)") + ); + assert_eq!( + exact([ + "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl", + "cdghij", "cdghkl", + ]), + e(r"(ab|cd)(ef|gh)(ij|kl)") + ); + } + + #[test] + fn impossible() { + let empty: [&str; 0] = []; + + assert_eq!(exact(empty), e(r"[a&&b]")); + assert_eq!(exact(empty), e(r"a[a&&b]")); + assert_eq!(exact(empty), e(r"[a&&b]b")); + assert_eq!(exact(empty), e(r"a[a&&b]b")); + 
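+        // For context: `[a&&b]` is the intersection of two disjoint classes,
+        // i.e., an empty class that can never match. A pattern containing it
+        // unconditionally yields an empty seq, while an alternation branch
+        // containing it simply disappears from the extracted sequences below.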
assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b")); + assert_eq!(exact([""]), e(r"[a&&b]*")); + assert_eq!(exact(["MN"]), e(r"M[a&&b]*N")); + } + + // This tests patterns that contain something that defeats literal + // detection, usually because it would blow some limit on the total number + // of literals that can be returned. + // + // The main idea is that when literal extraction sees something that + // it knows will blow a limit, it replaces it with a marker that says + // "any literal will match here." While not necessarily true, the + // over-estimation is just fine for the purposes of literal extraction, + // because the imprecision doesn't matter: too big is too big. + // + // This is one of the trickier parts of literal extraction, since we need + // to make sure all of our literal extraction operations correctly compose + // with the markers. + #[test] + fn anything() { + assert_eq!(infinite(), e(r".")); + assert_eq!(infinite(), e(r"(?s).")); + assert_eq!(infinite(), e(r"[A-Za-z]")); + assert_eq!(infinite(), e(r"[A-Z]")); + assert_eq!(exact([""]), e(r"[A-Z]{0}")); + assert_eq!(infinite(), e(r"[A-Z]?")); + assert_eq!(infinite(), e(r"[A-Z]*")); + assert_eq!(infinite(), e(r"[A-Z]+")); + assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]")); + assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2")); + assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123")); + assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+")); + assert_eq!(infinite(), e(r"1|[A-Z]|3")); + assert_eq!( + (seq([E("1"), I("2"), E("3")]), Seq::infinite()), + e(r"1|2[A-Z]|3"), + ); + assert_eq!( + (Seq::infinite(), seq([E("1"), I("2"), E("3")])), + e(r"1|[A-Z]2|3"), + ); + assert_eq!( + (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])), + e(r"1|2[A-Z]3|4"), + ); + assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2")); + assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z")); + } + + // Like the 'anything' test, but it uses smaller limits in order to test + // the logic for effectively aborting literal extraction when the seqs get + // too big. 
+ #[test] + fn anything_small_limits() { + fn prefixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Prefix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Suffix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + assert_eq!( + ( + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]), + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]) + ), + e(r"[ab]{3}{3}") + ); + + assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz")); + } + + #[test] + fn empty() { + assert_eq!(exact([""]), e(r"")); + assert_eq!(exact([""]), e(r"^")); + assert_eq!(exact([""]), e(r"$")); + assert_eq!(exact([""]), e(r"(?m:^)")); + assert_eq!(exact([""]), e(r"(?m:$)")); + assert_eq!(exact([""]), e(r"\b")); + assert_eq!(exact([""]), e(r"\B")); + assert_eq!(exact([""]), e(r"(?-u:\b)")); + assert_eq!(exact([""]), e(r"(?-u:\B)")); + } + + #[test] + fn odds_and_ends() { + assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a")); + assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a.")); + assert_eq!(infinite(), e(r"a|.")); + assert_eq!(infinite(), e(r".|a")); + + let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]"; + let expected = inexact( + ["Mo'am", "Moam", "Mu'am", "Muam"].map(I), + [ + "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi", + "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy", + "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi", + "zzafy", "zafi", "zafy", + ] + .map(I), + ); + assert_eq!(expected, e(pat)); + + assert_eq!( + (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()), + e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"), + ); + assert_eq!( + inexact([I("foo")], [I("quux")]), + e(r"foo[A-Z]+bar[A-Z]+quux") + ); + assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+")); + assert_eq!( + exact(["Sherlock Holmes"]), + e(r"(?m)^Sherlock Holmes|Sherlock Holmes$") + ); + + assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])")); + } + + // This tests a specific regex along with some heuristic steps to reduce + // the sequences extracted. This is meant to roughly correspond to the + // types of heuristics used to shrink literal sets in practice. 
(Shrinking + // is done because you want to balance "spend too much work looking for + // too many literals" and "spend too much work processing false positive + // matches from short literals.") + #[test] + #[cfg(feature = "unicode-case")] + fn holmes() { + let expected = inexact( + ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I), + [ + "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS", + "mes", + ] + .map(I), + ); + let (mut prefixes, mut suffixes) = e(r"(?i)Holmes"); + prefixes.keep_first_bytes(3); + suffixes.keep_last_bytes(3); + prefixes.minimize_by_preference(); + suffixes.minimize_by_preference(); + assert_eq!(expected, (prefixes, suffixes)); + } + + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + #[test] + fn crazy_repeats() { + assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}")); + assert_eq!( + inexact([I("")], [I("")]), + e(r"(?:){64}{64}{64}{64}{64}{64}") + ); + assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}")); + assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}")); + + assert_eq!( + inexact([E("")], [E("")]), + e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + let repa = "a".repeat(100); + assert_eq!( + inexact([I(&repa)], [I(&repa)]), + e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + } + + #[test] + fn huge() { + let pat = r#"(?-u) + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4}| + 3(?: + 12?[5-7]\d{2}| + 0(?: + 2(?: + [025-79]\d| + [348]\d{1,2} + )| + 3(?: + [2-4]\d| + [56]\d? + ) + )| + 2(?: + 1\d{2}| + 2(?: + [12]\d| + [35]\d{1,2}| + 4\d? 
+ ) + )| + 3(?: + 1\d{2}| + 2(?: + [2356]\d| + 4\d{1,2} + ) + )| + 4(?: + 1\d{2}| + 2(?: + 2\d{1,2}| + [47]| + 5\d{2} + ) + )| + 5(?: + 1\d{2}| + 29 + )| + [67]1\d{2}| + 8(?: + 1\d{2}| + 2(?: + 2\d{2}| + 3| + 4\d + ) + ) + )\d{3}| + 4(?: + 0(?: + 2(?: + [09]\d| + 7 + )| + 33\d{2} + )| + 1\d{3}| + 2(?: + 1\d{2}| + 2(?: + [25]\d?| + [348]\d| + [67]\d{1,2} + ) + )| + 3(?: + 1\d{2}(?: + \d{2} + )?| + 2(?: + [045]\d| + [236-9]\d{1,2} + )| + 32\d{2} + )| + 4(?: + [18]\d{2}| + 2(?: + [2-46]\d{2}| + 3 + )| + 5[25]\d{2} + )| + 5(?: + 1\d{2}| + 2(?: + 3\d| + 5 + ) + )| + 6(?: + [18]\d{2}| + 2(?: + 3(?: + \d{2} + )?| + [46]\d{1,2}| + 5\d{2}| + 7\d + )| + 5(?: + 3\d?| + 4\d| + [57]\d{1,2}| + 6\d{2}| + 8 + ) + )| + 71\d{2}| + 8(?: + [18]\d{2}| + 23\d{2}| + 54\d{2} + )| + 9(?: + [18]\d{2}| + 2[2-5]\d{2}| + 53\d{1,2} + ) + )\d{3}| + 5(?: + 02[03489]\d{2}| + 1\d{2}| + 2(?: + 1\d{2}| + 2(?: + 2(?: + \d{2} + )?| + [457]\d{2} + ) + )| + 3(?: + 1\d{2}| + 2(?: + [37](?: + \d{2} + )?| + [569]\d{2} + ) + )| + 4(?: + 1\d{2}| + 2[46]\d{2} + )| + 5(?: + 1\d{2}| + 26\d{1,2} + )| + 6(?: + [18]\d{2}| + 2| + 53\d{2} + )| + 7(?: + 1| + 24 + )\d{2}| + 8(?: + 1| + 26 + )\d{2}| + 91\d{2} + )\d{3}| + 6(?: + 0(?: + 1\d{2}| + 2(?: + 3\d{2}| + 4\d{1,2} + ) + )| + 2(?: + 2[2-5]\d{2}| + 5(?: + [3-5]\d{2}| + 7 + )| + 8\d{2} + )| + 3(?: + 1| + 2[3478] + )\d{2}| + 4(?: + 1| + 2[34] + )\d{2}| + 5(?: + 1| + 2[47] + )\d{2}| + 6(?: + [18]\d{2}| + 6(?: + 2(?: + 2\d| + [34]\d{2} + )| + 5(?: + [24]\d{2}| + 3\d| + 5\d{1,2} + ) + ) + )| + 72[2-5]\d{2}| + 8(?: + 1\d{2}| + 2[2-5]\d{2} + )| + 9(?: + 1\d{2}| + 2[2-6]\d{2} + ) + )\d{3}| + 7(?: + (?: + 02| + [3-589]1| + 6[12]| + 72[24] + )\d{2}| + 21\d{3}| + 32 + )\d{3}| + 8(?: + (?: + 4[12]| + [5-7]2| + 1\d? + )| + (?: + 0| + 3[12]| + [5-7]1| + 217 + )\d + )\d{4}| + 9(?: + [35]1| + (?: + [024]2| + 81 + )\d| + (?: + 1| + [24]1 + )\d{2} + )\d{3} + "#; + // TODO: This is a good candidate of a seq of literals that could be + // shrunk quite a bit and still be very productive with respect to + // literal optimizations. + let (prefixes, suffixes) = e(pat); + assert!(!suffixes.is_finite()); + assert_eq!(Some(247), prefixes.len()); + } +} diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs deleted file mode 100644 index d127063bc..000000000 --- a/regex-syntax/src/hir/literal/mod.rs +++ /dev/null @@ -1,1672 +0,0 @@ -/*! -Provides routines for extracting literal prefixes and suffixes from an `Hir`. -*/ - -use core::{cmp, iter, mem, ops}; - -use alloc::{ - boxed::Box, - format, - string::{String, ToString}, - vec, - vec::Vec, -}; - -use crate::hir::{self, Hir, HirKind}; - -/// A set of literal byte strings extracted from a regular expression. -/// -/// Every member of the set is a `Literal`, which is represented by a -/// `Vec`. (Notably, it may contain invalid UTF-8.) Every member is -/// said to be either *complete* or *cut*. A complete literal means that -/// it extends until the beginning (or end) of the regular expression. In -/// some circumstances, this can be used to indicate a match in the regular -/// expression. -/// -/// A key aspect of literal extraction is knowing when to stop. It is not -/// feasible to blindly extract all literals from a regular expression, even if -/// there are finitely many. For example, the regular expression `[0-9]{10}` -/// has `10^10` distinct literals. For this reason, literal extraction is -/// bounded to some low number by default using heuristics, but the limits can -/// be tweaked. 
-/// -/// **WARNING**: Literal extraction uses stack space proportional to the -/// size of the `Hir` expression. At some point, this drawback will be -/// eliminated. To protect yourself, set a reasonable [`nest_limit` on your -/// `Parser`](crate::ParserBuilder::nest_limit). This is done for you by -/// default. -#[derive(Clone, Eq, PartialEq)] -pub struct Literals { - lits: Vec, - limit_size: usize, - limit_class: usize, -} - -/// A single member of a set of literals extracted from a regular expression. -/// -/// This type has `Deref` and `DerefMut` impls to `Vec` so that all slice -/// and `Vec` operations are available. -#[derive(Clone, Eq, Ord)] -pub struct Literal { - v: Vec, - cut: bool, -} - -impl Literals { - /// Returns a new empty set of literals using default limits. - pub fn empty() -> Literals { - Literals { lits: vec![], limit_size: 250, limit_class: 10 } - } - - /// Returns a set of literal prefixes extracted from the given `Hir`. - pub fn prefixes(expr: &Hir) -> Literals { - let mut lits = Literals::empty(); - lits.union_prefixes(expr); - lits - } - - /// Returns a set of literal suffixes extracted from the given `Hir`. - pub fn suffixes(expr: &Hir) -> Literals { - let mut lits = Literals::empty(); - lits.union_suffixes(expr); - lits - } - - /// Get the approximate size limit (in bytes) of this set. - pub fn limit_size(&self) -> usize { - self.limit_size - } - - /// Set the approximate size limit (in bytes) of this set. - /// - /// If extracting a literal would put the set over this limit, then - /// extraction stops. - /// - /// The new limits will only apply to additions to this set. Existing - /// members remain unchanged, even if the set exceeds the new limit. - pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { - self.limit_size = size; - self - } - - /// Get the character class size limit for this set. - pub fn limit_class(&self) -> usize { - self.limit_class - } - - /// Limits the size of character(or byte) classes considered. - /// - /// A value of `0` prevents all character classes from being considered. - /// - /// This limit also applies to case insensitive literals, since each - /// character in the case insensitive literal is converted to a class, and - /// then case folded. - /// - /// The new limits will only apply to additions to this set. Existing - /// members remain unchanged, even if the set exceeds the new limit. - pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { - self.limit_class = size; - self - } - - /// Returns the set of literals as a slice. Its order is unspecified. - pub fn literals(&self) -> &[Literal] { - &self.lits - } - - /// Returns the length of the smallest literal. - /// - /// Returns None is there are no literals in the set. - pub fn min_len(&self) -> Option { - let mut min = None; - for lit in &self.lits { - match min { - None => min = Some(lit.len()), - Some(m) if lit.len() < m => min = Some(lit.len()), - _ => {} - } - } - min - } - - /// Returns true if all members in this set are complete. - pub fn all_complete(&self) -> bool { - !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) - } - - /// Returns true if any member in this set is complete. - pub fn any_complete(&self) -> bool { - self.lits.iter().any(|lit| !lit.is_cut()) - } - - /// Returns true if this set contains an empty literal. - pub fn contains_empty(&self) -> bool { - self.lits.iter().any(|lit| lit.is_empty()) - } - - /// Returns true if this set is empty or if all of its members is empty. 
- pub fn is_empty(&self) -> bool { - self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) - } - - /// Returns a new empty set of literals using this set's limits. - pub fn to_empty(&self) -> Literals { - let mut lits = Literals::empty(); - lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); - lits - } - - /// Returns the longest common prefix of all members in this set. - pub fn longest_common_prefix(&self) -> &[u8] { - if self.is_empty() { - return &[]; - } - let lit0 = &*self.lits[0]; - let mut len = lit0.len(); - for lit in &self.lits[1..] { - len = cmp::min( - len, - lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), - ); - } - &self.lits[0][..len] - } - - /// Returns the longest common suffix of all members in this set. - pub fn longest_common_suffix(&self) -> &[u8] { - if self.is_empty() { - return &[]; - } - let lit0 = &*self.lits[0]; - let mut len = lit0.len(); - for lit in &self.lits[1..] { - len = cmp::min( - len, - lit.iter() - .rev() - .zip(lit0.iter().rev()) - .take_while(|&(a, b)| a == b) - .count(), - ); - } - &self.lits[0][self.lits[0].len() - len..] - } - - /// Returns a new set of literals with the given number of bytes trimmed - /// from the suffix of each literal. - /// - /// If any literal would be cut out completely by trimming, then None is - /// returned. - /// - /// Any duplicates that are created as a result of this transformation are - /// removed. - pub fn trim_suffix(&self, num_bytes: usize) -> Option { - if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { - return None; - } - let mut new = self.to_empty(); - for mut lit in self.lits.iter().cloned() { - let new_len = lit.len() - num_bytes; - lit.truncate(new_len); - lit.cut(); - new.lits.push(lit); - } - new.lits.sort(); - new.lits.dedup(); - Some(new) - } - - /// Returns a new set of prefixes of this set of literals that are - /// guaranteed to be unambiguous. - /// - /// Any substring match with a member of the set is returned is guaranteed - /// to never overlap with a substring match of another member of the set - /// at the same starting position. - /// - /// Given any two members of the returned set, neither is a substring of - /// the other. - pub fn unambiguous_prefixes(&self) -> Literals { - if self.lits.is_empty() { - return self.to_empty(); - } - let mut old = self.lits.to_vec(); - let mut new = self.to_empty(); - 'OUTER: while let Some(mut candidate) = old.pop() { - if candidate.is_empty() { - continue; - } - if new.lits.is_empty() { - new.lits.push(candidate); - continue; - } - for lit2 in &mut new.lits { - if lit2.is_empty() { - continue; - } - if &candidate == lit2 { - // If the literal is already in the set, then we can - // just drop it. But make sure that cut literals are - // infectious! - candidate.cut = candidate.cut || lit2.cut; - lit2.cut = candidate.cut; - continue 'OUTER; - } - if candidate.len() < lit2.len() { - if let Some(i) = position(&candidate, &lit2) { - candidate.cut(); - let mut lit3 = lit2.clone(); - lit3.truncate(i); - lit3.cut(); - old.push(lit3); - lit2.clear(); - } - } else if let Some(i) = position(&lit2, &candidate) { - lit2.cut(); - let mut new_candidate = candidate.clone(); - new_candidate.truncate(i); - new_candidate.cut(); - old.push(new_candidate); - candidate.clear(); - } - // Oops, the candidate is already represented in the set. 
- if candidate.is_empty() { - continue 'OUTER; - } - } - new.lits.push(candidate); - } - new.lits.retain(|lit| !lit.is_empty()); - new.lits.sort(); - new.lits.dedup(); - new - } - - /// Returns a new set of suffixes of this set of literals that are - /// guaranteed to be unambiguous. - /// - /// Any substring match with a member of the set is returned is guaranteed - /// to never overlap with a substring match of another member of the set - /// at the same ending position. - /// - /// Given any two members of the returned set, neither is a substring of - /// the other. - pub fn unambiguous_suffixes(&self) -> Literals { - // This is a touch wasteful... - let mut lits = self.clone(); - lits.reverse(); - let mut unamb = lits.unambiguous_prefixes(); - unamb.reverse(); - unamb - } - - /// Unions the prefixes from the given expression to this set. - /// - /// If prefixes could not be added (for example, this set would exceed its - /// size limits or the set of prefixes from `expr` includes the empty - /// string), then false is returned. - /// - /// Note that prefix literals extracted from `expr` are said to be complete - /// if and only if the literal extends from the beginning of `expr` to the - /// end of `expr`. - pub fn union_prefixes(&mut self, expr: &Hir) -> bool { - let mut lits = self.to_empty(); - prefixes(expr, &mut lits); - !lits.is_empty() && !lits.contains_empty() && self.union(lits) - } - - /// Unions the suffixes from the given expression to this set. - /// - /// If suffixes could not be added (for example, this set would exceed its - /// size limits or the set of suffixes from `expr` includes the empty - /// string), then false is returned. - /// - /// Note that prefix literals extracted from `expr` are said to be complete - /// if and only if the literal extends from the end of `expr` to the - /// beginning of `expr`. - pub fn union_suffixes(&mut self, expr: &Hir) -> bool { - let mut lits = self.to_empty(); - suffixes(expr, &mut lits); - lits.reverse(); - !lits.is_empty() && !lits.contains_empty() && self.union(lits) - } - - /// Unions this set with another set. - /// - /// If the union would cause the set to exceed its limits, then the union - /// is skipped and it returns false. Otherwise, if the union succeeds, it - /// returns true. - pub fn union(&mut self, lits: Literals) -> bool { - if self.num_bytes() + lits.num_bytes() > self.limit_size { - return false; - } - if lits.is_empty() { - self.lits.push(Literal::empty()); - } else { - self.lits.extend(lits.lits); - } - true - } - - /// Extends this set with another set. - /// - /// The set of literals is extended via a cross product. - /// - /// If a cross product would cause this set to exceed its limits, then the - /// cross product is skipped and it returns false. Otherwise, if the cross - /// product succeeds, it returns true. - pub fn cross_product(&mut self, lits: &Literals) -> bool { - if lits.is_empty() { - return true; - } - // Check that we make sure we stay in our limits. 
- let mut size_after; - if self.is_empty() || !self.any_complete() { - size_after = self.num_bytes(); - for lits_lit in lits.literals() { - size_after += lits_lit.len(); - } - } else { - size_after = self.lits.iter().fold(0, |accum, lit| { - accum + if lit.is_cut() { lit.len() } else { 0 } - }); - for lits_lit in lits.literals() { - for self_lit in self.literals() { - if !self_lit.is_cut() { - size_after += self_lit.len() + lits_lit.len(); - } - } - } - } - if size_after > self.limit_size { - return false; - } - - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for lits_lit in lits.literals() { - for mut self_lit in base.clone() { - self_lit.extend(&**lits_lit); - self_lit.cut = lits_lit.cut; - self.lits.push(self_lit); - } - } - true - } - - /// Extends each literal in this set with the bytes given. - /// - /// If the set is empty, then the given literal is added to the set. - /// - /// If adding any number of bytes to all members of this set causes a limit - /// to be exceeded, then no bytes are added and false is returned. If a - /// prefix of `bytes` can be fit into this set, then it is used and all - /// resulting literals are cut. - pub fn cross_add(&mut self, bytes: &[u8]) -> bool { - // N.B. This could be implemented by simply calling cross_product with - // a literal set containing just `bytes`, but we can be smarter about - // taking shorter prefixes of `bytes` if they'll fit. - if bytes.is_empty() { - return true; - } - if self.lits.is_empty() { - let i = cmp::min(self.limit_size, bytes.len()); - self.lits.push(Literal::new(bytes[..i].to_vec())); - self.lits[0].cut = i < bytes.len(); - return !self.lits[0].is_cut(); - } - let size = self.num_bytes(); - if size + self.lits.len() >= self.limit_size { - return false; - } - let mut i = 1; - while size + (i * self.lits.len()) <= self.limit_size - && i < bytes.len() - { - i += 1; - } - for lit in &mut self.lits { - if !lit.is_cut() { - lit.extend(&bytes[..i]); - if i < bytes.len() { - lit.cut(); - } - } - } - true - } - - /// Adds the given literal to this set. - /// - /// Returns false if adding this literal would cause the class to be too - /// big. - pub fn add(&mut self, lit: Literal) -> bool { - if self.num_bytes() + lit.len() > self.limit_size { - return false; - } - self.lits.push(lit); - true - } - - /// Extends each literal in this set with the character class given. - /// - /// Returns false if the character class was too big to add. - pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { - self._add_char_class(cls, false) - } - - /// Extends each literal in this set with the character class given, - /// writing the bytes of each character in reverse. - /// - /// Returns false if the character class was too big to add. 
- fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { - self._add_char_class(cls, true) - } - - fn _add_char_class( - &mut self, - cls: &hir::ClassUnicode, - reverse: bool, - ) -> bool { - if self.class_exceeds_limits(cls_char_count(cls)) { - return false; - } - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for r in cls.iter() { - let (s, e) = (u32::from(r.start), u32::from(r.end)); - for c in (s..=e).filter_map(char::from_u32) { - for mut lit in base.clone() { - let mut bytes = c.to_string().into_bytes(); - if reverse { - bytes.reverse(); - } - lit.extend(&bytes); - self.lits.push(lit); - } - } - } - true - } - - /// Extends each literal in this set with the byte class given. - /// - /// Returns false if the byte class was too big to add. - pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { - if self.class_exceeds_limits(cls_byte_count(cls)) { - return false; - } - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for r in cls.iter() { - for b in r.start..=r.end { - for mut lit in base.clone() { - lit.push(b); - self.lits.push(lit); - } - } - } - true - } - - /// Cuts every member of this set. When a member is cut, it can never - /// be extended. - pub fn cut(&mut self) { - for lit in &mut self.lits { - lit.cut(); - } - } - - /// Reverses all members in place. - pub fn reverse(&mut self) { - for lit in &mut self.lits { - lit.reverse(); - } - } - - /// Clears this set of all members. - pub fn clear(&mut self) { - self.lits.clear(); - } - - /// Pops all complete literals out of this set. - fn remove_complete(&mut self) -> Vec { - let mut base = vec![]; - for lit in mem::replace(&mut self.lits, vec![]) { - if lit.is_cut() { - self.lits.push(lit); - } else { - base.push(lit); - } - } - base - } - - /// Returns the total number of bytes in this set. - fn num_bytes(&self) -> usize { - self.lits.iter().fold(0, |accum, lit| accum + lit.len()) - } - - /// Returns true if a character class with the given size would cause this - /// set to exceed its limits. - /// - /// The size given should correspond to the number of items in the class. - fn class_exceeds_limits(&self, size: usize) -> bool { - if size > self.limit_class { - return true; - } - // This is an approximation since codepoints in a char class can encode - // to 1-4 bytes. - let new_byte_count = if self.lits.is_empty() { - size - } else { - self.lits.iter().fold(0, |accum, lit| { - accum - + if lit.is_cut() { - // If the literal is cut, then we'll never add - // anything to it, so don't count it. - 0 - } else { - (lit.len() + 1) * size - } - }) - }; - new_byte_count > self.limit_size - } -} - -fn prefixes(expr: &Hir, lits: &mut Literals) { - match *expr.kind() { - HirKind::Literal(hir::Literal(ref bytes)) => { - lits.cross_add(bytes); - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - if !lits.add_char_class(cls) { - lits.cut(); - } - } - HirKind::Class(hir::Class::Bytes(ref cls)) => { - if !lits.add_byte_class(cls) { - lits.cut(); - } - } - HirKind::Group(hir::Group { ref hir, .. 
}) => { - prefixes(&**hir, lits); - } - HirKind::Repetition(ref x) => match (x.min, x.max) { - (0, Some(1)) => { - repeat_zero_or_one_literals(&x.hir, lits, prefixes); - } - (0, None) => { - repeat_zero_or_more_literals(&x.hir, lits, prefixes); - } - (1, None) => { - repeat_one_or_more_literals(&x.hir, lits, prefixes); - } - (min, max) => repeat_range_literals( - &x.hir, min, max, x.greedy, lits, prefixes, - ), - }, - HirKind::Concat(ref es) if es.is_empty() => {} - HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), - HirKind::Concat(ref es) => { - for e in es { - if let HirKind::Look(hir::Look::Start) = *e.kind() { - if !lits.is_empty() { - lits.cut(); - break; - } - lits.add(Literal::empty()); - continue; - } - let mut lits2 = lits.to_empty(); - prefixes(e, &mut lits2); - if !lits.cross_product(&lits2) || !lits2.any_complete() { - // If this expression couldn't yield any literal that - // could be extended, then we need to quit. Since we're - // short-circuiting, we also need to freeze every member. - lits.cut(); - break; - } - } - } - HirKind::Alternation(ref es) => { - alternate_literals(es, lits, prefixes); - } - _ => lits.cut(), - } -} - -fn suffixes(expr: &Hir, lits: &mut Literals) { - match *expr.kind() { - HirKind::Literal(hir::Literal(ref bytes)) => { - let mut bytes = bytes.to_vec(); - bytes.reverse(); - lits.cross_add(&bytes); - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - if !lits.add_char_class_reverse(cls) { - lits.cut(); - } - } - HirKind::Class(hir::Class::Bytes(ref cls)) => { - if !lits.add_byte_class(cls) { - lits.cut(); - } - } - HirKind::Group(hir::Group { ref hir, .. }) => { - suffixes(&**hir, lits); - } - HirKind::Repetition(ref x) => match (x.min, x.max) { - (0, Some(1)) => { - repeat_zero_or_one_literals(&x.hir, lits, suffixes); - } - (0, None) => { - repeat_zero_or_more_literals(&x.hir, lits, suffixes); - } - (1, None) => { - repeat_one_or_more_literals(&x.hir, lits, suffixes); - } - (min, max) => repeat_range_literals( - &x.hir, min, max, x.greedy, lits, suffixes, - ), - }, - HirKind::Concat(ref es) if es.is_empty() => {} - HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), - HirKind::Concat(ref es) => { - for e in es.iter().rev() { - if let HirKind::Look(hir::Look::End) = *e.kind() { - if !lits.is_empty() { - lits.cut(); - break; - } - lits.add(Literal::empty()); - continue; - } - let mut lits2 = lits.to_empty(); - suffixes(e, &mut lits2); - if !lits.cross_product(&lits2) || !lits2.any_complete() { - // If this expression couldn't yield any literal that - // could be extended, then we need to quit. Since we're - // short-circuiting, we also need to freeze every member. - lits.cut(); - break; - } - } - } - HirKind::Alternation(ref es) => { - alternate_literals(es, lits, suffixes); - } - _ => lits.cut(), - } -} - -fn repeat_zero_or_one_literals( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - f( - &Hir::repetition(hir::Repetition { - min: 0, - max: None, - // FIXME: Our literal extraction doesn't care about greediness. - // Which is partially why we're treating 'e?' as 'e*'. Namely, - // 'ab??' yields [Complete(ab), Complete(a)], but it should yield - // [Complete(a), Complete(ab)] because of the non-greediness. 
- greedy: true, - hir: Box::new(e.clone()), - }), - lits, - ); -} - -fn repeat_zero_or_more_literals( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); - lits3.set_limit_size(lits.limit_size() / 2); - f(e, &mut lits3); - - if lits3.is_empty() || !lits2.cross_product(&lits3) { - lits.cut(); - return; - } - lits2.cut(); - lits2.add(Literal::empty()); - if !lits.union(lits2) { - lits.cut(); - } -} - -fn repeat_one_or_more_literals( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - f(e, lits); - lits.cut(); -} - -fn repeat_range_literals( - e: &Hir, - min: u32, - max: Option, - greedy: bool, - lits: &mut Literals, - mut f: F, -) { - // If 'min' somehow overflows usize, then we just treat it as 0, which is - // the most conservative thing we can do. - let umin = usize::try_from(min).unwrap_or(0); - if umin == 0 { - // This is a bit conservative. If `max` is set, then we could - // treat this as a finite set of alternations. For now, we - // just treat it as `e*`. - f( - &Hir::repetition(hir::Repetition { - min: 0, - max: None, - greedy, - hir: Box::new(e.clone()), - }), - lits, - ); - } else { - if umin > 0 { - let n = cmp::min(lits.limit_size, umin); - let es = iter::repeat(e.clone()).take(n).collect(); - f(&Hir::concat(es), lits); - if n < umin || lits.contains_empty() { - lits.cut(); - } - } - if max.map_or(true, |max| min < max) { - lits.cut(); - } - } -} - -fn alternate_literals( - es: &[Hir], - lits: &mut Literals, - mut f: F, -) { - let mut lits2 = lits.to_empty(); - for e in es { - let mut lits3 = lits.to_empty(); - lits3.set_limit_size(lits.limit_size() / 5); - f(e, &mut lits3); - if lits3.is_empty() || !lits2.union(lits3) { - // If we couldn't find suffixes for *any* of the - // alternates, then the entire alternation has to be thrown - // away and any existing members must be frozen. Similarly, - // if the union couldn't complete, stop and freeze. - lits.cut(); - return; - } - } - if !lits.cross_product(&lits2) { - lits.cut(); - } -} - -impl core::fmt::Debug for Literals { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - f.debug_struct("Literals") - .field("lits", &self.lits) - .field("limit_size", &self.limit_size) - .field("limit_class", &self.limit_class) - .finish() - } -} - -impl Literal { - /// Returns a new complete literal with the bytes given. - pub fn new(bytes: Vec) -> Literal { - Literal { v: bytes, cut: false } - } - - /// Returns a new complete empty literal. - pub fn empty() -> Literal { - Literal { v: vec![], cut: false } - } - - /// Returns true if this literal was "cut." - pub fn is_cut(&self) -> bool { - self.cut - } - - /// Cuts this literal. 
- pub fn cut(&mut self) { - self.cut = true; - } -} - -impl PartialEq for Literal { - fn eq(&self, other: &Literal) -> bool { - self.v == other.v - } -} - -impl PartialOrd for Literal { - fn partial_cmp(&self, other: &Literal) -> Option { - self.v.partial_cmp(&other.v) - } -} - -impl core::fmt::Debug for Literal { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - if self.is_cut() { - write!(f, "Cut({})", escape_unicode(&self.v)) - } else { - write!(f, "Complete({})", escape_unicode(&self.v)) - } - } -} - -impl AsRef<[u8]> for Literal { - fn as_ref(&self) -> &[u8] { - &self.v - } -} - -impl ops::Deref for Literal { - type Target = Vec; - fn deref(&self) -> &Vec { - &self.v - } -} - -impl ops::DerefMut for Literal { - fn deref_mut(&mut self) -> &mut Vec { - &mut self.v - } -} - -fn position(needle: &[u8], mut haystack: &[u8]) -> Option { - let mut i = 0; - while haystack.len() >= needle.len() { - if needle == &haystack[..needle.len()] { - return Some(i); - } - i += 1; - haystack = &haystack[1..]; - } - None -} - -fn escape_unicode(bytes: &[u8]) -> String { - let show = match core::str::from_utf8(bytes) { - Ok(v) => v.to_string(), - Err(_) => escape_bytes(bytes), - }; - let mut space_escaped = String::new(); - for c in show.chars() { - if c.is_whitespace() { - let cp = u32::from(c); - let escaped = if cp <= 0x7F { - escape_byte(u8::try_from(cp).unwrap()) - } else if cp <= 0xFFFF { - format!(r"\u{{{:04x}}}", cp) - } else { - format!(r"\U{{{:08x}}}", cp) - }; - space_escaped.push_str(&escaped); - } else { - space_escaped.push(c); - } - } - space_escaped -} - -fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s -} - -fn escape_byte(byte: u8) -> String { - use core::ascii::escape_default; - - let escaped: Vec = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() -} - -fn cls_char_count(cls: &hir::ClassUnicode) -> usize { - cls.iter().map(|&r| r.len()).sum() -} - -fn cls_byte_count(cls: &hir::ClassBytes) -> usize { - cls.iter().map(|&r| r.len()).sum() -} - -#[cfg(test)] -mod tests { - use alloc::{ - string::{String, ToString}, - vec, - vec::Vec, - }; - - use crate::{hir::Hir, ParserBuilder}; - - use super::*; - - // To make test failures easier to read. - #[derive(Debug, Eq, PartialEq)] - struct Bytes(Vec); - #[derive(Debug, Eq, PartialEq)] - struct Unicode(Vec); - - fn escape_lits(blits: &[Literal]) -> Vec { - let mut ulits = vec![]; - for blit in blits { - ulits - .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); - } - ulits - } - - fn create_lits>(it: I) -> Literals { - Literals { - lits: it.into_iter().collect(), - limit_size: 0, - limit_class: 0, - } - } - - // Needs to be pub for 1.3? 
- #[derive(Clone, Eq, PartialEq)] - pub struct ULiteral { - v: String, - cut: bool, - } - - impl ULiteral { - fn is_cut(&self) -> bool { - self.cut - } - } - - impl core::fmt::Debug for ULiteral { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - if self.is_cut() { - write!(f, "Cut({})", self.v) - } else { - write!(f, "Complete({})", self.v) - } - } - } - - impl PartialEq for ULiteral { - fn eq(&self, other: &Literal) -> bool { - self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() - } - } - - impl PartialEq for Literal { - fn eq(&self, other: &ULiteral) -> bool { - &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() - } - } - - #[allow(non_snake_case)] - fn C(s: &'static str) -> ULiteral { - ULiteral { v: s.to_string(), cut: true } - } - #[allow(non_snake_case)] - fn M(s: &'static str) -> ULiteral { - ULiteral { v: s.to_string(), cut: false } - } - - fn prefixes(lits: &mut Literals, expr: &Hir) { - lits.union_prefixes(expr); - } - - fn suffixes(lits: &mut Literals, expr: &Hir) { - lits.union_suffixes(expr); - } - - macro_rules! assert_lit_eq { - ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ - let expected: Vec = vec![$($expected_lit),*]; - let lits = $got_lits; - assert_eq!( - $which(expected.clone()), - $which(escape_lits(lits.literals()))); - assert_eq!( - !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), - lits.all_complete()); - assert_eq!( - expected.iter().any(|l| !l.is_cut()), - lits.any_complete()); - }}; - } - - macro_rules! test_lit { - ($name:ident, $which:ident, $re:expr) => { - test_lit!($name, $which, $re,); - }; - ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { - #[test] - fn $name() { - let expr = ParserBuilder::new() - .build() - .parse($re) - .unwrap(); - let lits = Literals::$which(&expr); - assert_lit_eq!(Unicode, lits, $($lit),*); - - let expr = ParserBuilder::new() - .allow_invalid_utf8(true) - .unicode(false) - .build() - .parse($re) - .unwrap(); - let lits = Literals::$which(&expr); - assert_lit_eq!(Bytes, lits, $($lit),*); - } - }; - } - - // ************************************************************************ - // Tests for prefix literal extraction. - // ************************************************************************ - - // Elementary tests. - test_lit!(pfx_one_lit1, prefixes, "a", M("a")); - test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); - test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); - #[cfg(feature = "unicode-case")] - test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); - test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); - test_lit!( - pfx_class2, - prefixes, - "(?u)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x98\\x83") - ); - #[cfg(feature = "unicode-case")] - test_lit!( - pfx_class3, - prefixes, - "(?ui)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x85\\xb0"), - M("\\xe2\\x98\\x83") - ); - test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); - test_lit!( - pfx_one_lit_casei2, - prefixes, - "(?i-u)abc", - M("ABC"), - M("aBC"), - M("AbC"), - M("abC"), - M("ABc"), - M("aBc"), - M("Abc"), - M("abc") - ); - test_lit!(pfx_group1, prefixes, "(a)", M("a")); - test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); - test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); - test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); - // FIXME: This should return [M("a"), M("ab")] because of the non-greedy - // repetition. As a work-around, we rewrite ab?? 
as ab*?, and thus we get - // a cut literal. - test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); - test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); - test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); - test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); - test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); - test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); - test_lit!(pfx_rep_range1, prefixes, "a{0}"); - test_lit!(pfx_rep_range2, prefixes, "a{0,}"); - test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); - test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); - test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); - test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); - test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); - - // Test regexes with concatenations. - test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); - test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); - test_lit!( - pfx_cat3, - prefixes, - "(?i-u)[ab]z", - M("AZ"), - M("BZ"), - M("aZ"), - M("bZ"), - M("Az"), - M("Bz"), - M("az"), - M("bz") - ); - test_lit!( - pfx_cat4, - prefixes, - "[ab][yz]", - M("ay"), - M("by"), - M("az"), - M("bz") - ); - test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); - test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); - test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); - test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); - test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); - test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); - test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); - test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); - test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); - test_lit!(pfx_cat14, prefixes, "a^", C("a")); - test_lit!(pfx_cat15, prefixes, "$a"); - test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); - test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); - test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); - test_lit!(pfx_cat19, prefixes, "a.z", C("a")); - - // Test regexes with alternations. - test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); - test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); - test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); - test_lit!(pfx_alt4, prefixes, "a|b*"); - test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); - test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); - test_lit!( - pfx_alt7, - prefixes, - "(a|b)*c|(a|ab)*c", - C("a"), - C("b"), - M("c"), - C("a"), - C("ab"), - M("c") - ); - test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); - - // Test regexes with empty assertions. - test_lit!(pfx_empty1, prefixes, "^a", M("a")); - test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); - test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); - test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); - - // Make sure some curious regexes have no prefixes. - test_lit!(pfx_nothing1, prefixes, "."); - test_lit!(pfx_nothing2, prefixes, "(?s)."); - test_lit!(pfx_nothing3, prefixes, "^"); - test_lit!(pfx_nothing4, prefixes, "$"); - test_lit!(pfx_nothing6, prefixes, "(?m)$"); - test_lit!(pfx_nothing7, prefixes, r"\b"); - test_lit!(pfx_nothing8, prefixes, r"\B"); - - // Test a few regexes that defeat any prefix literal detection. 
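For contrast with the removed tests below, the extractor in `regex_syntax::hir::literal` that this patch migrates to does not give up on a leading `\b`: it treats look-around assertions as empty literals, which is why the `exec.rs` portion of this patch has to discard prefixes itself when a pattern begins with a word boundary. A minimal sketch, assuming that newer API:

    use regex_syntax::{hir::literal::{Extractor, Seq}, Parser};

    fn main() {
        // The new extractor maps \b to an empty literal, so r"\ba" still
        // yields "a" as an exact prefix instead of an empty set.
        let hir = Parser::new().parse(r"\ba").unwrap();
        assert_eq!(Seq::new(["a"]), Extractor::new().extract(&hir));
    }

The removed tests that follow pin down the cases where the old extractor refused to produce any prefixes at all.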
- test_lit!(pfx_defeated1, prefixes, ".a"); - test_lit!(pfx_defeated2, prefixes, "(?s).a"); - test_lit!(pfx_defeated3, prefixes, "a*b*c*"); - test_lit!(pfx_defeated4, prefixes, "a|."); - test_lit!(pfx_defeated5, prefixes, ".|a"); - test_lit!(pfx_defeated6, prefixes, "a|^"); - test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); - test_lit!(pfx_defeated8, prefixes, "$a"); - test_lit!(pfx_defeated9, prefixes, "(?m)$a"); - test_lit!(pfx_defeated10, prefixes, r"\ba"); - test_lit!(pfx_defeated11, prefixes, r"\Ba"); - test_lit!(pfx_defeated12, prefixes, "^*a"); - test_lit!(pfx_defeated13, prefixes, "^+a"); - - test_lit!( - pfx_crazy1, - prefixes, - r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", - C("Mo\\'"), - C("Mu\\'"), - C("Moam"), - C("Muam") - ); - - // ************************************************************************ - // Tests for quiting prefix literal search. - // ************************************************************************ - - macro_rules! test_exhausted { - ($name:ident, $which:ident, $re:expr) => { - test_exhausted!($name, $which, $re,); - }; - ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { - #[test] - fn $name() { - let expr = ParserBuilder::new() - .build() - .parse($re) - .unwrap(); - let mut lits = Literals::empty(); - lits.set_limit_size(20).set_limit_class(10); - $which(&mut lits, &expr); - assert_lit_eq!(Unicode, lits, $($lit),*); - - let expr = ParserBuilder::new() - .allow_invalid_utf8(true) - .unicode(false) - .build() - .parse($re) - .unwrap(); - let mut lits = Literals::empty(); - lits.set_limit_size(20).set_limit_class(10); - $which(&mut lits, &expr); - assert_lit_eq!(Bytes, lits, $($lit),*); - } - }; - } - - // These test use a much lower limit than the default so that we can - // write test cases of reasonable size. - test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); - test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); - test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); - test_exhausted!( - pfx_exhausted4, - prefixes, - "(?i-u)foobar", - C("FO"), - C("fO"), - C("Fo"), - C("fo") - ); - test_exhausted!( - pfx_exhausted5, - prefixes, - "(?:ab){100}", - C("abababababababababab") - ); - test_exhausted!( - pfx_exhausted6, - prefixes, - "(?:(?:ab){100})*cd", - C("ababababab"), - M("cd") - ); - test_exhausted!( - pfx_exhausted7, - prefixes, - "z(?:(?:ab){100})*cd", - C("zababababab"), - M("zcd") - ); - test_exhausted!( - pfx_exhausted8, - prefixes, - "aaaaaaaaaaaaaaaaaaaaz", - C("aaaaaaaaaaaaaaaaaaaa") - ); - - // ************************************************************************ - // Tests for suffix literal extraction. - // ************************************************************************ - - // Elementary tests. 
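Before the removed elementary suffix tests below, it is worth noting how the replacement API expresses the same idea: rather than extracting reversed literals and flipping them back, `regex_syntax::hir::literal::Extractor` takes an explicit suffix mode, which is how `literal_analysis` in the `exec.rs` part of this patch requests suffixes. A minimal sketch, assuming that newer API:

    use regex_syntax::{hir::literal::{ExtractKind, Extractor, Seq}, Parser};

    fn main() {
        // Suffix literals come back in their natural (unreversed) byte order.
        let hir = Parser::new().parse(r"abc|xyz").unwrap();
        let seq = Extractor::new().kind(ExtractKind::Suffix).extract(&hir);
        assert_eq!(Seq::new(["abc", "xyz"]), seq);
    }

The removed tests that follow exercise the old `suffixes` path on the same kinds of elementary patterns.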
- test_lit!(sfx_one_lit1, suffixes, "a", M("a")); - test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); - test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); - #[cfg(feature = "unicode-case")] - test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); - test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); - test_lit!( - sfx_class2, - suffixes, - "(?u)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x98\\x83") - ); - #[cfg(feature = "unicode-case")] - test_lit!( - sfx_class3, - suffixes, - "(?ui)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x85\\xb0"), - M("\\xe2\\x98\\x83") - ); - test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); - test_lit!( - sfx_one_lit_casei2, - suffixes, - "(?i-u)abc", - M("ABC"), - M("ABc"), - M("AbC"), - M("Abc"), - M("aBC"), - M("aBc"), - M("abC"), - M("abc") - ); - test_lit!(sfx_group1, suffixes, "(a)", M("a")); - test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); - test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); - test_lit!(sfx_rep_zero_or_more1, suffixes, "a*"); - test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); - test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); - test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); - test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); - test_lit!(sfx_rep_range1, suffixes, "a{0}"); - test_lit!(sfx_rep_range2, suffixes, "a{0,}"); - test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); - test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); - test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); - test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); - test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); - - // Test regexes with concatenations. - test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); - test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); - test_lit!( - sfx_cat3, - suffixes, - "(?i-u)[ab]z", - M("AZ"), - M("Az"), - M("BZ"), - M("Bz"), - M("aZ"), - M("az"), - M("bZ"), - M("bz") - ); - test_lit!( - sfx_cat4, - suffixes, - "[ab][yz]", - M("ay"), - M("az"), - M("by"), - M("bz") - ); - test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); - test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); - test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); - test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); - test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); - test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); - test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); - test_lit!(sfx_cat12, suffixes, "ab+", C("b")); - test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); - test_lit!(sfx_cat14, suffixes, "a^"); - test_lit!(sfx_cat15, suffixes, "$a", C("a")); - test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); - test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); - test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); - test_lit!(sfx_cat19, suffixes, "a.z", C("z")); - - // Test regexes with alternations. - test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); - test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); - test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); - test_lit!(sfx_alt4, suffixes, "a|b*"); - test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); - test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); - test_lit!( - sfx_alt7, - suffixes, - "(a|b)*c|(a|ab)*c", - C("ac"), - C("bc"), - M("c"), - C("ac"), - C("abc"), - M("c") - ); - test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); - - // Test regexes with empty assertions. 
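The empty-assertion cases below show where the old extractor still produced suffixes (plain `a$`) and where it gave up (`(?m)a$`, see sfx_defeated9 further down). The new extractor reports `"a"` for both, so the `exec.rs` part of this patch rejects the multi-line form by consulting the HIR's look-set instead of relying on extraction to fail. A minimal sketch of that check, assuming the `Properties` API used there:

    use regex_syntax::{hir::Look, Parser};

    fn main() {
        // This is the condition exec.rs uses to drop suffix literals when a
        // multi-line '$' appears anywhere in the pattern.
        let hir = Parser::new().parse(r"(?m)a$").unwrap();
        assert!(hir.properties().look_set().contains(Look::EndLF));
    }

The removed empty-assertion tests follow.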
- test_lit!(sfx_empty1, suffixes, "a$", M("a")); - test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); - - // Make sure some curious regexes have no suffixes. - test_lit!(sfx_nothing1, suffixes, "."); - test_lit!(sfx_nothing2, suffixes, "(?s)."); - test_lit!(sfx_nothing3, suffixes, "^"); - test_lit!(sfx_nothing4, suffixes, "$"); - test_lit!(sfx_nothing6, suffixes, "(?m)$"); - test_lit!(sfx_nothing7, suffixes, r"\b"); - test_lit!(sfx_nothing8, suffixes, r"\B"); - - // Test a few regexes that defeat any suffix literal detection. - test_lit!(sfx_defeated1, suffixes, "a."); - test_lit!(sfx_defeated2, suffixes, "(?s)a."); - test_lit!(sfx_defeated3, suffixes, "a*b*c*"); - test_lit!(sfx_defeated4, suffixes, "a|."); - test_lit!(sfx_defeated5, suffixes, ".|a"); - test_lit!(sfx_defeated6, suffixes, "a|^"); - test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); - test_lit!(sfx_defeated8, suffixes, "a^"); - test_lit!(sfx_defeated9, suffixes, "(?m)a$"); - test_lit!(sfx_defeated10, suffixes, r"a\b"); - test_lit!(sfx_defeated11, suffixes, r"a\B"); - test_lit!(sfx_defeated12, suffixes, "a^*"); - test_lit!(sfx_defeated13, suffixes, "a^+"); - - // These test use a much lower limit than the default so that we can - // write test cases of reasonable size. - test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); - test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); - test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); - test_exhausted!( - sfx_exhausted4, - suffixes, - "(?i-u)foobar", - C("AR"), - C("Ar"), - C("aR"), - C("ar") - ); - test_exhausted!( - sfx_exhausted5, - suffixes, - "(?:ab){100}", - C("abababababababababab") - ); - test_exhausted!( - sfx_exhausted6, - suffixes, - "cd(?:(?:ab){100})*", - C("ababababab"), - M("cd") - ); - test_exhausted!( - sfx_exhausted7, - suffixes, - "cd(?:(?:ab){100})*z", - C("abababababz"), - M("cdz") - ); - test_exhausted!( - sfx_exhausted8, - suffixes, - "zaaaaaaaaaaaaaaaaaaaa", - C("aaaaaaaaaaaaaaaaaaaa") - ); - - // ************************************************************************ - // Tests for generating unambiguous literal sets. - // ************************************************************************ - - macro_rules! 
test_unamb { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec = $given - .into_iter() - .map(|ul| { - let cut = ul.is_cut(); - Literal { v: ul.v.into_bytes(), cut: cut } - }) - .collect(); - let lits = create_lits(given); - let got = lits.unambiguous_prefixes(); - assert_eq!($expected, escape_lits(got.literals())); - } - }; - } - - test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); - test_unamb!( - unambiguous2, - vec![M("zaaaaaa"), M("aa")], - vec![C("aa"), C("z")] - ); - test_unamb!( - unambiguous3, - vec![M("Sherlock"), M("Watson")], - vec![M("Sherlock"), M("Watson")] - ); - test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); - test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); - test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); - test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); - test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); - test_unamb!( - unambiguous9, - vec![M("ac"), M("bc"), M("c"), M("ac"), M("abc"), M("c")], - vec![C("a"), C("b"), C("c")] - ); - test_unamb!( - unambiguous10, - vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], - vec![C("Mo"), C("Mu")] - ); - test_unamb!( - unambiguous11, - vec![M("zazb"), M("azb")], - vec![C("a"), C("z")] - ); - test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); - test_unamb!( - unambiguous13, - vec![M("ABCX"), M("CDAX"), M("BCX")], - vec![C("A"), C("BCX"), C("CD")] - ); - test_unamb!( - unambiguous14, - vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], - vec![M("DSX"), C("I"), C("MGX"), C("MV")] - ); - test_unamb!( - unambiguous15, - vec![M("IMG_"), M("MG_"), M("CIMG")], - vec![C("C"), C("I"), C("MG_")] - ); - - // ************************************************************************ - // Tests for suffix trimming. - // ************************************************************************ - macro_rules! test_trim { - ($name:ident, $trim:expr, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec = $given - .into_iter() - .map(|ul| { - let cut = ul.is_cut(); - Literal { v: ul.v.into_bytes(), cut: cut } - }) - .collect(); - let lits = create_lits(given); - let got = lits.trim_suffix($trim).unwrap(); - assert_eq!($expected, escape_lits(got.literals())); - } - }; - } - - test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); - test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); - test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); - test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); - - // ************************************************************************ - // Tests for longest common prefix. - // ************************************************************************ - - macro_rules! 
test_lcp { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec = $given - .into_iter() - .map(|s: &str| Literal { - v: s.to_string().into_bytes(), - cut: false, - }) - .collect(); - let lits = create_lits(given); - let got = lits.longest_common_prefix(); - assert_eq!($expected, escape_bytes(got)); - } - }; - } - - test_lcp!(lcp1, vec!["a"], "a"); - test_lcp!(lcp2, vec![], ""); - test_lcp!(lcp3, vec!["a", "b"], ""); - test_lcp!(lcp4, vec!["ab", "ab"], "ab"); - test_lcp!(lcp5, vec!["ab", "a"], "a"); - test_lcp!(lcp6, vec!["a", "ab"], "a"); - test_lcp!(lcp7, vec!["ab", "b"], ""); - test_lcp!(lcp8, vec!["b", "ab"], ""); - test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); - test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); - test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); - test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); - - // ************************************************************************ - // Tests for longest common suffix. - // ************************************************************************ - - macro_rules! test_lcs { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec = $given - .into_iter() - .map(|s: &str| Literal { - v: s.to_string().into_bytes(), - cut: false, - }) - .collect(); - let lits = create_lits(given); - let got = lits.longest_common_suffix(); - assert_eq!($expected, escape_bytes(got)); - } - }; - } - - test_lcs!(lcs1, vec!["a"], "a"); - test_lcs!(lcs2, vec![], ""); - test_lcs!(lcs3, vec!["a", "b"], ""); - test_lcs!(lcs4, vec!["ab", "ab"], "ab"); - test_lcs!(lcs5, vec!["ab", "a"], ""); - test_lcs!(lcs6, vec!["a", "ab"], ""); - test_lcs!(lcs7, vec!["ab", "b"], "b"); - test_lcs!(lcs8, vec!["b", "ab"], "b"); - test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); - test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); - test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); - test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); -} diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 79b48af82..26c17dca3 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -82,9 +82,8 @@ in a monospace font. # Literal extraction This crate provides limited support for [literal extraction from `Hir` -values](hir::literal::Literals). Be warned that literal extraction currently -uses recursion, and therefore, stack size proportional to the size of the -`Hir`. +values](hir::literal). Be warned that literal extraction uses recursion, and +therefore, stack size proportional to the size of the `Hir`. The purpose of literal extraction is to speed up searches. That is, if you know a regular expression must match a prefix or suffix literal, then it is diff --git a/src/exec.rs b/src/exec.rs index eafd4e63b..194cf71fe 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -5,7 +5,7 @@ use std::sync::Arc; #[cfg(feature = "perf-literal")] use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; -use regex_syntax::hir::literal::Literals; +use regex_syntax::hir::literal; use regex_syntax::hir::{Hir, Look}; use regex_syntax::ParserBuilder; @@ -78,15 +78,18 @@ struct ExecReadOnly { /// not supported.) Note that this program contains an embedded `.*?` /// preceding the first capture group, unless the regex is anchored at the /// beginning. + #[allow(dead_code)] dfa: Program, /// The same as above, except the program is reversed (and there is no /// preceding `.*?`). This is used by the DFA to find the starting location /// of matches. 
+ #[allow(dead_code)] dfa_reverse: Program, /// A set of suffix literals extracted from the regex. /// /// Prefix literals are stored on the `Program`, since they are used inside /// the matching engines. + #[allow(dead_code)] suffixes: LiteralSearcher, /// An Aho-Corasick automaton with leftmost-first match semantics. /// @@ -121,8 +124,8 @@ pub struct ExecBuilder { /// literals. struct Parsed { exprs: Vec, - prefixes: Literals, - suffixes: Literals, + prefixes: literal::Seq, + suffixes: literal::Seq, bytes: bool, } @@ -228,8 +231,8 @@ impl ExecBuilder { /// Parse the current set of patterns into their AST and extract literals. fn parse(&self) -> Result { let mut exprs = Vec::with_capacity(self.options.pats.len()); - let mut prefixes = Some(Literals::empty()); - let mut suffixes = Some(Literals::empty()); + let mut prefixes = Some(literal::Seq::empty()); + let mut suffixes = Some(literal::Seq::empty()); let mut bytes = false; let is_set = self.options.pats.len() > 1; // If we're compiling a regex set and that set has any anchored @@ -264,14 +267,18 @@ impl ExecBuilder { // Regex sets with anchors do not go well with literal // optimizations. prefixes = None; + } else if props.look_set_prefix().contains_word() { + // The new literal extractor ignores look-around while + // the old one refused to extract prefixes from regexes + // that began with a \b. These old creaky regex internals + // can't deal with it, so we drop it. + prefixes = None; + } else if props.look_set().contains(Look::StartLF) { + // Similar to the reasoning for word boundaries, this old + // regex engine can't handle literal prefixes with '(?m:^)' + // at the beginning of a regex. + prefixes = None; } - prefixes = prefixes.and_then(|mut prefixes| { - if !prefixes.union_prefixes(&expr) { - None - } else { - Some(prefixes) - } - }); if !props.look_set_suffix().contains(Look::End) && props.look_set().contains(Look::End) @@ -284,21 +291,45 @@ impl ExecBuilder { // Regex sets with anchors do not go well with literal // optimizations. suffixes = None; + } else if props.look_set_suffix().contains_word() { + // See the prefix case for reasoning here. + suffixes = None; + } else if props.look_set().contains(Look::EndLF) { + // See the prefix case for reasoning here. + suffixes = None; } - suffixes = suffixes.and_then(|mut suffixes| { - if !suffixes.union_suffixes(&expr) { - None + + let (mut pres, mut suffs) = + if prefixes.is_none() && suffixes.is_none() { + (literal::Seq::infinite(), literal::Seq::infinite()) } else { - Some(suffixes) - } + literal_analysis(&expr) + }; + // These old creaky regex internals can't handle cases where + // the literal sequences are exact but there are look-around + // assertions. So we make sure the sequences are inexact if + // there are look-around assertions anywhere. This forces the + // regex engines to run instead of assuming that a literal + // match implies an overall match. 
+ if !props.look_set().is_empty() { + pres.make_inexact(); + suffs.make_inexact(); + } + prefixes = prefixes.and_then(|mut prefixes| { + prefixes.union(&mut pres); + Some(prefixes) + }); + suffixes = suffixes.and_then(|mut suffixes| { + suffixes.union(&mut suffs); + Some(suffixes) }); } exprs.push(expr); } Ok(Parsed { exprs, - prefixes: prefixes.unwrap_or_else(Literals::empty), - suffixes: suffixes.unwrap_or_else(Literals::empty), + prefixes: prefixes.unwrap_or_else(literal::Seq::empty), + suffixes: suffixes.unwrap_or_else(literal::Seq::empty), bytes, }) } @@ -1597,6 +1628,48 @@ fn alternation_literals(expr: &Hir) -> Option>> { Some(lits) } +#[cfg(not(feature = "perf-literal"))] +fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) { + (literal::Seq::infinite(), literal::Seq::infinite()) +} + +#[cfg(feature = "perf-literal")] +fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) { + const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)]; + + let mut prefixes = literal::Extractor::new() + .kind(literal::ExtractKind::Prefix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match prefixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + prefixes.keep_first_bytes(keep); + prefixes.minimize_by_preference(); + } + + let mut suffixes = literal::Extractor::new() + .kind(literal::ExtractKind::Suffix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match suffixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + suffixes.keep_last_bytes(keep); + suffixes.minimize_by_preference(); + } + + (prefixes, suffixes) +} + #[cfg(test)] mod test { #[test] diff --git a/src/literal/imp.rs b/src/literal/imp.rs index 90b2f1160..89fddca0e 100644 --- a/src/literal/imp.rs +++ b/src/literal/imp.rs @@ -2,7 +2,7 @@ use std::mem; use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; use memchr::{memchr, memchr2, memchr3, memmem}; -use regex_syntax::hir::literal::{Literal, Literals}; +use regex_syntax::hir::literal::{Literal, Seq}; /// A prefix extracted from a compiled regular expression. /// @@ -39,27 +39,26 @@ enum Matcher { impl LiteralSearcher { /// Returns a matcher that never matches and never advances the input. pub fn empty() -> Self { - Self::new(Literals::empty(), Matcher::Empty) + Self::new(Seq::infinite(), Matcher::Empty) } /// Returns a matcher for literal prefixes from the given set. - pub fn prefixes(lits: Literals) -> Self { + pub fn prefixes(lits: Seq) -> Self { let matcher = Matcher::prefixes(&lits); Self::new(lits, matcher) } /// Returns a matcher for literal suffixes from the given set. 
- pub fn suffixes(lits: Literals) -> Self { + pub fn suffixes(lits: Seq) -> Self { let matcher = Matcher::suffixes(&lits); Self::new(lits, matcher) } - fn new(lits: Literals, matcher: Matcher) -> Self { - let complete = lits.all_complete(); + fn new(lits: Seq, matcher: Matcher) -> Self { LiteralSearcher { - complete, - lcp: Memmem::new(lits.longest_common_prefix()), - lcs: Memmem::new(lits.longest_common_suffix()), + complete: lits.is_exact(), + lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")), + lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")), matcher, } } @@ -169,20 +168,24 @@ impl LiteralSearcher { } impl Matcher { - fn prefixes(lits: &Literals) -> Self { + fn prefixes(lits: &Seq) -> Self { let sset = SingleByteSet::prefixes(lits); Matcher::new(lits, sset) } - fn suffixes(lits: &Literals) -> Self { + fn suffixes(lits: &Seq) -> Self { let sset = SingleByteSet::suffixes(lits); Matcher::new(lits, sset) } - fn new(lits: &Literals, sset: SingleByteSet) -> Self { - if lits.literals().is_empty() { + fn new(lits: &Seq, sset: SingleByteSet) -> Self { + if lits.is_empty() || lits.min_literal_len() == Some(0) { return Matcher::Empty; } + let lits = match lits.literals() { + None => return Matcher::Empty, + Some(members) => members, + }; if sset.dense.len() >= 26 { // Avoid trying to match a large number of single bytes. // This is *very* sensitive to a frequency analysis comparison @@ -195,18 +198,18 @@ impl Matcher { if sset.complete { return Matcher::Bytes(sset); } - if lits.literals().len() == 1 { - return Matcher::Memmem(Memmem::new(&lits.literals()[0])); + if lits.len() == 1 { + return Matcher::Memmem(Memmem::new(lits[0].as_bytes())); } - let pats = lits.literals().to_owned(); + let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect(); let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; - if lits.literals().len() <= 100 && !is_aho_corasick_fast { + if lits.len() <= 100 && !is_aho_corasick_fast { let mut builder = packed::Config::new() .match_kind(packed::MatchKind::LeftmostFirst) .builder(); if let Some(s) = builder.extend(&pats).build() { - return Matcher::Packed { s, lits: pats }; + return Matcher::Packed { s, lits: lits.to_owned() }; } } let ac = AhoCorasickBuilder::new() @@ -214,7 +217,7 @@ impl Matcher { .dfa(true) .build_with_size::(&pats) .unwrap(); - Matcher::AC { ac, lits: pats } + Matcher::AC { ac, lits: lits.to_owned() } } } @@ -257,7 +260,7 @@ impl<'a> Iterator for LiteralIter<'a> { } else { let next = &lits[0]; *lits = &lits[1..]; - Some(&**next) + Some(next.as_bytes()) } } LiteralIter::Packed(ref mut lits) => { @@ -266,7 +269,7 @@ impl<'a> Iterator for LiteralIter<'a> { } else { let next = &lits[0]; *lits = &lits[1..]; - Some(&**next) + Some(next.as_bytes()) } } } @@ -291,11 +294,15 @@ impl SingleByteSet { } } - fn prefixes(lits: &Literals) -> SingleByteSet { + fn prefixes(lits: &Seq) -> SingleByteSet { let mut sset = SingleByteSet::new(); - for lit in lits.literals() { + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { sset.complete = sset.complete && lit.len() == 1; - if let Some(&b) = lit.get(0) { + if let Some(&b) = lit.as_bytes().get(0) { if !sset.sparse[b as usize] { if b > 0x7F { sset.all_ascii = false; @@ -308,11 +315,15 @@ impl SingleByteSet { sset } - fn suffixes(lits: &Literals) -> SingleByteSet { + fn suffixes(lits: &Seq) -> SingleByteSet { let mut sset = SingleByteSet::new(); - for lit in lits.literals() { + let lits = match lits.literals() { + 
None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { sset.complete = sset.complete && lit.len() == 1; - if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { + if let Some(&b) = lit.as_bytes().last() { if !sset.sparse[b as usize] { if b > 0x7F { sset.all_ascii = false; diff --git a/src/literal/mod.rs b/src/literal/mod.rs index 980f52330..b9fb77aed 100644 --- a/src/literal/mod.rs +++ b/src/literal/mod.rs @@ -6,7 +6,7 @@ mod imp; #[allow(missing_docs)] #[cfg(not(feature = "perf-literal"))] mod imp { - use regex_syntax::hir::literal::Literals; + use regex_syntax::hir::literal::Seq; #[derive(Clone, Debug)] pub struct LiteralSearcher(()); @@ -16,11 +16,11 @@ mod imp { LiteralSearcher(()) } - pub fn prefixes(_: Literals) -> Self { + pub fn prefixes(_: Seq) -> Self { LiteralSearcher(()) } - pub fn suffixes(_: Literals) -> Self { + pub fn suffixes(_: Seq) -> Self { LiteralSearcher(()) } From 724ae3e3fd55773f6c3b8e3dfaddc0203131c0ea Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 9 Oct 2022 19:06:57 -0400 Subject: [PATCH 41/79] syntax: add --lib to syntax tests I couldn't figure how how to easily make doc tests run with 'no_std' enabled, which regex-syntax now does. The '?' in particular was tripping me up. We still get doctest coverage from the top-level 'cargo test'. --- regex-syntax/test | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/regex-syntax/test b/regex-syntax/test index d03db94b4..50854d2b3 100755 --- a/regex-syntax/test +++ b/regex-syntax/test @@ -19,5 +19,8 @@ features=( ) for f in "${features[@]}"; do echo "===== FEATURE: $f ===" - cargo test --no-default-features --features "$f" + # We only run library tests because I couldn't figure out how to easily + # make doc tests run in 'no_std' mode. In particular, without the Error + # trait, using '?' in doc tests seems tricky. + cargo test --no-default-features --lib --features "$f" done From 60b9a6cb6a1e7bf688de28552348d24c4e8939ed Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 10 Oct 2022 10:11:12 -0400 Subject: [PATCH 42/79] syntax: rewrite 'cls1|..|clsN' as '[cls1..clsN]' Whenever we have an alternation where each of its branches are just classes, we can always combined that into a single class. Single classes are generally going to be cheaper to process further down the pipeline. Namely, instead of needing to branch between them at a higher level in an NFA graph, they can handled as one single unit. --- regex-syntax/src/hir/mod.rs | 80 +++++++++++++++++++++++++++++++ regex-syntax/src/hir/translate.rs | 39 +++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index f2a95204a..bffc297f9 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -447,6 +447,14 @@ impl Hir { .map(|b| ClassBytesRange { start: b, end: b }); return Hir::class(Class::Bytes(ClassBytes::new(it))); } + // Similar to singleton chars, we can also look for alternations of + // classes. Those can be smushed into a single class. + if let Some(cls) = class_chars(&new) { + return Hir::class(cls); + } + if let Some(cls) = class_bytes(&new) { + return Hir::class(cls); + } let props = Properties::alternation(&new); Hir { kind: HirKind::Alternation(new), props } } @@ -854,6 +862,23 @@ impl ClassUnicode { None } } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent byte class. 
+ pub fn to_byte_class(&self) -> Option { + if !self.is_all_ascii() { + return None; + } + Some(ClassBytes::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our codepoint range is ASCII, the + // 'u8::try_from' calls below are guaranteed to be correct. + ClassBytesRange { + // MSRV(1.59): Use 'u8::try_from(c)' instead. + start: u8::try_from(u32::from(r.start)).unwrap(), + end: u8::try_from(u32::from(r.end)).unwrap(), + } + }))) + } } /// An iterator over all ranges in a Unicode character class. @@ -1120,6 +1145,23 @@ impl ClassBytes { None } } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent Unicode class. + pub fn to_unicode_class(&self) -> Option { + if !self.is_all_ascii() { + return None; + } + Some(ClassUnicode::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our byte range is ASCII, the + // 'char::from' calls below are correct and will not erroneously + // convert a raw byte value into its corresponding codepoint. + ClassUnicodeRange { + start: char::from(r.start), + end: char::from(r.end), + } + }))) + } } /// An iterator over all ranges in a byte character class. @@ -1936,6 +1978,44 @@ impl Iterator for LookSetIter { } } +/// Given a sequence of HIR values where each value corresponds to a Unicode +/// class (or an all-ASCII byte class), return a single Unicode class +/// corresponding to the union of the classes found. +fn class_chars(hirs: &[Hir]) -> Option { + let mut cls = ClassUnicode::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(cls2); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(&cls2.to_unicode_class()?); + } + _ => return None, + }; + } + Some(Class::Unicode(cls)) +} + +/// Given a sequence of HIR values where each value corresponds to a byte class +/// (or an all-ASCII Unicode class), return a single byte class corresponding +/// to the union of the classes found. +fn class_bytes(hirs: &[Hir]) -> Option { + let mut cls = ClassBytes::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(&cls2.to_byte_class()?); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(cls2); + } + _ => return None, + }; + } + Some(Class::Bytes(cls)) +} + /// Given a sequence of HIR values where each value corresponds to a literal /// that is a single `char`, return that sequence of `char`s. Otherwise return /// None. No deduplication is done. diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 8934139a2..acbb98472 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1963,6 +1963,45 @@ mod tests { ); } + // Tests the HIR transformation of things like '[a-z]|[A-Z]' into + // '[A-Za-z]'. In other words, an alternation of just classes is always + // equivalent to a single class corresponding to the union of the branches + // in that class. (Unless some branches match invalid UTF-8 and others + // match non-ASCII Unicode.) + #[test] + fn cat_class_flattened() { + assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + // Combining all of the letter properties should give us the one giant + // letter property. 
+ #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x) + \p{Lowercase_Letter} + |\p{Uppercase_Letter} + |\p{Titlecase_Letter} + |\p{Modifier_Letter} + |\p{Other_Letter} + "), + hir_uclass_query(ClassQuery::Binary("letter")) + ); + // Byte classes that can truly match invalid UTF-8 cannot be combined + // with Unicode classes. + assert_eq!( + t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"), + hir_alt(vec![ + hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]), + hir_bclass(&[(b'\x90', b'\xFF')]), + hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]), + ]) + ); + // Byte classes on their own can be combined, even if some are ASCII + // and others are invalid UTF-8. + assert_eq!( + t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"), + hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]), + ); + } + #[test] fn class_ascii() { assert_eq!( From 6d254aad3f27315a0c19bd6fd82e5aafb336c2fb Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 10 Oct 2022 20:48:26 -0400 Subject: [PATCH 43/79] syntax: remove 'deny(warnings)' This is generally overall pretty annoying. --- regex-syntax/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 26c17dca3..f44bba7f0 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -161,7 +161,6 @@ The following features are available: #![no_std] #![forbid(unsafe_code)] #![deny(missing_docs, rustdoc::broken_intra_doc_links)] -#![doc(test(attr(deny(warnings))))] #![warn(missing_debug_implementations)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] From 01a89b6fdef3e1efd36c457a9c510316610e309c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Wed, 19 Oct 2022 08:43:05 -0400 Subject: [PATCH 44/79] syntax: add Properties::{union,captures_len} This factors out the constructor for properties for an alternation into a public API method called "union." This is useful for collapsing multiple the properties for multiple regexes down into one analyzeable unit. The 'captures_len' method is also useful for making decisions like "if this regex has no captures and is all literals, then we don't ever need to use a regex engine under any circumstance." --- regex-syntax/src/hir/mod.rs | 168 +++++++++++++++++++----------- regex-syntax/src/hir/translate.rs | 17 +++ 2 files changed, 124 insertions(+), 61 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index bffc297f9..7d291e607 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1506,6 +1506,7 @@ struct PropertiesI { look_set_prefix: LookSet, look_set_suffix: LookSet, utf8: bool, + captures_len: usize, literal: bool, alternation_literal: bool, } @@ -1582,6 +1583,17 @@ impl Properties { self.0.utf8 } + /// Returns the total number of explicit capturing groups in the + /// corresponding HIR. + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match that is typically included by regex + /// engines. So for example, this method will return `0` for `a` and `1` + /// for `(a)`. + pub fn captures_len(&self) -> usize { + self.0.captures_len + } + /// Return true if and only if this HIR is a simple literal. This is only /// true when this HIR expression is either itself a `Literal` or a /// concatenation of only `Literal`s. @@ -1603,6 +1615,69 @@ impl Properties { pub fn is_alternation_literal(&self) -> bool { self.0.alternation_literal } + + /// Returns a new set of properties that corresponds to the union of the + /// iterator of properties given. 
+ /// + /// This is useful when one has multiple `Hir` expressions and wants + /// to combine them into a single alternation without constructing the + /// corresponding `Hir`. This routine provides a way of combining the + /// properties of each `Hir` expression into one set of properties + /// representing the union of those expressions. + pub fn union(props: I) -> Properties + where + I: IntoIterator, + P: core::borrow::Borrow, + { + let mut it = props.into_iter().peekable(); + // While empty alternations aren't possible, we still behave as if they + // are. When we have an empty alternate, then clearly the look-around + // prefix and suffix is empty. Otherwise, it is the intersection of all + // prefixes and suffixes (respectively) of the branches. + let fix = if it.peek().is_none() { + LookSet::empty() + } else { + LookSet::full() + }; + // The base case is an empty alternation, which matches nothing. + // Note though that empty alternations aren't possible, because the + // Hir::alternation smart constructor rewrites those as empty character + // classes. + let mut props = PropertiesI { + minimum_len: None, + maximum_len: None, + look_set: LookSet::empty(), + look_set_prefix: fix, + look_set_suffix: fix, + utf8: true, + captures_len: 0, + literal: false, + alternation_literal: true, + }; + // Handle properties that need to visit every child hir. + for prop in it { + let p = prop.borrow(); + props.look_set.union(p.look_set()); + props.look_set_prefix.intersect(p.look_set_prefix()); + props.look_set_suffix.intersect(p.look_set_suffix()); + props.utf8 = props.utf8 && p.is_utf8(); + props.captures_len = + props.captures_len.saturating_add(p.captures_len()); + props.alternation_literal = + props.alternation_literal && p.is_alternation_literal(); + if let Some(xmin) = p.minimum_len() { + if props.minimum_len.map_or(true, |pmin| xmin < pmin) { + props.minimum_len = Some(xmin); + } + } + if let Some(xmax) = p.maximum_len() { + if props.maximum_len.map_or(true, |pmax| xmax > pmax) { + props.maximum_len = Some(xmax); + } + } + } + Properties(Box::new(props)) + } } impl Properties { @@ -1632,6 +1707,7 @@ impl Properties { // were false, for example, then 'a*' would also need to be false // since it too can match the empty string. utf8: true, + captures_len: 0, literal: false, alternation_literal: false, }; @@ -1647,6 +1723,7 @@ impl Properties { look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), utf8: core::str::from_utf8(&lit.0).is_ok(), + captures_len: 0, literal: true, alternation_literal: true, }; @@ -1662,6 +1739,7 @@ impl Properties { look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), utf8: class.is_utf8(), + captures_len: 0, literal: false, alternation_literal: false, }; @@ -1693,6 +1771,7 @@ impl Properties { look_set_prefix: LookSet::singleton(look), look_set_suffix: LookSet::singleton(look), utf8, + captures_len: 0, literal: false, alternation_literal: false, }; @@ -1701,31 +1780,31 @@ impl Properties { /// Create a new set of HIR properties for a repetition. 
fn repetition(rep: &Repetition) -> Properties { - let minimum_len = - rep.hir.properties().minimum_len().map(|child_min| { - let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX); - child_min.saturating_mul(rep_min) - }); + let p = rep.hir.properties(); + let minimum_len = p.minimum_len().map(|child_min| { + let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX); + child_min.saturating_mul(rep_min) + }); let maximum_len = rep.max.and_then(|rep_max| { let rep_max = usize::try_from(rep_max).ok()?; - let child_max = rep.hir.properties().maximum_len()?; + let child_max = p.maximum_len()?; child_max.checked_mul(rep_max) }); let mut inner = PropertiesI { minimum_len, maximum_len, - look_set: rep.hir.properties().look_set(), + look_set: p.look_set(), look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), - utf8: rep.hir.properties().is_utf8(), + utf8: p.is_utf8(), + captures_len: p.captures_len(), literal: false, alternation_literal: false, }; if !rep.is_match_empty() { - let child_props = rep.hir.properties(); - inner.look_set_prefix = child_props.look_set_prefix(); - inner.look_set_suffix = child_props.look_set_suffix(); + inner.look_set_prefix = p.look_set_prefix(); + inner.look_set_suffix = p.look_set_suffix(); } Properties(Box::new(inner)) } @@ -1736,10 +1815,12 @@ impl Properties { // their child expressions. But the literal properties somewhat // over-constrained in what they represent in order to make downstream // analyses a bit more straight-forward. + let p = group.hir.properties(); Properties(Box::new(PropertiesI { + captures_len: p.captures_len().saturating_add(1), literal: false, alternation_literal: false, - ..*group.hir.properties().0.clone() + ..*p.0.clone() })) } @@ -1756,26 +1837,30 @@ impl Properties { look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), utf8: true, + captures_len: 0, literal: true, alternation_literal: true, }; // Handle properties that need to visit every child hir. for x in concat.iter() { - props.look_set.union(x.properties().look_set()); - props.utf8 = props.utf8 && x.properties().is_utf8(); - props.literal = props.literal && x.properties().is_literal(); - props.alternation_literal = props.alternation_literal - && x.properties().is_alternation_literal(); + let p = x.properties(); + props.look_set.union(p.look_set()); + props.utf8 = props.utf8 && p.is_utf8(); + props.captures_len = + props.captures_len.saturating_add(p.captures_len()); + props.literal = props.literal && p.is_literal(); + props.alternation_literal = + props.alternation_literal && p.is_alternation_literal(); if let Some(ref mut minimum_len) = props.minimum_len { - match x.properties().minimum_len() { + match p.minimum_len() { None => props.minimum_len = None, - Some(x) => *minimum_len += x, + Some(len) => *minimum_len += len, } } if let Some(ref mut maximum_len) = props.maximum_len { - match x.properties().maximum_len() { + match p.maximum_len() { None => props.maximum_len = None, - Some(x) => *maximum_len += x, + Some(len) => *maximum_len += len, } } } @@ -1801,46 +1886,7 @@ impl Properties { /// Create a new set of HIR properties for a concatenation. fn alternation(alts: &[Hir]) -> Properties { - // While empty alternations aren't possible, we still behave as if they - // are. When we have an empty alternate, then clearly the look-around - // prefix and suffix is empty. Otherwise, it is the intersection of all - // prefixes and suffixes (respectively) of the branches. 
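Concretely, the intersection described above means an alternation only advertises a prefix assertion when every branch carries it. A minimal sketch of how that surfaces through the public `Properties` API (assuming `Parser` and `hir::Look` as used elsewhere in this patch):

    use regex_syntax::{hir::Look, Parser};

    fn main() {
        // Every branch is anchored, so the prefix look-set keeps Start...
        let hir = Parser::new().parse(r"^foo|^bar").unwrap();
        assert!(hir.properties().look_set_prefix().contains(Look::Start));
        // ...but a single unanchored branch empties the intersection.
        let hir = Parser::new().parse(r"^foo|bar").unwrap();
        assert!(!hir.properties().look_set_prefix().contains(Look::Start));
    }

The removed per-alternation bookkeeping below is what `Properties::union` now performs for any iterator of child properties.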
- let fix = - if alts.is_empty() { LookSet::empty() } else { LookSet::full() }; - // The base case is an empty alternation, which matches nothing. - // Note though that empty alternations aren't possible, because the - // Hir::alternation smart constructor rewrites those as empty character - // classes. - let mut props = PropertiesI { - minimum_len: None, - maximum_len: None, - look_set: LookSet::empty(), - look_set_prefix: fix, - look_set_suffix: fix, - utf8: true, - literal: false, - alternation_literal: true, - }; - // Handle properties that need to visit every child hir. - for x in alts.iter() { - props.look_set.union(x.properties().look_set()); - props.look_set_prefix.intersect(x.properties().look_set_prefix()); - props.look_set_suffix.intersect(x.properties().look_set_suffix()); - props.utf8 = props.utf8 && x.properties().is_utf8(); - props.alternation_literal = props.alternation_literal - && x.properties().is_alternation_literal(); - if let Some(xmin) = x.properties().minimum_len() { - if props.minimum_len.map_or(true, |pmin| xmin < pmin) { - props.minimum_len = Some(xmin); - } - } - if let Some(xmax) = x.properties().maximum_len() { - if props.maximum_len.map_or(true, |pmax| xmax > pmax) { - props.maximum_len = Some(xmax); - } - } - } - Properties(Box::new(props)) + Properties::union(alts.iter().map(|hir| hir.properties())) } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index acbb98472..b7608c064 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3106,6 +3106,23 @@ mod tests { assert!(!props_bytes(r"(?-u)\B").is_utf8()); } + #[test] + fn analysis_captures_len() { + assert_eq!(0, props(r"a").captures_len()); + assert_eq!(0, props(r"(?:a)").captures_len()); + assert_eq!(0, props(r"(?i:a)").captures_len()); + assert_eq!(0, props(r"(?i)a").captures_len()); + assert_eq!(1, props(r"(a)").captures_len()); + assert_eq!(1, props(r"(?Pa)").captures_len()); + assert_eq!(1, props(r"()").captures_len()); + assert_eq!(1, props(r"()a").captures_len()); + assert_eq!(1, props(r"(a)+").captures_len()); + assert_eq!(2, props(r"(a)(b)").captures_len()); + assert_eq!(2, props(r"(a)|(b)").captures_len()); + assert_eq!(2, props(r"((a))").captures_len()); + assert_eq!(1, props(r"([a&&b])").captures_len()); + } + #[test] fn analysis_is_all_assertions() { // Positive examples. From e995b731c30f0cba0968e75a3c94f17bba97639a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Jan 2023 14:03:17 -0500 Subject: [PATCH 45/79] syntax: add more convenience routines to LookSet This makes it a little terser to check different types of word boundaries in the lookset. --- regex-syntax/src/hir/mod.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 7d291e607..ec273a922 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1965,12 +1965,24 @@ impl LookSet { /// word boundaries. #[inline] pub fn contains_word(&self) -> bool { - self.contains(Look::WordAscii) - || self.contains(Look::WordAsciiNegate) - || self.contains(Look::WordUnicode) + self.contains_word_unicode() || self.contains_word_ascii() + } + + /// Returns true if and only if this set contains any Unicode word boundary + /// or negated Unicode word boundary assertions. 
+ #[inline] + pub fn contains_word_unicode(&self) -> bool { + self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) } + /// Returns true if and only if this set contains any ASCII word boundary + /// or negated ASCII word boundary assertions. + #[inline] + pub fn contains_word_ascii(&self) -> bool { + self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) + } + /// Modifies this set to be the union of itself and the set given. #[inline] pub fn union(&mut self, other: LookSet) { From 72187cb63521f357ec174609392d1cc965172fc7 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Jan 2023 14:07:46 -0500 Subject: [PATCH 46/79] syntax: move around somethings This gets rid of the AsRef<[u8]> FromIterator impl for Seq, which is unfortunate, but it lets us provide an AsRef<[u8]> impl for Literal. The latter ends up being quite useful to avoid copying and/or extra allocs. --- regex-syntax/src/hir/literal.rs | 174 +++++++++++++++++--------------- 1 file changed, 91 insertions(+), 83 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index de46db291..05278b5a9 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -169,6 +169,37 @@ impl Extractor { } } + /// Execute the extractor and return a sequence of literals. + pub fn extract(&self, hir: &Hir) -> Seq { + use crate::hir::HirKind::*; + + match *hir.kind() { + Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Literal(hir::Literal(ref bytes)) => { + let mut seq = + Seq::singleton(self::Literal::exact(bytes.to_vec())); + self.enforce_literal_len(&mut seq); + seq + } + Class(hir::Class::Unicode(ref cls)) => { + self.extract_class_unicode(cls) + } + Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), + Repetition(ref rep) => self.extract_repetition(rep), + Group(hir::Group { ref hir, .. }) => self.extract(hir), + Concat(ref hirs) => match self.kind { + ExtractKind::Prefix => self.extract_concat(hirs.iter()), + ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()), + }, + Alternation(ref hirs) => { + // Unlike concat, we always union starting from the beginning, + // since the beginning corresponds to the highest preference, + // which doesn't change based on forwards vs reverse. + self.extract_alternation(hirs.iter()) + } + } + } + /// Set the kind of literal sequence to extract from an [`Hir`] expression. /// /// The default is to extract prefixes, but suffixes can be selected @@ -211,7 +242,7 @@ impl Extractor { /// let hir = Parser::new().parse(r"[0-9]")?; /// /// let got = Extractor::new().extract(&hir); - /// let expected = Seq::from_iter([ + /// let expected = Seq::new([ /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", /// ]); /// assert_eq!(expected, got); @@ -248,7 +279,7 @@ impl Extractor { /// let hir = Parser::new().parse(r"(abc){8}")?; /// /// let got = Extractor::new().extract(&hir); - /// let expected = Seq::from_iter(["abcabcabcabcabcabcabcabc"]); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); /// assert_eq!(expected, got); /// /// // Now let's shrink the limit and see how that changes things. @@ -285,7 +316,7 @@ impl Extractor { /// let hir = Parser::new().parse(r"(abc){2}{2}{2}")?; /// /// let got = Extractor::new().extract(&hir); - /// let expected = Seq::from_iter(["abcabcabcabcabcabcabcabc"]); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); /// assert_eq!(expected, got); /// /// // Now let's shrink the limit and see how that changes things. 
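As a rough illustration of the API reshuffling in this patch (a sketch, not part of the patch itself; it assumes a regex-syntax build that includes these changes and wraps the calls in a `main` for readability), the new `Seq::new` constructor, the remaining `FromIterator<Literal>` impl, and the new `AsRef<[u8]>` impl on `Literal` fit together like this:

```
use regex_syntax::hir::literal::{Literal, Seq};

fn main() {
    // Seq::new builds a sequence of *exact* literals from anything that
    // yields byte strings, taking over from the old
    // `FromIterator<B: AsRef<[u8]>>` impl that this patch removes.
    let seq = Seq::new(&["foo", "bar"]);
    assert_eq!(Some(2), seq.len());

    // Collecting `Literal` values still works via `FromIterator<Literal>`,
    // which is the way to control exactness per literal.
    let lits: Seq =
        [Literal::exact("foo"), Literal::inexact("ba")].into_iter().collect();
    assert_eq!(Some(2), lits.len());

    // The new `AsRef<[u8]>` impl on `Literal` hands out the underlying
    // bytes without an extra allocation or copy.
    let lit = Literal::exact("quux");
    let bytes: &[u8] = lit.as_ref();
    assert_eq!(&b"quux"[..], bytes);
}
```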
@@ -327,7 +358,7 @@ impl Extractor { /// let hir = Parser::new().parse(r"[ab]{2}{2}")?; /// /// let got = Extractor::new().extract(&hir); - /// let expected = Seq::from_iter([ + /// let expected = Seq::new([ /// "aaaa", "aaab", "aaba", "aabb", /// "abaa", "abab", "abba", "abbb", /// "baaa", "baab", "baba", "babb", @@ -358,37 +389,6 @@ impl Extractor { self } - /// Execute the extractor and return a sequence of literals. - pub fn extract(&self, hir: &Hir) -> Seq { - use crate::hir::HirKind::*; - - match *hir.kind() { - Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), - Literal(hir::Literal(ref bytes)) => { - let mut seq = - Seq::singleton(self::Literal::exact(bytes.to_vec())); - self.enforce_literal_len(&mut seq); - seq - } - Class(hir::Class::Unicode(ref cls)) => { - self.extract_class_unicode(cls) - } - Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), - Repetition(ref rep) => self.extract_repetition(rep), - Group(hir::Group { ref hir, .. }) => self.extract(hir), - Concat(ref hirs) => match self.kind { - ExtractKind::Prefix => self.extract_concat(hirs.iter()), - ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()), - }, - Alternation(ref hirs) => { - // Unlike concat, we always union starting from the beginning, - // since the beginning corresponds to the highest preference, - // which doesn't change based on forwards vs reverse. - self.extract_alternation(hirs.iter()) - } - } - } - /// Extract a sequence from the given concatenation. Sequences from each of /// the child HIR expressions are combined via cross product. /// @@ -682,7 +682,7 @@ impl Default for ExtractKind { /// ``` /// use regex_syntax::hir::literal::{Literal, Seq}; /// -/// let mut seq = Seq::from_iter(&[ +/// let mut seq = Seq::new(&[ /// "farm", /// "appliance", /// "faraway", @@ -729,12 +729,6 @@ impl Seq { Seq { literals: Some(vec![]) } } - /// Returns a sequence containing a single literal. - #[inline] - pub fn singleton(lit: Literal) -> Seq { - Seq { literals: Some(vec![lit]) } - } - /// Returns a sequence of literals without a finite size and may contain /// any literal. /// @@ -758,6 +752,22 @@ impl Seq { Seq { literals: None } } + /// Returns a sequence containing a single literal. + #[inline] + pub fn singleton(lit: Literal) -> Seq { + Seq { literals: Some(vec![lit]) } + } + + /// Returns a sequence of exact literals from the given byte strings. + #[inline] + pub fn new(it: I) -> Seq + where + I: IntoIterator, + B: AsRef<[u8]>, + { + it.into_iter().map(|b| Literal::exact(b.as_ref())).collect() + } + /// If this is a finite sequence, return its members as a slice of /// literals. /// @@ -1152,15 +1162,15 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::Seq; /// - /// let mut seq1 = Seq::from_iter(&["foo", "bar"]); - /// let mut seq2 = Seq::from_iter(&["bar", "quux", "foo"]); + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); /// seq1.union(&mut seq2); /// /// // The literals are pulled out of seq2. /// assert_eq!(Some(0), seq2.len()); /// /// // Adjacent literals are deduped, but non-adjacent literals may not be. - /// assert_eq!(Seq::from_iter(&["foo", "bar", "quux", "foo"]), seq1); + /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1); /// ``` /// /// This example shows that literals are drained from `other` even when @@ -1173,7 +1183,7 @@ impl Seq { /// // Infinite sequences have no finite length. 
/// assert_eq!(None, seq1.len()); /// - /// let mut seq2 = Seq::from_iter(&["bar", "quux", "foo"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); /// seq1.union(&mut seq2); /// /// // seq1 is still infinite and seq2 has been drained. @@ -1223,14 +1233,14 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::Seq; /// - /// let mut seq1 = Seq::from_iter(&["a", "", "f", ""]); - /// let mut seq2 = Seq::from_iter(&["foo"]); + /// let mut seq1 = Seq::new(&["a", "", "f", ""]); + /// let mut seq2 = Seq::new(&["foo"]); /// seq1.union_into_empty(&mut seq2); /// /// // The literals are pulled out of seq2. /// assert_eq!(Some(0), seq2.len()); /// // 'foo' gets spliced into seq1 where the first empty string occurs. - /// assert_eq!(Seq::from_iter(&["a", "foo", "f"]), seq1); + /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1); /// ``` /// /// This example shows that literals are drained from `other` even when @@ -1239,12 +1249,12 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::Seq; /// - /// let mut seq1 = Seq::from_iter(&["foo", "bar"]); - /// let mut seq2 = Seq::from_iter(&["bar", "quux", "foo"]); + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); /// seq1.union_into_empty(&mut seq2); /// /// // seq1 has no zero length literals, so no splicing happens. - /// assert_eq!(Seq::from_iter(&["foo", "bar"]), seq1); + /// assert_eq!(Seq::new(&["foo", "bar"]), seq1); /// // Even though no splicing happens, seq2 is still drained. /// assert_eq!(Some(0), seq2.len()); /// ``` @@ -1334,10 +1344,10 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::Seq; /// - /// let mut seq = Seq::from_iter(&["foo", "quux", "bar"]); + /// let mut seq = Seq::new(&["foo", "quux", "bar"]); /// seq.sort(); /// - /// assert_eq!(Seq::from_iter(&["bar", "foo", "quux"]), seq); + /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq); /// ``` #[inline] pub fn sort(&mut self) { @@ -1357,9 +1367,9 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::Seq; /// - /// let mut seq = Seq::from_iter(&["oof", "rab"]); + /// let mut seq = Seq::new(&["oof", "rab"]); /// seq.reverse_literals(); - /// assert_eq!(Seq::from_iter(&["foo", "bar"]), seq); + /// assert_eq!(Seq::new(&["foo", "bar"]), seq); /// ``` #[inline] pub fn reverse_literals(&mut self) { @@ -1390,15 +1400,15 @@ impl Seq { /// /// // If 'sam' comes before 'samwise' and a preference order search is /// // executed, then 'samwise' can never match. - /// let mut seq = Seq::from_iter(&["sam", "samwise"]); + /// let mut seq = Seq::new(&["sam", "samwise"]); /// seq.minimize_by_preference(); /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq); /// /// // But if they are reversed, then it's possible for 'samwise' to match /// // since it is given higher preference. - /// let mut seq = Seq::from_iter(&["samwise", "sam"]); + /// let mut seq = Seq::new(&["samwise", "sam"]); /// seq.minimize_by_preference(); - /// assert_eq!(Seq::from_iter(&["samwise", "sam"]), seq); + /// assert_eq!(Seq::new(&["samwise", "sam"]), seq); /// ``` /// /// This example shows that if an empty string is in this seq, then @@ -1409,7 +1419,7 @@ impl Seq { /// /// // An empty string is a prefix of all strings, so it automatically /// // inhibits any subsequent strings from matching. 
- /// let mut seq = Seq::from_iter(&["foo", "bar", "", "quux", "fox"]); + /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]); /// seq.minimize_by_preference(); /// let expected = Seq::from_iter([ /// Literal::exact("foo"), @@ -1420,7 +1430,7 @@ impl Seq { /// /// // And of course, if it's at the beginning, then it makes it impossible /// // for anything else to match. - /// let mut seq = Seq::from_iter(&["", "foo", "quux", "fox"]); + /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]); /// seq.minimize_by_preference(); /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq); /// ``` @@ -1440,7 +1450,7 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::{Literal, Seq}; /// - /// let mut seq = Seq::from_iter(&["a", "foo", "quux"]); + /// let mut seq = Seq::new(&["a", "foo", "quux"]); /// seq.keep_first_bytes(2); /// /// let expected = Seq::from_iter([ @@ -1468,7 +1478,7 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::{Literal, Seq}; /// - /// let mut seq = Seq::from_iter(&["a", "foo", "quux"]); + /// let mut seq = Seq::new(&["a", "foo", "quux"]); /// seq.keep_last_bytes(2); /// /// let expected = Seq::from_iter([ @@ -1576,13 +1586,13 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::Seq; /// - /// let seq = Seq::from_iter(&["foo", "foobar", "fo"]); + /// let seq = Seq::new(&["foo", "foobar", "fo"]); /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix()); - /// let seq = Seq::from_iter(&["foo", "foo"]); + /// let seq = Seq::new(&["foo", "foo"]); /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix()); - /// let seq = Seq::from_iter(&["foo", "bar"]); + /// let seq = Seq::new(&["foo", "bar"]); /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); - /// let seq = Seq::from_iter(&[""]); + /// let seq = Seq::new(&[""]); /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); /// /// let seq = Seq::infinite(); @@ -1629,13 +1639,13 @@ impl Seq { /// ``` /// use regex_syntax::hir::literal::Seq; /// - /// let seq = Seq::from_iter(&["oof", "raboof", "of"]); + /// let seq = Seq::new(&["oof", "raboof", "of"]); /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix()); - /// let seq = Seq::from_iter(&["foo", "foo"]); + /// let seq = Seq::new(&["foo", "foo"]); /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix()); - /// let seq = Seq::from_iter(&["foo", "bar"]); + /// let seq = Seq::new(&["foo", "bar"]); /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); - /// let seq = Seq::from_iter(&[""]); + /// let seq = Seq::new(&[""]); /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); /// /// let seq = Seq::infinite(); @@ -1670,24 +1680,16 @@ impl Seq { } Some(&base[base.len() - len..]) } -} -impl FromIterator for Seq { - fn from_iter>(it: T) -> Seq { - let mut seq = Seq::empty(); - for literal in it { - seq.push(literal); } - seq } } -/// Creates a sequence of exact literals from an iterator of byte strings. 
-impl> FromIterator for Seq { - fn from_iter>(it: T) -> Seq { +impl FromIterator for Seq { + fn from_iter>(it: T) -> Seq { let mut seq = Seq::empty(); for literal in it { - seq.push(Literal::exact(literal.as_ref())); + seq.push(literal); } seq } @@ -1822,6 +1824,12 @@ impl From for Literal { } } +impl AsRef<[u8]> for Literal { + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + impl core::fmt::Debug for Literal { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let tag = if self.exact { "E" } else { "I" }; @@ -1995,7 +2003,7 @@ mod tests { } fn exact, I: IntoIterator>(it: I) -> (Seq, Seq) { - let s1 = Seq::from_iter(it); + let s1 = Seq::new(it); let s2 = s1.clone(); (s1, s2) } From 7a7522281a2f37f200efee43da1e3592692f2f5f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Jan 2023 14:09:11 -0500 Subject: [PATCH 47/79] syntax: add 'optimize' routines to 'hir::literal::Seq' Their docs explain their utility. In the old literal extraction setup, some (but not all) of this "optimization" was somewhat baked into the extraction itself, but now we codify it a bit more explicitly. --- regex-syntax/src/hir/literal.rs | 356 +++++++++++++++++++++++++++++++- regex-syntax/src/hir/mod.rs | 154 +++++++++++++- regex-syntax/src/lib.rs | 1 + regex-syntax/src/rank.rs | 258 +++++++++++++++++++++++ 4 files changed, 755 insertions(+), 14 deletions(-) create mode 100644 regex-syntax/src/rank.rs diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 05278b5a9..9aba79664 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -704,7 +704,7 @@ impl Default for ExtractKind { /// ]); /// assert_eq!(expected, seq); /// ``` -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Eq, PartialEq)] pub struct Seq { /// The members of this seq. /// @@ -1437,7 +1437,7 @@ impl Seq { #[inline] pub fn minimize_by_preference(&mut self) { if let Some(ref mut lits) = self.literals { - PreferenceTrie::minimize(lits); + PreferenceTrie::minimize(lits, false); } } @@ -1681,6 +1681,274 @@ impl Seq { Some(&base[base.len() - len..]) } + /// Optimizes this seq while treating its literals as prefixes and + /// respecting the preference order of its literals. + /// + /// The specific way "optimization" works is meant to be an implementation + /// detail, as it essentially represents a set of heuristics. The goal + /// that optimization tries to accomplish is to make the literals in this + /// set reflect inputs that will result in a more effective prefilter. + /// Principally by reducing the false positive rate of candidates found by + /// the literals in this sequence. That is, when a match of a literal is + /// found, we would like it to be a strong predictor of the overall match + /// of the regex. If it isn't, then much time will be spent starting and + /// stopping the prefilter search and attempting to confirm the match only + /// to have it fail. + /// + /// Some of those heuristics might be: + /// + /// * Identifying a common prefix from a larger sequence of literals, and + /// shrinking the sequence down to that single common prefix. + /// * Rejecting the sequence entirely if it is believed to result in very + /// high false positive rate. When this happens, the sequence is made + /// infinite. + /// * Shrinking the sequence to a smaller number of literals representing + /// prefixes, but not shrinking it so much as to make literals too short. 
+ /// (A sequence with very short literals, of 1 or 2 bytes, will typically + /// result in a higher false positive rate.) + /// + /// Optimization should only be run once extraction is complete. Namely, + /// optimization may make assumptions that do not compose with other + /// operations in the middle of extraction. For example, optimization will + /// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation + /// is only valid if no other extraction will occur. If other extraction + /// may occur, then the correct transformation would be to `[I(sam)]`. + /// + /// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but + /// for suffixes. + /// + /// # Example + /// + /// This shows how optimization might transform a sequence. Note that + /// the specific behavior is not a documented guarantee. The heuristics + /// used are an implementation detail and may change over time in semver + /// compatible releases. + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert_eq!(Seq::from_iter([ + /// Literal::exact("samantha"), + /// // Kept exact even though 'samwise' got pruned + /// // because optimization assumes literal extraction + /// // has finished. + /// Literal::exact("sam"), + /// Literal::exact("frodo"), + /// ]), seq); + /// ``` + /// + /// # Example: optimization may make the sequence infinite + /// + /// If the heuristics deem that the sequence could cause a very high false + /// positive rate, then it may make the sequence infinite, effectively + /// disabling its use as a prefilter. + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// // An empty string matches at every position, + /// // thus rendering the prefilter completely + /// // ineffective. + /// "", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(!seq.is_finite()); + /// ``` + /// + /// Do note that just because there is a `" "` in the sequence, that + /// doesn't mean the sequence will always be made infinite after it is + /// optimized. Namely, if the sequence is considered exact (any match + /// corresponds to an overall match of the original regex), then any match + /// is an overall match, and so the false positive rate is always `0`. + /// + /// To demonstrate this, we remove `samwise` from our sequence. This + /// results in no optimization happening and all literals remain exact. + /// Thus the entire sequence is exact, and it is kept as-is, even though + /// one is an ASCII space: + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// " ", + /// "sam", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(seq.is_finite()); + /// ``` + #[inline] + pub fn optimize_for_prefix_by_preference(&mut self) { + self.optimize_by_preference(true); + } + + /// Optimizes this seq while treating its literals as suffixes and + /// respecting the preference order of its literals. + /// + /// Optimization should only be run once extraction is complete. + /// + /// The [`Seq::optimize_for_prefix_by_preference`] does the same thing, but + /// for prefixes. See its documentation for more explanation. 
+ #[inline] + pub fn optimize_for_suffix_by_preference(&mut self) { + self.optimize_by_preference(false); + } + + fn optimize_by_preference(&mut self, prefix: bool) { + let origlen = match self.len() { + None => return, + Some(len) => len, + }; + // Make sure we start with the smallest sequence possible. We use a + // special version of preference minimization that retains exactness. + // This is legal because optimization is only expected to occur once + // extraction is complete. + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } + + // Look for a common prefix (or suffix). If we found one of those and + // it's long enough, then it's a good bet that it will be our fastest + // possible prefilter since single-substring search is so fast. + let fix = if prefix { + self.longest_common_prefix() + } else { + self.longest_common_suffix() + }; + if let Some(fix) = fix { + // As a special case, if we have a common prefix and the leading + // byte of that prefix is one that we think probably occurs rarely, + // then strip everything down to just that single byte. This should + // promote the use of memchr. + // + // ... we only do this though if our sequence has more than one + // literal. Otherwise, we'd rather just stick with a single literal + // scan. That is, using memchr is probably better than looking + // for 2 or more literals, but probably not as good as a straight + // memmem search. + // + // ... and also only do this when the prefix is short and probably + // not too discriminatory anyway. If it's longer, then it's + // probably quite discriminatory and thus is likely to have a low + // false positive rate. + if prefix + && origlen > 1 + && fix.len() >= 1 + && fix.len() <= 3 + && rank(fix[0]) < 200 + { + self.keep_first_bytes(1); + self.dedup(); + return; + } + // We only strip down to the common prefix/suffix if we think + // the existing set of literals isn't great, or if the common + // prefix/suffix is expected to be particularly discriminatory. + let isfast = + self.is_exact() && self.len().map_or(false, |len| len <= 16); + let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast); + if usefix { + // If we keep exactly the number of bytes equal to the length + // of the prefix (or suffix), then by the definition of a + // prefix, every literal in the sequence will be equivalent. + // Thus, 'dedup' will leave us with one literal. + // + // We do it this way to avoid an alloc, but also to make sure + // the exactness of literals is kept (or not). + if prefix { + self.keep_first_bytes(fix.len()); + } else { + self.keep_last_bytes(fix.len()); + } + self.dedup(); + assert_eq!(Some(1), self.len()); + // We still fall through here. In particular, we want our + // longest common prefix to be subject to the poison check. + } + } + // Everything below this check is more-or-less about trying to + // heuristically reduce the false positive rate of a prefilter. But + // if our sequence is completely exact, then it's possible the regex + // engine can be skipped entirely. In this case, the false positive + // rate is zero because every literal match corresponds to a regex + // match. + // + // This is OK even if the sequence contains a poison literal. Remember, + // a literal is only poisononous because of what we assume about its + // impact on the false positive rate. However, we do still check for + // an empty string. Empty strings are weird and it's best to let the + // regex engine handle those. 
+ // + // We do currently do this check after the longest common prefix (or + // suffix) check, under the theory that single-substring search is so + // fast that we want that even if we'd end up turning an exact sequence + // into an inexact one. But this might be wrong... + if self.is_exact() + && self.min_literal_len().map_or(false, |len| len > 0) + { + return; + } + // Now we attempt to shorten the sequence. The idea here is that we + // don't want to look for too many literals, but we want to shorten + // our sequence enough to improve our odds of using better algorithms + // downstream (such as Teddy). + const ATTEMPTS: [(usize, usize); 5] = + [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)]; + for (keep, limit) in ATTEMPTS { + let len = match self.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + if prefix { + self.keep_first_bytes(keep); + } else { + self.keep_last_bytes(keep); + } + self.minimize_by_preference(); + } + // Check for a poison literal. A poison literal is one that is short + // and is believed to have a very high match count. These poisons + // generally lead to a prefilter with a very high false positive rate, + // and thus overall worse performance. + // + // We do this last because we could have gone from a non-poisonous + // sequence to a poisonous one. Perhaps we should add some code to + // prevent such transitions in the first place, but then again, we + // likely only made the transition in the first place if the sequence + // was itself huge. And huge sequences are themselves poisonous. So... + if let Some(lits) = self.literals() { + if lits.iter().any(|lit| lit.is_poisonous()) { + self.make_infinite(); + } + } + } +} + +impl core::fmt::Debug for Seq { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "Seq")?; + if let Some(lits) = self.literals() { + f.debug_list().entries(lits.iter()).finish() + } else { + write!(f, "[∅]") } } } @@ -1809,6 +2077,12 @@ impl Literal { self.make_inexact(); self.bytes.drain(..self.len() - len); } + + /// Returns true if it is believe that this literal is likely to match very + /// frequently, and is thus not a good candidate for a prefilter. + fn is_poisonous(&self) -> bool { + self.is_empty() || (self.len() == 1 && rank(self.as_bytes()[0]) >= 250) + } } impl From for Literal { @@ -1854,7 +2128,9 @@ impl core::fmt::Debug for Literal { /// the "minimal" sequence, we simply only keep literals that were successfully /// inserted. (Since we don't need traversal, one wonders whether we can make /// some simplifications here, but I haven't given it a ton of thought and I've -/// never seen this show up on a profile.) +/// never seen this show up on a profile. Because of the heuristic limits +/// imposed on literal extractions, the size of the inputs here is usually +/// very small.) #[derive(Debug, Default)] struct PreferenceTrie { /// The states in this trie. The index of a state in this vector is its ID. @@ -1879,7 +2155,14 @@ struct State { impl PreferenceTrie { /// Minimizes the given sequence of literals while preserving preference /// order semantics. - fn minimize(literals: &mut Vec) { + /// + /// When `keep_exact` is true, the exactness of every literal retained is + /// kept. This is useful when dealing with a fully extracted `Seq` that + /// only contains exact literals. 
In that case, we can keep all retained + /// literals as exact because we know we'll never need to match anything + /// after them and because any removed literals are guaranteed to never + /// match. + fn minimize(literals: &mut Vec, keep_exact: bool) { use core::cell::RefCell; // MSRV(1.61): Use retain_mut here to avoid interior mutability. @@ -1889,7 +2172,9 @@ impl PreferenceTrie { match trie.borrow_mut().insert(lit.as_bytes()) { Ok(_) => true, Err(i) => { - make_inexact.push(i); + if !keep_exact { + make_inexact.push(i); + } false } } @@ -1952,6 +2237,17 @@ impl PreferenceTrie { } } +/// Returns the "rank" of the given byte. +/// +/// The minimum rank value is `0` and the maximum rank value is `255`. +/// +/// The rank of a byte is derived from a heuristic background distribution of +/// relative frequencies of bytes. The heuristic says that lower the rank of a +/// byte, the less likely that byte is to appear in any arbitrary haystack. +pub fn rank(byte: u8) -> u8 { + crate::rank::BYTE_FREQUENCIES[usize::from(byte)] +} + #[cfg(test)] mod tests { use super::*; @@ -2008,6 +2304,13 @@ mod tests { (s1, s2) } + fn opt, I: IntoIterator>(it: I) -> (Seq, Seq) { + let (mut p, mut s) = exact(it); + p.optimize_for_prefix_by_preference(); + s.optimize_for_suffix_by_preference(); + (p, s) + } + #[test] fn literal() { assert_eq!(exact(["a"]), e("a")); @@ -2122,6 +2425,16 @@ mod tests { // in reverse, and then [bc, ac, c] ordering is indeed correct from // that perspective. We also test a few more equivalent regexes, and // we get the same result, so it is consistent at least I suppose. + // + // The reason why this isn't an issue is that it only messes up + // preference order, and currently, suffixes are never used in a + // context where preference order matters. For prefixes it matters + // because we sometimes want to use prefilters without confirmation + // when all of the literals are exact (and there's no look-around). But + // we never do that for suffixes. Any time we use suffixes, we always + // include a confirmation step. If that ever changes, then it's likely + // this bug will need to be fixed, but last time I looked, it appears + // hard to do so. assert_eq!( inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), e(r"a*b*c") @@ -2772,4 +3085,37 @@ mod tests { assert!(!suffixes.is_finite()); assert_eq!(Some(247), prefixes.len()); } + + #[test] + fn optimize() { + // This gets a common prefix that isn't too short. + let (p, s) = + opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]); + assert_eq!(seq([I("foobar")]), p); + assert_eq!(seq([I("foobar")]), s); + + // This also finds a common prefix, but since it's only one byte, it + // prefers the multiple literals. + let (p, s) = opt(["abba", "akka", "abccba"]); + assert_eq!(exact(["abba", "akka", "abccba"]), (p, s)); + + let (p, s) = opt(["sam", "samwise"]); + assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s)); + + // The empty string is poisonous, so our seq becomes infinite, even + // though all literals are exact. + let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]); + assert!(!p.is_finite()); + assert!(!s.is_finite()); + + // A space is also poisonous, so our seq becomes infinite. But this + // only gets triggered when we don't have a completely exact sequence. + // When the sequence is exact, spaces are okay, since we presume that + // any prefilter will match a space more quickly than the regex engine. 
+ // (When the sequence is exact, there's a chance of the prefilter being + // used without needing the regex engine at all.) + let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]); + p.optimize_for_prefix_by_preference(); + assert!(!p.is_finite()); + } } diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ec273a922..0ccf6bac1 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -666,6 +666,36 @@ impl Class { /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or /// `4`. For empty classes, `None` is returned. It is impossible for `0` to /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// minimum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, Parser}; + /// + /// // The empty string has a min length of 0. + /// let hir = Parser::new().parse(r"")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = Parser::new().parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that can match the empty string but match more is still 0. + /// let hir = Parser::new().parse(r"a*")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that matches nothing has no minimum defined. + /// let hir = Parser::new().parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().minimum_len()); + /// // Character classes usually have a minimum length of 1. + /// let hir = Parser::new().parse(r"\w")?; + /// assert_eq!(Some(1), hir.properties().minimum_len()); + /// // But sometimes Unicode classes might be bigger! + /// let hir = Parser::new().parse(r"\p{Cyrillic}")?; + /// assert_eq!(Some(2), hir.properties().minimum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` pub fn minimum_len(&self) -> Option { match *self { Class::Unicode(ref x) => x.minimum_len(), @@ -680,6 +710,39 @@ impl Class { /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or /// `4`. For empty classes, `None` is returned. It is impossible for `0` to /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// maximum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, Parser}; + /// + /// // The empty string has a max length of 0. + /// let hir = Parser::new().parse(r"")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = Parser::new().parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // A regex that matches nothing has no maximum defined. + /// let hir = Parser::new().parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // Bounded repeats work as you expect. + /// let hir = Parser::new().parse(r"x{2,10}")?; + /// assert_eq!(Some(10), hir.properties().maximum_len()); + /// // An unbounded repeat means there is no maximum. + /// let hir = Parser::new().parse(r"x{2,}")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // With Unicode enabled, \w can match up to 4 bytes! + /// let hir = Parser::new().parse(r"\w")?; + /// assert_eq!(Some(4), hir.properties().maximum_len()); + /// // Without Unicode enabled, \w matches at most 1 byte. 
+ /// let hir = Parser::new().parse(r"(?-u)\w")?; + /// assert_eq!(Some(1), hir.properties().maximum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` pub fn maximum_len(&self) -> Option { match *self { Class::Unicode(ref x) => x.maximum_len(), @@ -1532,9 +1595,9 @@ impl Properties { /// the empty string is in the language described by this HIR. /// /// `None` is returned when there is no longest matching string. This - /// occurs when the HIR matches nothing or when there is no upper bound - /// on the length of matching strings. An example of such a regex is - /// `\P{any}`. + /// occurs when the HIR matches nothing or when there is no upper bound on + /// the length of matching strings. Example of such regexes are `\P{any}` + /// (matches nothing) and `a+` (has no upper bound). pub fn maximum_len(&self) -> Option { self.0.maximum_len } @@ -1624,6 +1687,68 @@ impl Properties { /// corresponding `Hir`. This routine provides a way of combining the /// properties of each `Hir` expression into one set of properties /// representing the union of those expressions. + /// + /// # Example: union with HIRs that never match + /// + /// This example shows that unioning properties together with one that + /// represents a regex that never matches will "poison" certain attributes, + /// like the minimum and maximum lengths. + /// + /// ``` + /// use regex_syntax::{hir::Properties, Parser}; + /// + /// let hir1 = Parser::new().parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = Parser::new().parse(r"[a&&b]")?; + /// assert_eq!(None, hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = Parser::new().parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(None, unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// The maximum length can also be "poisoned" by a pattern that has no + /// upper bound on the length of a match. The minimum length remains + /// unaffected: + /// + /// ``` + /// use regex_syntax::{hir::Properties, Parser}; + /// + /// let hir1 = Parser::new().parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = Parser::new().parse(r"a+")?; + /// assert_eq!(Some(1), hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = Parser::new().parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(Some(1), unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` pub fn union(props: I) -> Properties where I: IntoIterator, @@ -1654,6 +1779,7 @@ impl Properties { literal: false, alternation_literal: true, }; + let (mut min_poisoned, mut max_poisoned) = (false, false); // Handle properties that need to visit every child hir. 
for prop in it { let p = prop.borrow(); @@ -1665,14 +1791,24 @@ impl Properties { props.captures_len.saturating_add(p.captures_len()); props.alternation_literal = props.alternation_literal && p.is_alternation_literal(); - if let Some(xmin) = p.minimum_len() { - if props.minimum_len.map_or(true, |pmin| xmin < pmin) { - props.minimum_len = Some(xmin); + if !min_poisoned { + if let Some(xmin) = p.minimum_len() { + if props.minimum_len.map_or(true, |pmin| xmin < pmin) { + props.minimum_len = Some(xmin); + } + } else { + props.minimum_len = None; + min_poisoned = true; } } - if let Some(xmax) = p.maximum_len() { - if props.maximum_len.map_or(true, |pmax| xmax > pmax) { - props.maximum_len = Some(xmax); + if !max_poisoned { + if let Some(xmax) = p.maximum_len() { + if props.maximum_len.map_or(true, |pmax| xmax > pmax) { + props.maximum_len = Some(xmax); + } + } else { + props.maximum_len = None; + max_poisoned = true; } } } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index f44bba7f0..c56f9d1ff 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -183,6 +183,7 @@ mod either; mod error; pub mod hir; mod parser; +mod rank; mod unicode; mod unicode_tables; pub mod utf8; diff --git a/regex-syntax/src/rank.rs b/regex-syntax/src/rank.rs new file mode 100644 index 000000000..ccb25a20a --- /dev/null +++ b/regex-syntax/src/rank.rs @@ -0,0 +1,258 @@ +pub(crate) const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' 
+ 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; From c5754efc6c5d75e404d4c1b6fbe9284bd82ca6f9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 10 Jan 2023 21:10:53 -0500 Subject: [PATCH 48/79] syntax: rename 'allow_invalid_utf8' to 'utf8' This also inverts its meaning, i.e., utf8=!allow_invalid_utf8. This naming is consistent with the naming used in regex-automata. In general, I find that using names without negations in them to be clearer, since it avoids double negations. 
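To make the inversion concrete, here is a hedged before/after sketch at the `ParserBuilder` level (illustrative only and not part of the patch; the pattern is just one example of a regex that can match invalid UTF-8):

```
use regex_syntax::ParserBuilder;

fn main() {
    // Before: opt *in* to invalid UTF-8 with `allow_invalid_utf8(true)`.
    // After: opt *out* of the UTF-8 guarantee with `utf8(false)`.
    let hir = ParserBuilder::new()
        .utf8(false)
        .build()
        .parse(r"(?-u)[\x00-\xFF]")
        .unwrap();

    // With the default (`utf8` enabled), the same pattern is rejected,
    // since it can match byte sequences that are not valid UTF-8.
    assert!(ParserBuilder::new().build().parse(r"(?-u)[\x00-\xFF]").is_err());
    let _ = hir;
}
```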
--- regex-syntax/src/hir/literal.rs | 6 +--- regex-syntax/src/hir/mod.rs | 2 +- regex-syntax/src/hir/print.rs | 2 +- regex-syntax/src/hir/translate.rs | 55 ++++++++++++++++--------------- regex-syntax/src/parser.rs | 32 ++++++++++-------- src/exec.rs | 2 +- 6 files changed, 50 insertions(+), 49 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 9aba79664..436f08a02 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -2253,11 +2253,7 @@ mod tests { use super::*; fn parse(pattern: &str) -> Hir { - crate::ParserBuilder::new() - .allow_invalid_utf8(true) - .build() - .parse(pattern) - .unwrap() + crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap() } fn prefixes(pattern: &str) -> Seq { diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 0ccf6bac1..28f8ea662 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -71,7 +71,7 @@ pub enum ErrorKind { /// support is disabled. For example `(?-u:\pL)` would trigger this error. UnicodeNotAllowed, /// This error occurs when translating a pattern that could match a byte - /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. + /// sequence that isn't UTF-8 and `utf8` was enabled. InvalidUtf8, /// This occurs when an unrecognized Unicode property name could not /// be found. diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index d976ff668..63d78ad00 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -309,7 +309,7 @@ mod tests { } fn roundtrip_bytes(given: &str, expected: &str) { - roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected); + roundtrip_with(|b| b.utf8(false), given, expected); } fn roundtrip_with(mut f: F, given: &str, expected: &str) diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index b7608c064..a31998a6b 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -18,7 +18,7 @@ type Result = core::result::Result; /// A builder for constructing an AST->HIR translator. #[derive(Clone, Debug)] pub struct TranslatorBuilder { - allow_invalid_utf8: bool, + utf8: bool, flags: Flags, } @@ -31,10 +31,7 @@ impl Default for TranslatorBuilder { impl TranslatorBuilder { /// Create a new translator builder with a default c onfiguration. pub fn new() -> TranslatorBuilder { - TranslatorBuilder { - allow_invalid_utf8: false, - flags: Flags::default(), - } + TranslatorBuilder { utf8: true, flags: Flags::default() } } /// Build a translator using the current configuration. @@ -42,23 +39,27 @@ impl TranslatorBuilder { Translator { stack: RefCell::new(vec![]), flags: Cell::new(self.flags), - allow_invalid_utf8: self.allow_invalid_utf8, + utf8: self.utf8, } } - /// When enabled, translation will permit the construction of a regular + /// When disabled, translation will permit the construction of a regular /// expression that may match invalid UTF-8. /// - /// When disabled (the default), the translator is guaranteed to produce - /// an expression that will only ever match valid UTF-8 (otherwise, the - /// translator will return an error). + /// When enabled (the default), the translator is guaranteed to produce an + /// expression that, for non-empty matches, will only ever produce spans + /// that are entirely valid UTF-8 (otherwise, the translator will return an + /// error). 
/// - /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII - /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause - /// the parser to return an error. Namely, a negated ASCII word boundary - /// can result in matching positions that aren't valid UTF-8 boundaries. - pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { - self.allow_invalid_utf8 = yes; + /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even + /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete + /// syntax) will be allowed even though they can produce matches that split + /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" + /// matches, and it is expected that the regex engine itself must handle + /// these cases if necessary (perhaps by suppressing any zero-width matches + /// that split a codepoint). + pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.utf8 = yes; self } @@ -112,7 +113,7 @@ pub struct Translator { /// The current flag settings. flags: Cell, /// Whether we're allowed to produce HIR that can match arbitrary bytes. - allow_invalid_utf8: bool, + utf8: bool, } impl Translator { @@ -162,8 +163,8 @@ enum HirFrame { /// recursive structure). /// /// Byte character classes are created when Unicode mode (`u`) is disabled. - /// If `allow_invalid_utf8` is disabled (the default), then a byte - /// character is only permitted to match ASCII text. + /// If `utf8` is enabled (the default), then a byte character is only + /// permitted to match ASCII text. ClassBytes(hir::ClassBytes), /// This is pushed whenever a repetition is observed. After visiting every /// sub-expression in the repetition, the translator's stack is expected to @@ -805,7 +806,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if byte <= 0x7F { return Ok(Either::Left(char::try_from(byte).unwrap())); } - if !self.trans().allow_invalid_utf8 { + if self.trans().utf8 { return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); } Ok(Either::Right(byte)) @@ -856,7 +857,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } fn hir_dot(&self, span: Span) -> Result { - if !self.flags().unicode() && !self.trans().allow_invalid_utf8 { + if !self.flags().unicode() && self.trans().utf8 { return Err(self.error(span, ErrorKind::InvalidUtf8)); } Ok(Hir::dot(self.flags().dot())) @@ -890,7 +891,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { // It is possible for negated ASCII word boundaries to // match at invalid UTF-8 boundaries, even when searching // valid UTF-8. - if !self.trans().allow_invalid_utf8 { + if self.trans().utf8 { return Err( self.error(asst.span, ErrorKind::InvalidUtf8) ); @@ -1039,7 +1040,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { // Negating a Perl byte class is likely to cause it to match invalid // UTF-8. That's only OK if the translator is configured to allow such // things. 
- if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + if self.trans().utf8 && !class.is_all_ascii() { return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); } Ok(class) @@ -1107,7 +1108,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if negated { class.negate(); } - if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + if self.trans().utf8 && !class.is_all_ascii() { return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); } Ok(()) @@ -1313,7 +1314,7 @@ mod tests { fn t(pattern: &str) -> Hir { TranslatorBuilder::new() - .allow_invalid_utf8(false) + .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap() @@ -1321,7 +1322,7 @@ mod tests { fn t_err(pattern: &str) -> hir::Error { TranslatorBuilder::new() - .allow_invalid_utf8(false) + .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap_err() @@ -1329,7 +1330,7 @@ mod tests { fn t_bytes(pattern: &str) -> Hir { TranslatorBuilder::new() - .allow_invalid_utf8(true) + .utf8(false) .build() .translate(pattern, &parse(pattern)) .unwrap() diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 509ce3e15..8eb88e042 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -74,19 +74,23 @@ impl ParserBuilder { self } - /// When enabled, the parser will permit the construction of a regular + /// When disabled, translation will permit the construction of a regular /// expression that may match invalid UTF-8. /// - /// When disabled (the default), the parser is guaranteed to produce - /// an expression that will only ever match valid UTF-8 (otherwise, the - /// parser will return an error). - /// - /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII - /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause - /// the parser to return an error. Namely, a negated ASCII word boundary - /// can result in matching positions that aren't valid UTF-8 boundaries. - pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { - self.hir.allow_invalid_utf8(yes); + /// When enabled (the default), the translator is guaranteed to produce an + /// expression that, for non-empty matches, will only ever produce spans + /// that are entirely valid UTF-8 (otherwise, the translator will return an + /// error). + /// + /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even + /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete + /// syntax) will be allowed even though they can produce matches that split + /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" + /// matches, and it is expected that the regex engine itself must handle + /// these cases if necessary (perhaps by suppressing any zero-width matches + /// that split a codepoint). + pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.utf8(yes); self } @@ -144,9 +148,9 @@ impl ParserBuilder { /// By default this is **enabled**. It may alternatively be selectively /// disabled in the regular expression itself via the `u` flag. /// - /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by - /// default), a regular expression will fail to parse if Unicode mode is - /// disabled and a sub-expression could possibly match invalid UTF-8. + /// Note that unless `utf8` is disabled (it's enabled by default), a + /// regular expression will fail to parse if Unicode mode is disabled and a + /// sub-expression could possibly match invalid UTF-8. 
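A hedged illustration of that note (not part of the patch; the patterns are only examples, and the snippet assumes a build containing this change):

```
use regex_syntax::ParserBuilder;

fn main() {
    // `(?-u)\w` only matches ASCII word bytes, so it parses even with the
    // UTF-8 guarantee left enabled (the default).
    assert!(ParserBuilder::new().build().parse(r"(?-u)\w").is_ok());

    // `(?-u)\W` can match bytes above 0x7F, i.e. possibly invalid UTF-8,
    // so it fails to parse under the default configuration...
    assert!(ParserBuilder::new().build().parse(r"(?-u)\W").is_err());

    // ...but is accepted once the UTF-8 guarantee is relaxed.
    assert!(ParserBuilder::new()
        .utf8(false)
        .build()
        .parse(r"(?-u)\W")
        .is_ok());
}
```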
pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.unicode(yes); self diff --git a/src/exec.rs b/src/exec.rs index 194cf71fe..96792b1e7 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -246,7 +246,7 @@ impl ExecBuilder { .swap_greed(self.options.swap_greed) .ignore_whitespace(self.options.ignore_whitespace) .unicode(self.options.unicode) - .allow_invalid_utf8(!self.only_utf8) + .utf8(self.only_utf8) .nest_limit(self.options.nest_limit) .build(); let expr = From a0454c25fc38107ad914e0899748cd094faf670b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 24 Jan 2023 17:20:49 -0500 Subject: [PATCH 49/79] syntax: trim literal sequence if necessary On some occasions, it can make sense to trim the current literal sequences before doing a 'union' IF doing that union would cause the sequences to become infinite because of a blown limit. If we can keep literal extraction going by trimming things down, that's usually beneficial. For now, we just kind of guess that '3' is a good sweet spot for this. --- regex-syntax/src/hir/literal.rs | 54 +++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 436f08a02..4a892d837 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -583,7 +583,38 @@ impl Extractor { fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq { if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total) { - seq2.make_infinite(); + // We try to trim our literal sequences to see if we can make + // room for more literals. The idea is that we'd rather trim down + // literals already in our sequence if it means we can add a few + // more and retain a finite sequence. Otherwise, we'll union with + // an infinite sequence and that infects everything and effectively + // stops literal extraction in its tracks. + // + // We do we keep 4 bytes here? Well, it's a bit of an abstraction + // leakage. Downstream, the literals may wind up getting fed to + // the Teddy algorithm, which supports searching literals up to + // length 4. So that's why we pick that number here. Arguably this + // should be a tuneable parameter, but it seems a little tricky to + // describe. And I'm still unsure if this is the right way to go + // about culling literal sequences. + match self.kind { + ExtractKind::Prefix => { + seq1.keep_first_bytes(4); + seq2.keep_first_bytes(4); + } + ExtractKind::Suffix => { + seq1.keep_last_bytes(4); + seq2.keep_last_bytes(4); + } + } + seq1.dedup(); + seq2.dedup(); + if seq1 + .max_union_len(seq2) + .map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } } seq1.union(seq2); assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); @@ -2763,8 +2794,27 @@ mod tests { assert_eq!(expected, (prefixes, suffixes)); } + // This tests that we get some kind of literals extracted for a beefier + // alternation with case insensitive mode enabled. At one point during + // development, this returned nothing, and motivated some special case + // code in Extractor::union to try and trim down the literal sequences + // if the union would blow the limits set. 
+ #[test] + #[cfg(feature = "unicode-case")] + fn holmes_alt() { + let mut pre = + prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker"); + assert!(pre.len().unwrap() > 0); + pre.optimize_for_prefix_by_preference(); + assert!(pre.len().unwrap() > 0); + } + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 // See: CVE-2022-24713 + // + // We test this here to ensure literal extraction completes in reasonable + // time and isn't materially impacted by these sorts of pathological + // repeats. #[test] fn crazy_repeats() { assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}")); @@ -3079,7 +3129,7 @@ mod tests { // literal optimizations. let (prefixes, suffixes) = e(pat); assert!(!suffixes.is_finite()); - assert_eq!(Some(247), prefixes.len()); + assert_eq!(Some(243), prefixes.len()); } #[test] From 557f0ea85e613a34e7485878f8e2f1492f7061d0 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 9 Feb 2023 11:12:49 -0500 Subject: [PATCH 50/79] syntax: factor out common prefixes of alternations It is generally quite subtle to reason clearly about how this actually helps things in a finite automata based regex engine, but this sort of factoring can lead to lots of improvements: * We do use a bounded backtracker, so "pushing branches" down will help things there, just like it would with a classical backtracker. * It may lead to better literal extraction due to the simpler regex. Whether prefix factoring is really to blame here is somewhat unclear, but some downstream optimizations are more brittle than others. For example, the "reverse inner" optimization requires examining a "top level" concatenation to find literals to search for. By factoring out a common prefix, we potentially expand the number of regexes that have a top-level concat. For example, `\wfoo|\wbar` has no top-level concat but `\w(?:foo|bar)` does. * It should lead to faster matching even in finite automata oriented engines like the PikeVM, and also faster construction of DFAs (lazy or not). Namely, by pushing the branches down, we make it so they are visited less frequently, and thus the constant state shuffling caused by branches is reduced. The prefix extraction could be better, as mentioned in the comments, but this is a good start. --- regex-syntax/src/hir/mod.rs | 81 ++++++++++++++++++++++++++++--- regex-syntax/src/hir/translate.rs | 30 ++++++++++++ 2 files changed, 105 insertions(+), 6 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 28f8ea662..d70cad947 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -429,12 +429,14 @@ impl Hir { return new.pop().unwrap(); } // Now that it's completely flattened, look for the special case of - // 'char1|char2|...|charN' and collapse that into a class. Note that we - // look for 'char' first and then bytes. The issue here is that if we - // find both non-ASCII codepoints and non-ASCII singleton bytes, then - // it isn't actually possible to smush them into a single class. So we - // look for all chars and then all bytes, and don't handle anything - // else. + // 'char1|char2|...|charN' and collapse that into a class. Note that + // we look for 'char' first and then bytes. The issue here is that if + // we find both non-ASCII codepoints and non-ASCII singleton bytes, + // then it isn't actually possible to smush them into a single class. + // (Because classes are either "all codepoints" or "all bytes." 
You + // can have a class that both matches non-ASCII but valid UTF-8 and + // invalid UTF-8.) So we look for all chars and then all bytes, and + // don't handle anything else. if let Some(singletons) = singleton_chars(&new) { let it = singletons .into_iter() @@ -455,6 +457,14 @@ impl Hir { if let Some(cls) = class_bytes(&new) { return Hir::class(cls); } + // Factor out a common prefix if we can, which might potentially + // simplify the expression and unlock other optimizations downstream. + // It also might generally make NFA matching and DFA construction + // faster by reducing the scope of branching in the regex. + new = match lift_common_prefix(new) { + Ok(hir) => return hir, + Err(unchanged) => unchanged, + }; let props = Properties::alternation(&new); Hir { kind: HirKind::Alternation(new), props } } @@ -2251,6 +2261,65 @@ fn singleton_bytes(hirs: &[Hir]) -> Option> { Some(singletons) } +/// Looks for a common prefix in the list of alternation branches given. If one +/// is found, then an equivalent but (hopefully) simplified Hir is returned. +/// Otherwise, the original given list of branches is returned unmodified. +/// +/// This is not quite as good as it could be. Right now, it requires that +/// all branches are 'Concat' expressions. It also doesn't do well with +/// literals. For example, given 'foofoo|foobar', it will not refactor it to +/// 'foo(?:foo|bar)' because literals are flattened into their own special +/// concatenation. (One wonders if perhaps 'Literal' should be a single atom +/// instead of a string of bytes because of this. Otherwise, handling the +/// current representation in this routine will be pretty gnarly. Sigh.) +fn lift_common_prefix(hirs: Vec) -> Result> { + if hirs.len() <= 1 { + return Err(hirs); + } + let mut prefix = match hirs[0].kind() { + HirKind::Concat(ref xs) => &**xs, + _ => return Err(hirs), + }; + if prefix.is_empty() { + return Err(hirs); + } + for h in hirs.iter().skip(1) { + let concat = match h.kind() { + HirKind::Concat(ref xs) => xs, + _ => return Err(hirs), + }; + let common_len = prefix + .iter() + .zip(concat.iter()) + .take_while(|(x, y)| x == y) + .count(); + prefix = &prefix[..common_len]; + if prefix.is_empty() { + return Err(hirs); + } + } + let len = prefix.len(); + assert_ne!(0, len); + let mut prefix_concat = vec![]; + let mut suffix_alts = vec![]; + for h in hirs { + let mut concat = match h.into_kind() { + HirKind::Concat(xs) => xs, + // We required all sub-expressions to be + // concats above, so we're only here if we + // have a concat. + _ => unreachable!(), + }; + suffix_alts.push(Hir::concat(concat.split_off(len))); + if prefix_concat.is_empty() { + prefix_concat = concat; + } + } + let mut concat = prefix_concat; + concat.push(Hir::alternation(suffix_alts)); + Ok(Hir::concat(concat)) +} + #[cfg(test)] mod tests { use super::*; diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index a31998a6b..a787b79c4 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3428,5 +3428,35 @@ mod tests { t("a|b|c|d|e|f|x|y|z"), hir_uclass(&[('a', 'f'), ('x', 'z')]), ); + // Tests that we lift common prefixes out of an alternation. 
+ assert_eq!( + t("[A-Z]foo|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z][A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![Hir::empty(), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z]foo|[A-Z]foobar"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]), + ]), + ); } } From 541aa423fb3b67dced0e16d0b11f8ae4f0665afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20F=C3=A4rber?= <01mf02@gmail.com> Date: Thu, 9 Feb 2023 12:59:59 +0100 Subject: [PATCH 51/79] syntax: support `(?<` syntax for named groups It turns out that both '(?P...)' and '(?...)' are rather common among regex engines. There are several that support just one or the other. Until this commit, the regex crate only supported the former, along with both RE2, RE2/J and Go's regexp package. There are also several regex engines that only supported the latter, such as Onigmo, Onuguruma, Java, Ruby, Boost, .NET and Javascript. To decrease friction, and because there is somewhat little cost to doing so, we elect to support both. It looks like perhaps RE2 and Go's regexp package will go the same route, but it isn't fully decided yet: https://github.com/golang/go/issues/58458 Closes #955, Closes #956 --- regex-syntax/src/ast/mod.rs | 13 ++-- regex-syntax/src/ast/parse.rs | 103 ++++++++++++++++++++---------- regex-syntax/src/ast/print.rs | 8 ++- regex-syntax/src/hir/translate.rs | 4 +- src/lib.rs | 1 + 5 files changed, 87 insertions(+), 42 deletions(-) diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 7329fabbe..f36f27791 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -1162,7 +1162,7 @@ impl Group { /// Returns true if and only if this group is capturing. pub fn is_capturing(&self) -> bool { match self.kind { - GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, + GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true, GroupKind::NonCapturing(_) => false, } } @@ -1173,7 +1173,7 @@ impl Group { pub fn capture_index(&self) -> Option { match self.kind { GroupKind::CaptureIndex(i) => Some(i), - GroupKind::CaptureName(ref x) => Some(x.index), + GroupKind::CaptureName { ref name, .. } => Some(name.index), GroupKind::NonCapturing(_) => None, } } @@ -1184,8 +1184,13 @@ impl Group { pub enum GroupKind { /// `(a)` CaptureIndex(u32), - /// `(?Pa)` - CaptureName(CaptureName), + /// `(?a)` or `(?Pa)` + CaptureName { + /// True if the `?P<` syntax is used and false if the `?<` syntax is used. + starts_with_p: bool, + /// The capture name. 
+ name: CaptureName, + }, /// `(?:a)` and `(?i:a)` NonCapturing(Flags), } diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 48a0507e2..1d6d4d046 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -1202,12 +1202,16 @@ impl<'s, P: Borrow> ParserI<'s, P> { )); } let inner_span = self.span(); - if self.bump_if("?P<") { + let mut starts_with_p = true; + if self.bump_if("?P<") || { + starts_with_p = false; + self.bump_if("?<") + } { let capture_index = self.next_capture_index(open_span)?; - let cap = self.parse_capture_name(capture_index)?; + let name = self.parse_capture_name(capture_index)?; Ok(Either::Right(ast::Group { span: open_span, - kind: ast::GroupKind::CaptureName(cap), + kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::Empty(self.span())), })) } else if self.bump_if("?") { @@ -2800,11 +2804,14 @@ bar flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::Group(ast::Group { span: span_range(pat, 4..pat.len()), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span_range(pat, 9..12), - name: s("foo"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + } + }, ast: Box::new(lit_with('a', span_range(pat, 14..15))), }), ] @@ -3819,15 +3826,33 @@ bar #[test] fn parse_capture_name() { + assert_eq!( + parser("(?z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..7), + kind: ast::GroupKind::CaptureName { + starts_with_p: false, + name: ast::CaptureName { + span: span(3..4), + name: s("a"), + index: 1, + } + }, + ast: Box::new(lit('z', 5)), + })) + ); assert_eq!( parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..8), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..5), - name: s("a"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + } + }, ast: Box::new(lit('z', 6)), })) ); @@ -3835,11 +3860,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("abc"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3848,11 +3876,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("a_1"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a_1"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3861,11 +3892,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("a.1"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a.1"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3874,11 +3908,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..11), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..8), - name: s("a[1]"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { 
+ span: span(4..8), + name: s("a[1]"), + index: 1, + } + }, ast: Box::new(lit('z', 9)), })) ); diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index e6c000d57..0922ea0e3 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -160,9 +160,10 @@ impl Writer { use crate::ast::GroupKind::*; match ast.kind { CaptureIndex(_) => self.wtr.write_str("("), - CaptureName(ref x) => { - self.wtr.write_str("(?P<")?; - self.wtr.write_str(&x.name)?; + CaptureName { ref name, starts_with_p } => { + let start = if starts_with_p { "(?P<" } else { "(?<" }; + self.wtr.write_str(start)?; + self.wtr.write_str(&name.name)?; self.wtr.write_str(">")?; Ok(()) } @@ -505,6 +506,7 @@ mod tests { fn print_group() { roundtrip("(?i:a)"); roundtrip("(?Pa)"); + roundtrip("(?a)"); roundtrip("(a)"); } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index a787b79c4..b5bb41767 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -905,8 +905,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { let (index, name) = match group.kind { ast::GroupKind::CaptureIndex(index) => (index, None), - ast::GroupKind::CaptureName(ref cap) => { - (cap.index, Some(cap.name.clone().into_boxed_str())) + ast::GroupKind::CaptureName { ref name, .. } => { + (name.index, Some(name.name.clone().into_boxed_str())) } // The HIR doesn't need to use non-capturing groups, since the way // in which the data type is defined handles this automatically. diff --git a/src/lib.rs b/src/lib.rs index 6b95739c5..1de347861 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -361,6 +361,7 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
 (exp)          numbered capture group (indexed by opening parenthesis)
 (?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?<name>exp)   named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)

From 19b29cf66e2ed57347ae2df27ba29c97e3873697 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Fri, 20 Jan 2023 11:39:31 -0500
Subject: [PATCH 52/79] dfa: fix approximate cache size

Unbelievably, this was using the size of the compiled prog *and* the
heap memory used by the cache to compute the total memory used by the
cache. The effect is that the reported size could be much bigger than
what the cache actually uses, which in turn caused the lazy DFA to
thrash its cache and run quite slowly.
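
A rough sketch of why the inflated estimate causes thrashing (purely
illustrative; the function and names below are hypothetical, not the
crate's actual cache logic):

    // If the compiled program is large relative to the cache budget and is
    // (wrongly) counted against that budget, almost no room is left for DFA
    // states, so the cache gets cleared over and over again.
    fn over_budget(cache_heap_bytes: usize, _prog_bytes: usize, budget: usize) -> bool {
        // Buggy version: cache_heap_bytes + _prog_bytes > budget
        // Fixed version: only the cache's own heap usage counts.
        cache_heap_bytes > budget
    }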
---
 src/dfa.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dfa.rs b/src/dfa.rs
index dc9952120..78ed71021 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -1576,7 +1576,7 @@ impl<'a> Fsm<'a> {
     /// inputs, a new state could be created for every byte of input. (This is
     /// bad for memory use, so we bound it with a cache.)
     fn approximate_size(&self) -> usize {
-        self.cache.size + self.prog.approximate_size()
+        self.cache.size
     }
 }
 

From ca03c73e1ed3debd83ebef108c88a1a2ef421561 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Fri, 6 Jan 2023 14:18:17 -0500
Subject: [PATCH 53/79] impl: switch to aho-corasick 1.0

This is a transitory commit that will need to be updated once
aho-corasick 1.0 is actually released. Its purpose is to make the
regex crate, the "old" regex crate and regex-automata all agree on the
same version of aho-corasick while this work is in development.
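
For reference, a minimal standalone example of the 1.0-style API that
this migrates to (illustrative only, not part of the patch; the patterns
and haystack are made up):

    use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind};

    fn main() {
        // AhoCorasick::builder() replaces AhoCorasickBuilder::new(), and a
        // DFA is requested via `kind` rather than the old `dfa(true)`.
        let ac = AhoCorasick::builder()
            .match_kind(MatchKind::LeftmostFirst)
            .kind(Some(AhoCorasickKind::DFA))
            .build(&["Sherlock", "Holmes"])
            .unwrap();
        assert_eq!(2, ac.patterns_len());
        // `find` returns the leftmost-first match; here that's "Sherlock".
        let m = ac.find("searching for Sherlock Holmes").unwrap();
        assert_eq!(0, m.pattern().as_usize());
    }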
---
 Cargo.toml         |  2 +-
 src/exec.rs        | 11 +++++------
 src/literal/imp.rs | 16 ++++++++--------
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 1e6ec664d..f4c70aa1a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,7 +107,7 @@ pattern = []
 
 # For very fast prefix literal matching.
 [dependencies.aho-corasick]
-version = "0.7.18"
+version = "1.0.0"
 optional = true
 
 # For skipping along search text quickly when a leading byte is known.
diff --git a/src/exec.rs b/src/exec.rs
index 96792b1e7..b183d24d1 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -4,7 +4,7 @@ use std::panic::AssertUnwindSafe;
 use std::sync::Arc;
 
 #[cfg(feature = "perf-literal")]
-use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+use aho_corasick::{AhoCorasick, MatchKind};
 use regex_syntax::hir::literal;
 use regex_syntax::hir::{Hir, Look};
 use regex_syntax::ParserBuilder;
@@ -101,7 +101,7 @@ struct ExecReadOnly {
     /// if we were to exhaust the ID space, we probably would have long
     /// surpassed the compilation size limit.
     #[cfg(feature = "perf-literal")]
-    ac: Option<AhoCorasick<u32>>,
+    ac: Option<AhoCorasick>,
     /// match_type encodes as much upfront knowledge about how we're going to
     /// execute a search as possible.
     match_type: MatchType,
@@ -395,7 +395,7 @@ impl ExecBuilder {
     }
 
     #[cfg(feature = "perf-literal")]
-    fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
+    fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> {
         if parsed.exprs.len() != 1 {
             return None;
         }
@@ -409,10 +409,9 @@ impl ExecBuilder {
             return None;
         }
         Some(
-            AhoCorasickBuilder::new()
+            AhoCorasick::builder()
                 .match_kind(MatchKind::LeftmostFirst)
-                .auto_configure(&lits)
-                .build_with_size::<u32>(&lits)
+                .build(&lits)
                 // This should never happen because we'd long exceed the
                 // compilation limit for regexes first.
                 .expect("AC automaton too big"),
diff --git a/src/literal/imp.rs b/src/literal/imp.rs
index 89fddca0e..75fa6e37b 100644
--- a/src/literal/imp.rs
+++ b/src/literal/imp.rs
@@ -1,6 +1,6 @@
 use std::mem;
 
-use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
+use aho_corasick::{self, packed, AhoCorasick};
 use memchr::{memchr, memchr2, memchr3, memmem};
 use regex_syntax::hir::literal::{Literal, Seq};
 
@@ -26,7 +26,7 @@ enum Matcher {
     /// A single substring, using vector accelerated routines when available.
     Memmem(Memmem),
     /// An Aho-Corasick automaton.
-    AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
+    AC { ac: AhoCorasick, lits: Vec<Literal> },
     /// A packed multiple substring searcher, using SIMD.
     ///
     /// Note that Aho-Corasick will actually use this packed searcher
@@ -149,7 +149,7 @@ impl LiteralSearcher {
             Empty => 0,
             Bytes(ref sset) => sset.dense.len(),
             Memmem(_) => 1,
-            AC { ref ac, .. } => ac.pattern_count(),
+            AC { ref ac, .. } => ac.patterns_len(),
             Packed { ref lits, .. } => lits.len(),
         }
     }
@@ -161,8 +161,8 @@ impl LiteralSearcher {
             Empty => 0,
             Bytes(ref sset) => sset.approximate_size(),
             Memmem(ref single) => single.approximate_size(),
-            AC { ref ac, .. } => ac.heap_bytes(),
-            Packed { ref s, .. } => s.heap_bytes(),
+            AC { ref ac, .. } => ac.memory_usage(),
+            Packed { ref s, .. } => s.memory_usage(),
         }
     }
 }
@@ -212,10 +212,10 @@ impl Matcher {
                 return Matcher::Packed { s, lits: lits.to_owned() };
             }
         }
-        let ac = AhoCorasickBuilder::new()
+        let ac = AhoCorasick::builder()
             .match_kind(aho_corasick::MatchKind::LeftmostFirst)
-            .dfa(true)
-            .build_with_size::<u32>(&pats)
+            .kind(Some(aho_corasick::AhoCorasickKind::DFA))
+            .build(&pats)
             .unwrap();
         Matcher::AC { ac, lits: lits.to_owned() }
     }

From 99d8436a4e526e20d0677a8268fc885cab21cbab Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Tue, 28 Feb 2023 17:09:01 -0500
Subject: [PATCH 54/79] syntax: rename 'Group' to 'Capture'

Now that it *only* represents a capturing group, it makes sense to give
it a more specific name.
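
As a small illustration of the renamed type, here is the HIR for `(a)`
built with the constructors this diff introduces (a sketch, not code
added by the patch):

    use regex_syntax::hir::{self, Hir};

    fn capture_a() -> Hir {
        // A capturing group is now an explicit `Capture`. (The `hir` field
        // is renamed to `sub` in a follow-up commit.)
        Hir::capture(hir::Capture {
            index: 1,
            name: None,
            hir: Box::new(Hir::literal("a".as_bytes())),
        })
    }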
---
 regex-syntax/src/hir/literal.rs   |  2 +-
 regex-syntax/src/hir/mod.rs       | 36 ++++++------
 regex-syntax/src/hir/print.rs     |  4 +-
 regex-syntax/src/hir/translate.rs | 94 ++++++++++++++++---------------
 regex-syntax/src/hir/visitor.rs   | 10 ++--
 src/compile.rs                    |  2 +-
 6 files changed, 75 insertions(+), 73 deletions(-)

diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs
index 4a892d837..2fbcdcff2 100644
--- a/regex-syntax/src/hir/literal.rs
+++ b/regex-syntax/src/hir/literal.rs
@@ -186,7 +186,7 @@ impl Extractor {
             }
             Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls),
             Repetition(ref rep) => self.extract_repetition(rep),
-            Group(hir::Group { ref hir, .. }) => self.extract(hir),
+            Capture(hir::Capture { ref hir, .. }) => self.extract(hir),
             Concat(ref hirs) => match self.kind {
                 ExtractKind::Prefix => self.extract_concat(hirs.iter()),
                 ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()),
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index d70cad947..7be604a66 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -185,8 +185,8 @@ pub enum HirKind {
     Look(Look),
     /// A repetition operation applied to a child expression.
     Repetition(Repetition),
-    /// A possibly capturing group, which contains a child expression.
-    Group(Group),
+    /// A capturing group, which contains a child expression.
+    Capture(Capture),
     /// A concatenation of expressions. A concatenation always has at least two
     /// child expressions.
     ///
@@ -329,11 +329,11 @@ impl Hir {
         Hir { kind: HirKind::Repetition(rep), props }
     }
 
-    /// Creates a group HIR expression.
+    /// Creates a capture HIR expression.
     #[inline]
-    pub fn group(group: Group) -> Hir {
-        let props = Properties::group(&group);
-        Hir { kind: HirKind::Group(group), props }
+    pub fn capture(capture: Capture) -> Hir {
+        let props = Properties::capture(&capture);
+        Hir { kind: HirKind::Capture(capture), props }
     }
 
     /// Returns the concatenation of the given expressions.
@@ -529,7 +529,7 @@ impl HirKind {
             | HirKind::Literal(_)
             | HirKind::Class(_)
             | HirKind::Look(_) => false,
-            HirKind::Group(_)
+            HirKind::Capture(_)
             | HirKind::Repetition(_)
             | HirKind::Concat(_)
             | HirKind::Alternation(_) => true,
@@ -1431,10 +1431,10 @@ impl Look {
 /// in a `Hir`. Instead, non-capturing grouping is handled automatically by
 /// the recursive structure of the `Hir` itself.
 #[derive(Clone, Debug, Eq, PartialEq)]
-pub struct Group {
-    /// The capture index of the group.
+pub struct Capture {
+    /// The capture index of the capture.
     pub index: u32,
-    /// The name of the group, if it exists.
+    /// The name of the capture, if it exists.
     pub name: Option<Box<str>>,
     /// The expression inside the capturing group, which may be empty.
     pub hir: Box<Hir>,
@@ -1523,7 +1523,7 @@ impl Drop for Hir {
             | HirKind::Literal(_)
             | HirKind::Class(_)
             | HirKind::Look(_) => return,
-            HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
+            HirKind::Capture(ref x) if !x.hir.kind.has_subexprs() => return,
             HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
             HirKind::Concat(ref x) if x.is_empty() => return,
             HirKind::Alternation(ref x) if x.is_empty() => return,
@@ -1537,7 +1537,7 @@ impl Drop for Hir {
                 | HirKind::Literal(_)
                 | HirKind::Class(_)
                 | HirKind::Look(_) => {}
-                HirKind::Group(ref mut x) => {
+                HirKind::Capture(ref mut x) => {
                     stack.push(mem::replace(&mut x.hir, Hir::empty()));
                 }
                 HirKind::Repetition(ref mut x) => {
@@ -1955,13 +1955,9 @@ impl Properties {
         Properties(Box::new(inner))
     }
 
-    /// Create a new set of HIR properties for a group.
-    fn group(group: &Group) -> Properties {
-        // FIXME: Groups really should always have the same properties as
-        // their child expressions. But the literal properties somewhat
-        // over-constrained in what they represent in order to make downstream
-        // analyses a bit more straight-forward.
-        let p = group.hir.properties();
+    /// Create a new set of HIR properties for a capture.
+    fn capture(capture: &Capture) -> Properties {
+        let p = capture.hir.properties();
         Properties(Box::new(PropertiesI {
             captures_len: p.captures_len().saturating_add(1),
             literal: false,
@@ -3055,7 +3051,7 @@ mod tests {
         let run = || {
             let mut expr = Hir::empty();
             for _ in 0..100 {
-                expr = Hir::group(Group {
+                expr = Hir::capture(Capture {
                     index: 1,
                     name: None,
                     hir: Box::new(expr),
diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs
index 63d78ad00..ef654d40c 100644
--- a/regex-syntax/src/hir/print.rs
+++ b/regex-syntax/src/hir/print.rs
@@ -190,7 +190,7 @@ impl Visitor for Writer {
                     self.wtr.write_str(r"\B")?;
                 }
             },
-            HirKind::Group(hir::Group { ref name, .. }) => {
+            HirKind::Capture(hir::Capture { ref name, .. }) => {
                 self.wtr.write_str("(")?;
                 if let Some(ref name) = *name {
                     write!(self.wtr, "?P<{}>", name)?;
@@ -254,7 +254,7 @@ impl Visitor for Writer {
                     self.wtr.write_str("?")?;
                 }
             }
-            HirKind::Group(_)
+            HirKind::Capture(_)
             | HirKind::Concat(_)
             | HirKind::Alternation(_) => {
                 self.wtr.write_str(r")")?;
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index b5bb41767..fcef47a1e 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -173,7 +173,7 @@ enum HirFrame {
     /// This sentinel only exists to stop other things (like flattening
     /// literals) from reaching across repetition operators.
     Repetition,
-    /// This is pushed on to the stack upon first seeing any kind of group,
+    /// This is pushed on to the stack upon first seeing any kind of capture,
     /// indicated by parentheses (including non-capturing groups). It is popped
     /// upon leaving a group.
     Group {
@@ -414,7 +414,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
                 let expr = self.pop().unwrap().unwrap_expr();
                 let old_flags = self.pop().unwrap().unwrap_group();
                 self.trans().flags.set(old_flags);
-                self.push(HirFrame::Expr(self.hir_group(x, expr)));
+                self.push(HirFrame::Expr(self.hir_capture(x, expr)));
             }
             Ast::Concat(_) => {
                 let mut exprs = vec![];
@@ -902,7 +902,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         })
     }
 
-    fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
+    fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
         let (index, name) = match group.kind {
             ast::GroupKind::CaptureIndex(index) => (index, None),
             ast::GroupKind::CaptureName { ref name, .. } => {
@@ -912,7 +912,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
             // in which the data type is defined handles this automatically.
             ast::GroupKind::NonCapturing(_) => return expr,
         };
-        Hir::group(hir::Group { index, name, hir: Box::new(expr) })
+        Hir::capture(hir::Capture { index, name, hir: Box::new(expr) })
     }
 
     fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
@@ -1352,12 +1352,12 @@ mod tests {
         Hir::literal(s)
     }
 
-    fn hir_group(index: u32, expr: Hir) -> Hir {
-        Hir::group(hir::Group { index, name: None, hir: Box::new(expr) })
+    fn hir_capture(index: u32, expr: Hir) -> Hir {
+        Hir::capture(hir::Capture { index, name: None, hir: Box::new(expr) })
     }
 
-    fn hir_group_name(index: u32, name: &str, expr: Hir) -> Hir {
-        Hir::group(hir::Group {
+    fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
+        Hir::capture(hir::Capture {
             index,
             name: Some(name.into()),
             hir: Box::new(expr),
@@ -1528,35 +1528,35 @@ mod tests {
     fn empty() {
         assert_eq!(t(""), Hir::empty());
         assert_eq!(t("(?i)"), Hir::empty());
-        assert_eq!(t("()"), hir_group(1, Hir::empty()));
+        assert_eq!(t("()"), hir_capture(1, Hir::empty()));
         assert_eq!(t("(?:)"), Hir::empty());
-        assert_eq!(t("(?P)"), hir_group_name(1, "wat", Hir::empty()));
+        assert_eq!(t("(?P)"), hir_capture_name(1, "wat", Hir::empty()));
         assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
         assert_eq!(
             t("()|()"),
             hir_alt(vec![
-                hir_group(1, Hir::empty()),
-                hir_group(2, Hir::empty()),
+                hir_capture(1, Hir::empty()),
+                hir_capture(2, Hir::empty()),
             ])
         );
         assert_eq!(
             t("(|b)"),
-            hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
+            hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
         );
         assert_eq!(
             t("(a|)"),
-            hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
+            hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
         );
         assert_eq!(
             t("(a||c)"),
-            hir_group(
+            hir_capture(
                 1,
                 hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
             )
         );
         assert_eq!(
             t("(||)"),
-            hir_group(
+            hir_capture(
                 1,
                 hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
             )
@@ -1740,56 +1740,59 @@ mod tests {
 
     #[test]
     fn group() {
-        assert_eq!(t("(a)"), hir_group(1, hir_lit("a")));
+        assert_eq!(t("(a)"), hir_capture(1, hir_lit("a")));
         assert_eq!(
             t("(a)(b)"),
             hir_cat(vec![
-                hir_group(1, hir_lit("a")),
-                hir_group(2, hir_lit("b")),
+                hir_capture(1, hir_lit("a")),
+                hir_capture(2, hir_lit("b")),
             ])
         );
         assert_eq!(
             t("(a)|(b)"),
             hir_alt(vec![
-                hir_group(1, hir_lit("a")),
-                hir_group(2, hir_lit("b")),
+                hir_capture(1, hir_lit("a")),
+                hir_capture(2, hir_lit("b")),
             ])
         );
-        assert_eq!(t("(?P)"), hir_group_name(1, "foo", Hir::empty()));
-        assert_eq!(t("(?Pa)"), hir_group_name(1, "foo", hir_lit("a")));
+        assert_eq!(t("(?P)"), hir_capture_name(1, "foo", Hir::empty()));
+        assert_eq!(t("(?Pa)"), hir_capture_name(1, "foo", hir_lit("a")));
         assert_eq!(
             t("(?Pa)(?Pb)"),
             hir_cat(vec![
-                hir_group_name(1, "foo", hir_lit("a")),
-                hir_group_name(2, "bar", hir_lit("b")),
+                hir_capture_name(1, "foo", hir_lit("a")),
+                hir_capture_name(2, "bar", hir_lit("b")),
             ])
         );
         assert_eq!(t("(?:)"), Hir::empty());
         assert_eq!(t("(?:a)"), hir_lit("a"));
         assert_eq!(
             t("(?:a)(b)"),
-            hir_cat(vec![hir_lit("a"), hir_group(1, hir_lit("b")),])
+            hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),])
         );
         assert_eq!(
             t("(a)(?:b)(c)"),
             hir_cat(vec![
-                hir_group(1, hir_lit("a")),
+                hir_capture(1, hir_lit("a")),
                 hir_lit("b"),
-                hir_group(2, hir_lit("c")),
+                hir_capture(2, hir_lit("c")),
             ])
         );
         assert_eq!(
             t("(a)(?Pb)(c)"),
             hir_cat(vec![
-                hir_group(1, hir_lit("a")),
-                hir_group_name(2, "foo", hir_lit("b")),
-                hir_group(3, hir_lit("c")),
+                hir_capture(1, hir_lit("a")),
+                hir_capture_name(2, "foo", hir_lit("b")),
+                hir_capture(3, hir_lit("c")),
             ])
         );
-        assert_eq!(t("()"), hir_group(1, Hir::empty()));
-        assert_eq!(t("((?i))"), hir_group(1, Hir::empty()));
-        assert_eq!(t("((?x))"), hir_group(1, Hir::empty()));
-        assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty())));
+        assert_eq!(t("()"), hir_capture(1, Hir::empty()));
+        assert_eq!(t("((?i))"), hir_capture(1, Hir::empty()));
+        assert_eq!(t("((?x))"), hir_capture(1, Hir::empty()));
+        assert_eq!(
+            t("(((?x)))"),
+            hir_capture(1, hir_capture(2, Hir::empty()))
+        );
     }
 
     #[test]
@@ -1818,7 +1821,7 @@ mod tests {
         assert_eq!(
             t("((?i-u)a)b"),
             hir_cat(vec![
-                hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
+                hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
                 hir_lit("b"),
             ])
         );
@@ -1908,7 +1911,7 @@ mod tests {
             t("ab?"),
             hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
         );
-        assert_eq!(t("(ab)?"), hir_quest(true, hir_group(1, hir_lit("ab"))));
+        assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
         assert_eq!(
             t("a|b?"),
             hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
@@ -1922,7 +1925,7 @@ mod tests {
         let c = || hir_look(hir::Look::WordUnicode);
         let d = || hir_look(hir::Look::WordUnicodeNegate);
 
-        assert_eq!(t("(^$)"), hir_group(1, hir_cat(vec![a(), b()])));
+        assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()])));
         assert_eq!(t("^|$"), hir_alt(vec![a(), b()]));
         assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()]));
         assert_eq!(
@@ -1933,11 +1936,14 @@ mod tests {
                 hir_cat(vec![c(), d()]),
             ])
         );
-        assert_eq!(t("(^|$)"), hir_group(1, hir_alt(vec![a(), b()])));
-        assert_eq!(t(r"(^|$|\b)"), hir_group(1, hir_alt(vec![a(), b(), c()])));
+        assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()])));
+        assert_eq!(
+            t(r"(^|$|\b)"),
+            hir_capture(1, hir_alt(vec![a(), b(), c()]))
+        );
         assert_eq!(
             t(r"(^$|$\b|\b\B)"),
-            hir_group(
+            hir_capture(
                 1,
                 hir_alt(vec![
                     hir_cat(vec![a(), b()]),
@@ -1948,15 +1954,15 @@ mod tests {
         );
         assert_eq!(
             t(r"(^$|($\b|(\b\B)))"),
-            hir_group(
+            hir_capture(
                 1,
                 hir_alt(vec![
                     hir_cat(vec![a(), b()]),
-                    hir_group(
+                    hir_capture(
                         2,
                         hir_alt(vec![
                             hir_cat(vec![b(), c()]),
-                            hir_group(3, hir_cat(vec![c(), d()])),
+                            hir_capture(3, hir_cat(vec![c(), d()])),
                         ])
                     ),
                 ])
diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs
index 0012d5697..ba1db238a 100644
--- a/regex-syntax/src/hir/visitor.rs
+++ b/regex-syntax/src/hir/visitor.rs
@@ -75,9 +75,9 @@ enum Frame<'a> {
     /// A stack frame allocated just before descending into a repetition
     /// operator's child node.
     Repetition(&'a hir::Repetition),
-    /// A stack frame allocated just before descending into a group's child
+    /// A stack frame allocated just before descending into a capture's child
     /// node.
-    Group(&'a hir::Group),
+    Capture(&'a hir::Capture),
     /// The stack frame used while visiting every child node of a concatenation
     /// of expressions.
     Concat {
@@ -150,7 +150,7 @@ impl<'a> HeapVisitor<'a> {
     fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> {
         match *hir.kind() {
             HirKind::Repetition(ref x) => Some(Frame::Repetition(x)),
-            HirKind::Group(ref x) => Some(Frame::Group(x)),
+            HirKind::Capture(ref x) => Some(Frame::Capture(x)),
             HirKind::Concat(ref x) if x.is_empty() => None,
             HirKind::Concat(ref x) => {
                 Some(Frame::Concat { head: &x[0], tail: &x[1..] })
@@ -168,7 +168,7 @@ impl<'a> HeapVisitor<'a> {
     fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
         match induct {
             Frame::Repetition(_) => None,
-            Frame::Group(_) => None,
+            Frame::Capture(_) => None,
             Frame::Concat { tail, .. } => {
                 if tail.is_empty() {
                     None
@@ -196,7 +196,7 @@ impl<'a> Frame<'a> {
     fn child(&self) -> &'a Hir {
         match *self {
             Frame::Repetition(rep) => &rep.hir,
-            Frame::Group(group) => &group.hir,
+            Frame::Capture(capture) => &capture.hir,
             Frame::Concat { head, .. } => head,
             Frame::Alternation { head, .. } => head,
         }
diff --git a/src/compile.rs b/src/compile.rs
index 692533340..50ab78700 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -368,7 +368,7 @@ impl Compiler {
                     self.c_empty_look(prog::EmptyLook::NotWordBoundary)
                 }
             },
-            Group(hir::Group { index, ref name, ref hir }) => {
+            Capture(hir::Capture { index, ref name, ref hir }) => {
                 if index as usize >= self.compiled.captures.len() {
                     let name = match *name {
                         None => None,

From 6cb02d93094cd7a15620e6a054703e2f9cfa1277 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Tue, 28 Feb 2023 17:18:10 -0500
Subject: [PATCH 55/79] syntax: rename 'hir' to 'sub'

Where 'sub' is short for 'sub-expression.'
---
 regex-syntax/src/hir/literal.rs   |  4 ++--
 regex-syntax/src/hir/mod.rs       | 22 +++++++++++-----------
 regex-syntax/src/hir/print.rs     |  8 ++++----
 regex-syntax/src/hir/translate.rs | 16 ++++++++--------
 regex-syntax/src/hir/visitor.rs   |  4 ++--
 src/compile.rs                    | 16 ++++++++--------
 6 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs
index 2fbcdcff2..121216ae5 100644
--- a/regex-syntax/src/hir/literal.rs
+++ b/regex-syntax/src/hir/literal.rs
@@ -186,7 +186,7 @@ impl Extractor {
             }
             Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls),
             Repetition(ref rep) => self.extract_repetition(rep),
-            Capture(hir::Capture { ref hir, .. }) => self.extract(hir),
+            Capture(hir::Capture { ref sub, .. }) => self.extract(sub),
             Concat(ref hirs) => match self.kind {
                 ExtractKind::Prefix => self.extract_concat(hirs.iter()),
                 ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()),
@@ -448,7 +448,7 @@ impl Extractor {
     /// literals being extracted, which might actually be a better prefilter
     /// than just 'a'.
     fn extract_repetition(&self, rep: &hir::Repetition) -> Seq {
-        let mut subseq = self.extract(&rep.hir);
+        let mut subseq = self.extract(&rep.sub);
         match *rep {
             hir::Repetition { min: 0, max, greedy, .. } => {
                 // When 'max=1', we can retain exactness, since 'a?' is
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index 7be604a66..a2465e98a 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -323,7 +323,7 @@ impl Hir {
         if rep.min == 0 && rep.max == Some(0) {
             return Hir::empty();
         } else if rep.min == 1 && rep.max == Some(1) {
-            return *rep.hir;
+            return *rep.sub;
         }
         let props = Properties::repetition(&rep);
         Hir { kind: HirKind::Repetition(rep), props }
@@ -1437,7 +1437,7 @@ pub struct Capture {
     /// The name of the capture, if it exists.
     pub name: Option<Box<str>>,
     /// The expression inside the capturing group, which may be empty.
-    pub hir: Box<Hir>,
+    pub sub: Box<Hir>,
 }
 
 /// The high-level intermediate representation of a repetition operator.
@@ -1467,7 +1467,7 @@ pub struct Repetition {
     /// not. However, this can be inverted via the `U` "ungreedy" flag.
     pub greedy: bool,
     /// The expression being repeated.
-    pub hir: Box<Hir>,
+    pub sub: Box<Hir>,
 }
 
 impl Repetition {
@@ -1523,8 +1523,8 @@ impl Drop for Hir {
             | HirKind::Literal(_)
             | HirKind::Class(_)
             | HirKind::Look(_) => return,
-            HirKind::Capture(ref x) if !x.hir.kind.has_subexprs() => return,
-            HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
+            HirKind::Capture(ref x) if !x.sub.kind.has_subexprs() => return,
+            HirKind::Repetition(ref x) if !x.sub.kind.has_subexprs() => return,
             HirKind::Concat(ref x) if x.is_empty() => return,
             HirKind::Alternation(ref x) if x.is_empty() => return,
             _ => {}
@@ -1538,10 +1538,10 @@ impl Drop for Hir {
                 | HirKind::Class(_)
                 | HirKind::Look(_) => {}
                 HirKind::Capture(ref mut x) => {
-                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
+                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
                 }
                 HirKind::Repetition(ref mut x) => {
-                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
+                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
                 }
                 HirKind::Concat(ref mut x) => {
                     stack.extend(x.drain(..));
@@ -1926,7 +1926,7 @@ impl Properties {
 
     /// Create a new set of HIR properties for a repetition.
     fn repetition(rep: &Repetition) -> Properties {
-        let p = rep.hir.properties();
+        let p = rep.sub.properties();
         let minimum_len = p.minimum_len().map(|child_min| {
             let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
             child_min.saturating_mul(rep_min)
@@ -1957,7 +1957,7 @@ impl Properties {
 
     /// Create a new set of HIR properties for a capture.
     fn capture(capture: &Capture) -> Properties {
-        let p = capture.hir.properties();
+        let p = capture.sub.properties();
         Properties(Box::new(PropertiesI {
             captures_len: p.captures_len().saturating_add(1),
             literal: false,
@@ -3054,13 +3054,13 @@ mod tests {
                 expr = Hir::capture(Capture {
                     index: 1,
                     name: None,
-                    hir: Box::new(expr),
+                    sub: Box::new(expr),
                 });
                 expr = Hir::repetition(Repetition {
                     min: 0,
                     max: Some(1),
                     greedy: true,
-                    hir: Box::new(expr),
+                    sub: Box::new(expr),
                 });
 
                 expr = Hir {
diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs
index ef654d40c..40f8905b7 100644
--- a/regex-syntax/src/hir/print.rs
+++ b/regex-syntax/src/hir/print.rs
@@ -478,7 +478,7 @@ mod tests {
                 min: 1,
                 max: None,
                 greedy: true,
-                hir: Box::new(Hir::literal("ab".as_bytes())),
+                sub: Box::new(Hir::literal("ab".as_bytes())),
             }),
             Hir::literal("y".as_bytes()),
         ]);
@@ -490,7 +490,7 @@ mod tests {
                 min: 1,
                 max: None,
                 greedy: true,
-                hir: Box::new(Hir::concat(alloc::vec![
+                sub: Box::new(Hir::concat(alloc::vec![
                     Hir::look(hir::Look::Start),
                     Hir::look(hir::Look::End),
                 ])),
@@ -512,7 +512,7 @@ mod tests {
                 min: 1,
                 max: None,
                 greedy: true,
-                hir: Box::new(Hir::alternation(alloc::vec![
+                sub: Box::new(Hir::alternation(alloc::vec![
                     Hir::literal("cd".as_bytes()),
                     Hir::literal("ef".as_bytes()),
                 ])),
@@ -527,7 +527,7 @@ mod tests {
                 min: 1,
                 max: None,
                 greedy: true,
-                hir: Box::new(Hir::alternation(alloc::vec![
+                sub: Box::new(Hir::alternation(alloc::vec![
                     Hir::look(hir::Look::Start),
                     Hir::look(hir::Look::End),
                 ])),
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index fcef47a1e..b1ebf7b17 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -912,7 +912,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
             // in which the data type is defined handles this automatically.
             ast::GroupKind::NonCapturing(_) => return expr,
         };
-        Hir::capture(hir::Capture { index, name, hir: Box::new(expr) })
+        Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
     }
 
     fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
@@ -937,7 +937,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
             min,
             max,
             greedy,
-            hir: Box::new(expr),
+            sub: Box::new(expr),
         })
     }
 
@@ -1353,14 +1353,14 @@ mod tests {
     }
 
     fn hir_capture(index: u32, expr: Hir) -> Hir {
-        Hir::capture(hir::Capture { index, name: None, hir: Box::new(expr) })
+        Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
     }
 
     fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
         Hir::capture(hir::Capture {
             index,
             name: Some(name.into()),
-            hir: Box::new(expr),
+            sub: Box::new(expr),
         })
     }
 
@@ -1369,7 +1369,7 @@ mod tests {
             min: 0,
             max: Some(1),
             greedy,
-            hir: Box::new(expr),
+            sub: Box::new(expr),
         })
     }
 
@@ -1378,7 +1378,7 @@ mod tests {
             min: 0,
             max: None,
             greedy,
-            hir: Box::new(expr),
+            sub: Box::new(expr),
         })
     }
 
@@ -1387,7 +1387,7 @@ mod tests {
             min: 1,
             max: None,
             greedy,
-            hir: Box::new(expr),
+            sub: Box::new(expr),
         })
     }
 
@@ -1396,7 +1396,7 @@ mod tests {
             min,
             max,
             greedy,
-            hir: Box::new(expr),
+            sub: Box::new(expr),
         })
     }
 
diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs
index ba1db238a..e5f15cf1c 100644
--- a/regex-syntax/src/hir/visitor.rs
+++ b/regex-syntax/src/hir/visitor.rs
@@ -195,8 +195,8 @@ impl<'a> Frame<'a> {
     /// child HIR node to visit.
     fn child(&self) -> &'a Hir {
         match *self {
-            Frame::Repetition(rep) => &rep.hir,
-            Frame::Capture(capture) => &capture.hir,
+            Frame::Repetition(rep) => &rep.sub,
+            Frame::Capture(capture) => &capture.sub,
             Frame::Concat { head, .. } => head,
             Frame::Alternation { head, .. } => head,
         }
diff --git a/src/compile.rs b/src/compile.rs
index 50ab78700..9ee52354d 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -368,7 +368,7 @@ impl Compiler {
                     self.c_empty_look(prog::EmptyLook::NotWordBoundary)
                 }
             },
-            Capture(hir::Capture { index, ref name, ref hir }) => {
+            Capture(hir::Capture { index, ref name, ref sub }) => {
                 if index as usize >= self.compiled.captures.len() {
                     let name = match *name {
                         None => None,
@@ -379,7 +379,7 @@ impl Compiler {
                         self.capture_name_idx.insert(name, index as usize);
                     }
                 }
-                self.c_capture(2 * index as usize, hir)
+                self.c_capture(2 * index as usize, sub)
             }
             Concat(ref es) => {
                 if self.compiled.is_reverse {
@@ -434,7 +434,7 @@ impl Compiler {
                 min: 0,
                 max: None,
                 greedy: false,
-                hir: Box::new(hir),
+                sub: Box::new(hir),
             }))?
             .unwrap())
     }
@@ -644,14 +644,14 @@ impl Compiler {
 
     fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
         match (rep.min, rep.max) {
-            (0, Some(1)) => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
-            (0, None) => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
-            (1, None) => self.c_repeat_one_or_more(&rep.hir, rep.greedy),
+            (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
+            (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
+            (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
             (min, None) => {
-                self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min)
+                self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
             }
             (min, Some(max)) => {
-                self.c_repeat_range(&rep.hir, rep.greedy, min, max)
+                self.c_repeat_range(&rep.sub, rep.greedy, min, max)
             }
         }
     }

From 0114235f9a0ffa5ce933cff31a821b5ab63fdc69 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Wed, 1 Mar 2023 22:28:42 -0500
Subject: [PATCH 56/79] syntax: add support for CRLF-aware line anchors

This adds Look::StartCRLF and Look::EndCRLF. It also adds a new flag,
'R', which makes ^/$ CRLF-aware in multi-line mode. The 'R' flag also
causes '.' to *not* match \r in addition to \n (unless the 's' flag is
enabled, of course).

The intended semantics are that CRLF mode makes \r\n, \r and \n line
terminators but with one key property: \r\n is treated as a single line
terminator. That is, ^/$ do not match between \r and \n.

This partially addresses #244 by adding syntax support. Currently, if
you try to use this new flag, the regex compiler will report an error.
We intend to finish support for this once #656 is complete. (Indeed, at
time of writing, CRLF matching works in regex-automata.)
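
A sketch of the intended `$` semantics in CRLF mode (illustrative only,
not the actual implementation):

    // EndCRLF: match at the end of the haystack or just before `\r` or `\n`,
    // but never between a `\r` and a following `\n`, so that `\r\n` acts as
    // a single line terminator.
    fn is_end_crlf(haystack: &[u8], at: usize) -> bool {
        if at == haystack.len() {
            return true;
        }
        match haystack[at] {
            b'\r' => true,
            b'\n' => at == 0 || haystack[at - 1] != b'\r',
            _ => false,
        }
    }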
---
 regex-syntax/src/ast/mod.rs       |   2 +
 regex-syntax/src/ast/parse.rs     |  30 ++++++++
 regex-syntax/src/ast/print.rs     |   1 +
 regex-syntax/src/hir/mod.rs       |  83 +++++++++++++++++------
 regex-syntax/src/hir/print.rs     |   6 ++
 regex-syntax/src/hir/translate.rs | 109 ++++++++++++++++++++++++++++--
 regex-syntax/src/parser.rs        |  17 +++++
 src/compile.rs                    |   6 ++
 8 files changed, 226 insertions(+), 28 deletions(-)

diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
index f36f27791..9be867c56 100644
--- a/regex-syntax/src/ast/mod.rs
+++ b/regex-syntax/src/ast/mod.rs
@@ -1314,6 +1314,8 @@ pub enum Flag {
     SwapGreed,
     /// `u`
     Unicode,
+    /// `R`
+    CRLF,
     /// `x`
     IgnoreWhitespace,
 }
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index 1d6d4d046..93452cb18 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -1381,6 +1381,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
             's' => Ok(ast::Flag::DotMatchesNewLine),
             'U' => Ok(ast::Flag::SwapGreed),
             'u' => Ok(ast::Flag::Unicode),
+            'R' => Ok(ast::Flag::CRLF),
             'x' => Ok(ast::Flag::IgnoreWhitespace),
             _ => {
                 Err(self
@@ -4084,6 +4085,34 @@ bar
                 ],
             })
         );
+        assert_eq!(
+            parser("i-sR:").parse_flags(),
+            Ok(ast::Flags {
+                span: span(0..4),
+                items: vec![
+                    ast::FlagsItem {
+                        span: span(0..1),
+                        kind: ast::FlagsItemKind::Flag(
+                            ast::Flag::CaseInsensitive
+                        ),
+                    },
+                    ast::FlagsItem {
+                        span: span(1..2),
+                        kind: ast::FlagsItemKind::Negation,
+                    },
+                    ast::FlagsItem {
+                        span: span(2..3),
+                        kind: ast::FlagsItemKind::Flag(
+                            ast::Flag::DotMatchesNewLine
+                        ),
+                    },
+                    ast::FlagsItem {
+                        span: span(3..4),
+                        kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF),
+                    },
+                ],
+            })
+        );
 
         assert_eq!(
             parser("isU").parse_flags().unwrap_err(),
@@ -4145,6 +4174,7 @@ bar
         assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
         assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
         assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
+        assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF));
         assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
 
         assert_eq!(
diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs
index 0922ea0e3..40f967cfa 100644
--- a/regex-syntax/src/ast/print.rs
+++ b/regex-syntax/src/ast/print.rs
@@ -289,6 +289,7 @@ impl Writer {
                     Flag::DotMatchesNewLine => self.wtr.write_str("s"),
                     Flag::SwapGreed => self.wtr.write_str("U"),
                     Flag::Unicode => self.wtr.write_str("u"),
+                    Flag::CRLF => self.wtr.write_str("R"),
                     Flag::IgnoreWhitespace => self.wtr.write_str("x"),
                 },
             }?;
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index a2465e98a..ae361f48a 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -471,10 +471,12 @@ impl Hir {
 
     /// Returns an HIR expression for `.`.
     ///
-    /// * [`Dot::AnyChar`] maps to `(?su:.)`.
-    /// * [`Dot::AnyByte`] maps to `(?s-u:.)`.
-    /// * [`Dot::AnyCharExceptNL`] maps to `(?u-s:.)`.
-    /// * [`Dot::AnyByteExceptNL`] maps to `(?-su:.)`.
+    /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
+    /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
+    /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
+    /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
+    /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
+    /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
     ///
     /// Note that this is a convenience routine for constructing the correct
     /// character class based on the value of `Dot`. There is no explicit "dot"
@@ -492,18 +494,32 @@ impl Hir {
                 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
                 Hir::class(Class::Bytes(cls))
             }
-            Dot::AnyCharExceptNL => {
+            Dot::AnyCharExceptLF => {
                 let mut cls = ClassUnicode::empty();
                 cls.push(ClassUnicodeRange::new('\0', '\x09'));
                 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
                 Hir::class(Class::Unicode(cls))
             }
-            Dot::AnyByteExceptNL => {
+            Dot::AnyCharExceptCRLF => {
+                let mut cls = ClassUnicode::empty();
+                cls.push(ClassUnicodeRange::new('\0', '\x09'));
+                cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
+                cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
+                Hir::class(Class::Unicode(cls))
+            }
+            Dot::AnyByteExceptLF => {
                 let mut cls = ClassBytes::empty();
                 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
                 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
                 Hir::class(Class::Bytes(cls))
             }
+            Dot::AnyByteExceptCRLF => {
+                let mut cls = ClassBytes::empty();
+                cls.push(ClassBytesRange::new(b'\0', b'\x09'));
+                cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
+                cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
+                Hir::class(Class::Bytes(cls))
+            }
         }
     }
 }
@@ -1365,6 +1381,16 @@ pub enum Look {
     /// at the end position of the input, or at the position immediately
     /// preceding a `\n` character.
     EndLF,
+    /// Match the beginning of a line or the beginning of text. Specifically,
+    /// this matches at the starting position of the input, or at the position
+    /// immediately following either a `\r` or `\n` character, but never after
+    /// a `\r` when a `\n` follows.
+    StartCRLF,
+    /// Match the end of a line or the end of text. Specifically, this matches
+    /// at the end position of the input, or at the position immediately
+    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
+    /// precedes it.
+    EndCRLF,
     /// Match an ASCII-only word boundary. That is, this matches a position
     /// where the left adjacent character and right adjacent character
     /// correspond to a word and non-word or a non-word and word character.
@@ -1380,30 +1406,34 @@ pub enum Look {
 }
 
 impl Look {
-    fn from_repr(repr: u8) -> Option<Look> {
+    fn from_repr(repr: u16) -> Option<Look> {
         match repr {
             0 => Some(Look::Start),
             1 => Some(Look::End),
             2 => Some(Look::StartLF),
             3 => Some(Look::EndLF),
-            4 => Some(Look::WordAscii),
-            5 => Some(Look::WordAsciiNegate),
-            6 => Some(Look::WordUnicode),
-            7 => Some(Look::WordUnicodeNegate),
+            4 => Some(Look::StartCRLF),
+            5 => Some(Look::EndCRLF),
+            6 => Some(Look::WordAscii),
+            7 => Some(Look::WordAsciiNegate),
+            8 => Some(Look::WordUnicode),
+            9 => Some(Look::WordUnicodeNegate),
             _ => None,
         }
     }
 
-    fn as_repr(&self) -> u8 {
+    fn as_repr(&self) -> u16 {
         match *self {
             Look::Start => 0,
             Look::End => 1,
             Look::StartLF => 2,
             Look::EndLF => 3,
-            Look::WordAscii => 4,
-            Look::WordAsciiNegate => 5,
-            Look::WordUnicode => 6,
-            Look::WordUnicodeNegate => 7,
+            Look::StartCRLF => 4,
+            Look::EndCRLF => 5,
+            Look::WordAscii => 6,
+            Look::WordAsciiNegate => 7,
+            Look::WordUnicode => 8,
+            Look::WordUnicodeNegate => 9,
         }
     }
 
@@ -1413,6 +1443,8 @@ impl Look {
             Look::End => 'z',
             Look::StartLF => '^',
             Look::EndLF => '$',
+            Look::StartCRLF => '^',
+            Look::EndCRLF => '$',
             Look::WordAscii => 'b',
             Look::WordAsciiNegate => 'B',
             Look::WordUnicode => '𝛃',
@@ -1505,11 +1537,20 @@ pub enum Dot {
     /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
     ///
     /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
-    AnyCharExceptNL,
+    AnyCharExceptLF,
+    /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
+    /// and `\n`.
+    ///
+    /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
+    AnyCharExceptCRLF,
     /// Matches any byte value except for `\n`.
     ///
     /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
-    AnyByteExceptNL,
+    AnyByteExceptLF,
+    /// Matches any byte value except for `\r` and `\n`.
+    ///
+    /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
+    AnyByteExceptCRLF,
 }
 
 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
@@ -2038,7 +2079,7 @@ impl Properties {
 /// example, an [`Hir`] provides properties that return `LookSet`s.
 #[derive(Clone, Copy, Default, Eq, PartialEq)]
 pub struct LookSet {
-    bits: u8,
+    bits: u16,
 }
 
 impl LookSet {
@@ -2170,8 +2211,8 @@ impl Iterator for LookSetIter {
     #[inline]
     fn next(&mut self) -> Option<Look> {
         // We'll never have more than u8::MAX distinct look-around assertions,
-        // so 'repr' will always fit into a usize.
-        let repr = u8::try_from(self.set.bits.trailing_zeros()).unwrap();
+        // so 'repr' will always fit into a u16.
+        let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
         let look = Look::from_repr(repr)?;
         self.set.remove(look);
         Some(look)
diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs
index 40f8905b7..fcb7cd252 100644
--- a/regex-syntax/src/hir/print.rs
+++ b/regex-syntax/src/hir/print.rs
@@ -177,6 +177,12 @@ impl<W: fmt::Write> Visitor for Writer<W> {
                 hir::Look::EndLF => {
                     self.wtr.write_str("(?m:$)")?;
                 }
+                hir::Look::StartCRLF => {
+                    self.wtr.write_str("(?mR:^)")?;
+                }
+                hir::Look::EndCRLF => {
+                    self.wtr.write_str("(?mR:$)")?;
+                }
                 hir::Look::WordAscii => {
                     self.wtr.write_str(r"(?-u:\b)")?;
                 }
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index b1ebf7b17..c1ebf85c2 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -85,6 +85,12 @@ impl TranslatorBuilder {
         self
     }
 
+    /// Enable or disable the CRLF mode flag (`R`) by default.
+    pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
+        self.flags.crlf = if yes { Some(true) } else { None };
+        self
+    }
+
     /// Enable or disable the "swap greed" flag (`U`) by default.
     pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
         self.flags.swap_greed = if yes { Some(true) } else { None };
@@ -866,14 +872,23 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
     fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
         let unicode = self.flags().unicode();
         let multi_line = self.flags().multi_line();
+        let crlf = self.flags().crlf();
         Ok(match asst.kind {
             ast::AssertionKind::StartLine => Hir::look(if multi_line {
-                hir::Look::StartLF
+                if crlf {
+                    hir::Look::StartCRLF
+                } else {
+                    hir::Look::StartLF
+                }
             } else {
                 hir::Look::Start
             }),
             ast::AssertionKind::EndLine => Hir::look(if multi_line {
-                hir::Look::EndLF
+                if crlf {
+                    hir::Look::EndCRLF
+                } else {
+                    hir::Look::EndLF
+                }
             } else {
                 hir::Look::End
             }),
@@ -1146,6 +1161,7 @@ struct Flags {
     dot_matches_new_line: Option<bool>,
     swap_greed: Option<bool>,
     unicode: Option<bool>,
+    crlf: Option<bool>,
     // Note that `ignore_whitespace` is omitted here because it is handled
     // entirely in the parser.
 }
@@ -1174,6 +1190,9 @@ impl Flags {
                 ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
                     flags.unicode = Some(enable);
                 }
+                ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
+                    flags.crlf = Some(enable);
+                }
                 ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
             }
         }
@@ -1196,6 +1215,9 @@ impl Flags {
         if self.unicode.is_none() {
             self.unicode = previous.unicode;
         }
+        if self.crlf.is_none() {
+            self.crlf = previous.crlf;
+        }
     }
 
     fn dot(&self) -> hir::Dot {
@@ -1207,9 +1229,17 @@ impl Flags {
             }
         } else {
             if self.unicode() {
-                hir::Dot::AnyCharExceptNL
+                if self.crlf() {
+                    hir::Dot::AnyCharExceptCRLF
+                } else {
+                    hir::Dot::AnyCharExceptLF
+                }
             } else {
-                hir::Dot::AnyByteExceptNL
+                if self.crlf() {
+                    hir::Dot::AnyByteExceptCRLF
+                } else {
+                    hir::Dot::AnyByteExceptLF
+                }
             }
         }
     }
@@ -1233,6 +1263,10 @@ impl Flags {
     fn unicode(&self) -> bool {
         self.unicode.unwrap_or(true)
     }
+
+    fn crlf(&self) -> bool {
+        self.crlf.unwrap_or(false)
+    }
 }
 
 fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
@@ -1678,14 +1712,32 @@ mod tests {
     fn dot() {
         assert_eq!(
             t("."),
-            hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),])
+            hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
         );
-        assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),]));
+        assert_eq!(
+            t("(?R)."),
+            hir_uclass(&[
+                ('\0', '\t'),
+                ('\x0B', '\x0C'),
+                ('\x0E', '\u{10FFFF}'),
+            ])
+        );
+        assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
+        assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
         assert_eq!(
             t_bytes("(?-u)."),
-            hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),])
+            hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
+        );
+        assert_eq!(
+            t_bytes("(?R-u)."),
+            hir_bclass(&[
+                (b'\0', b'\t'),
+                (b'\x0B', b'\x0C'),
+                (b'\x0E', b'\xFF'),
+            ])
         );
         assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
+        assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
 
         // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
         assert_eq!(
@@ -1698,6 +1750,16 @@ mod tests {
                 ),
             }
         );
+        assert_eq!(
+            t_err("(?R-u)."),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(6, 1, 7),
+                    Position::new(7, 1, 8)
+                ),
+            }
+        );
         assert_eq!(
             t_err("(?s-u)."),
             TestError {
@@ -1708,6 +1770,16 @@ mod tests {
                 ),
             }
         );
+        assert_eq!(
+            t_err("(?Rs-u)."),
+            TestError {
+                kind: hir::ErrorKind::InvalidUtf8,
+                span: Span::new(
+                    Position::new(7, 1, 8),
+                    Position::new(8, 1, 9)
+                ),
+            }
+        );
     }
 
     #[test]
@@ -1795,6 +1867,29 @@ mod tests {
         );
     }
 
+    #[test]
+    fn line_anchors() {
+        assert_eq!(t("^"), hir_look(hir::Look::Start));
+        assert_eq!(t("$"), hir_look(hir::Look::End));
+        assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
+        assert_eq!(t(r"\z"), hir_look(hir::Look::End));
+
+        assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
+        assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
+        assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
+        assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
+
+        assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start));
+        assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End));
+        assert_eq!(t("(?R)^"), hir_look(hir::Look::Start));
+        assert_eq!(t("(?R)$"), hir_look(hir::Look::End));
+
+        assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start));
+        assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End));
+        assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF));
+        assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF));
+    }
+
     #[test]
     fn flags() {
         #[cfg(feature = "unicode-case")]
diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
index 8eb88e042..2851cda33 100644
--- a/regex-syntax/src/parser.rs
+++ b/regex-syntax/src/parser.rs
@@ -134,6 +134,23 @@ impl ParserBuilder {
         self
     }
 
+    /// Enable or disable the CRLF mode flag by default.
+    ///
+    /// By default this is disabled. It may alternatively be selectively
+    /// enabled in the regular expression itself via the `R` flag.
+    ///
+    /// When CRLF mode is enabled, the following happens:
+    ///
+    /// * Unless `dot_matches_new_line` is enabled, `.` will match any character
+    /// except for `\r` and `\n`.
+    /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
+    /// `\r` and `\n` as line terminators. And in particular, neither will
+    /// match between a `\r` and a `\n`.
+    pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder {
+        self.hir.crlf(yes);
+        self
+    }
+
     /// Enable or disable the "swap greed" flag by default.
     ///
     /// By default this is disabled. It may alternatively be selectively
diff --git a/src/compile.rs b/src/compile.rs
index 9ee52354d..20eebf0ed 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -326,6 +326,12 @@ impl Compiler {
                     self.byte_classes.set_range(b'\n', b'\n');
                     self.c_empty_look(prog::EmptyLook::EndLine)
                 }
+                hir::Look::StartCRLF | hir::Look::EndCRLF => {
+                    return Err(Error::Syntax(
+                        "CRLF-aware line anchors are not supported yet"
+                            .to_string(),
+                    ));
+                }
                 hir::Look::WordAscii => {
                     self.byte_classes.set_word_boundary();
                     self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)

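A rough sketch of what the new CRLF mode does, assuming the `ParserBuilder::crlf`
knob and the `Look::StartCRLF`/`Look::EndCRLF` variants introduced above (an
illustrative example, not part of the diff):

```rust
use regex_syntax::{
    hir::{Hir, Look},
    ParserBuilder,
};

fn main() {
    // With CRLF mode enabled on the builder, a multi-line `^` translates to
    // the CRLF-aware start-of-line assertion rather than the \n-only one.
    let hir = ParserBuilder::new()
        .crlf(true)
        .multi_line(true)
        .build()
        .parse("^")
        .unwrap();
    assert_eq!(hir, Hir::look(Look::StartCRLF));

    // The same behavior can be requested inline with the new `R` flag.
    let hir = ParserBuilder::new().build().parse("(?Rm)$").unwrap();
    assert_eq!(hir, Hir::look(Look::EndCRLF));
}
```
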
From e4006afcb856da9dae9af29a624a9dcc2da2eaf4 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Tue, 28 Feb 2023 16:49:39 -0500
Subject: [PATCH 57/79] syntax: polish and doc updates

This updates docs in a number of places, including adding examples.

We also make it so zero-width matches never impact the 'utf8' property.
In practice, this means '(?-u:\B)' is now considered to match valid
UTF-8, which is consistent with the fact that 'a*' is considered to
match valid UTF-8 too.

We also do a refresh of the 'Look' and 'LookSet' APIs.
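
For example, the refreshed API looks roughly like this (an illustrative
sketch, using the top-level `parse` convenience function and the `Look`
methods shown in the diff below):

```rust
use regex_syntax::{hir::Look, parse};

fn main() {
    // 'Look' now exposes a stable integer representation that round-trips.
    let repr = Look::WordUnicode.as_repr();
    assert_eq!(Some(Look::WordUnicode), Look::from_repr(repr));

    // Assertions can be flipped for reverse searches; word boundaries are
    // direction-independent and map to themselves.
    assert_eq!(Look::EndLF, Look::StartLF.reversed());
    assert_eq!(Look::WordAscii, Look::WordAscii.reversed());

    // Zero-width assertions no longer affect the 'utf8' property, so
    // '(?-u:\B)' is considered to match valid UTF-8.
    let hir = parse(r"(?-u:\B)").unwrap();
    assert!(hir.properties().is_utf8());
}
```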
---
 regex-syntax/README.md            |   9 +-
 regex-syntax/src/error.rs         |  17 +-
 regex-syntax/src/hir/literal.rs   |  26 +-
 regex-syntax/src/hir/mod.rs       | 914 ++++++++++++++++++++++--------
 regex-syntax/src/hir/translate.rs |  47 +-
 regex-syntax/src/lib.rs           |  13 +-
 regex-syntax/src/parser.rs        |  18 +-
 regex-syntax/test                 |   2 +-
 src/compile.rs                    |   2 +-
 src/exec.rs                       |  14 +-
 10 files changed, 746 insertions(+), 316 deletions(-)

diff --git a/regex-syntax/README.md b/regex-syntax/README.md
index 592f84268..ff4fe094c 100644
--- a/regex-syntax/README.md
+++ b/regex-syntax/README.md
@@ -30,13 +30,12 @@ concrete syntax that produced the `Hir`.
 This example shows how to parse a pattern string into its HIR:
 
 ```rust
-use regex_syntax::Parser;
-use regex_syntax::hir::{self, Hir};
+use regex_syntax::{hir::Hir, parse};
 
-let hir = Parser::new().parse("a|b").unwrap();
+let hir = parse("a|b").unwrap();
 assert_eq!(hir, Hir::alternation(vec![
-    Hir::literal(hir::Literal::Unicode('a')),
-    Hir::literal(hir::Literal::Unicode('b')),
+    Hir::literal("a".as_bytes()),
+    Hir::literal("b".as_bytes()),
 ]));
 ```
 
diff --git a/regex-syntax/src/error.rs b/regex-syntax/src/error.rs
index a10230a87..98869c4f7 100644
--- a/regex-syntax/src/error.rs
+++ b/regex-syntax/src/error.rs
@@ -1,5 +1,3 @@
-use core::{cmp, fmt, result};
-
 use alloc::{
     format,
     string::{String, ToString},
@@ -9,9 +7,6 @@ use alloc::{
 
 use crate::{ast, hir};
 
-/// A type alias for dealing with errors returned by this crate.
-pub type Result<T> = result::Result<T, Error>;
-
 /// This error type encompasses any error that can be returned by this crate.
 ///
 /// This error type is marked as `non_exhaustive`. This means that adding a
@@ -42,8 +37,8 @@ impl From for Error {
 #[cfg(feature = "std")]
 impl std::error::Error for Error {}
 
-impl fmt::Display for Error {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+impl core::fmt::Display for Error {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         match *self {
             Error::Parse(ref x) => x.fmt(f),
             Error::Translate(ref x) => x.fmt(f),
@@ -91,8 +86,8 @@ impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> {
     }
 }
 
-impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+impl<'e, E: core::fmt::Display> core::fmt::Display for Formatter<'e, E> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
         let spans = Spans::from_formatter(self);
         if self.pattern.contains('\n') {
             let divider = repeat_char('~', 79);
@@ -158,7 +153,7 @@ struct Spans<'p> {
 
 impl<'p> Spans<'p> {
     /// Build a sequence of spans from a formatter.
-    fn from_formatter<'e, E: fmt::Display>(
+    fn from_formatter<'e, E: core::fmt::Display>(
         fmter: &'p Formatter<'e, E>,
     ) -> Spans<'p> {
         let mut line_count = fmter.pattern.lines().count();
@@ -238,7 +233,7 @@ impl<'p> Spans<'p> {
                 pos += 1;
             }
             let note_len = span.end.column.saturating_sub(span.start.column);
-            for _ in 0..cmp::max(1, note_len) {
+            for _ in 0..core::cmp::max(1, note_len) {
                 notes.push('^');
                 pos += 1;
             }
diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs
index 121216ae5..bd3a2d143 100644
--- a/regex-syntax/src/hir/literal.rs
+++ b/regex-syntax/src/hir/literal.rs
@@ -101,10 +101,9 @@ use crate::hir::{self, Hir};
 /// This shows how to extract prefixes:
 ///
 /// ```
-/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser};
-///
-/// let hir = Parser::new().parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?;
+/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
 ///
+/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?;
 /// let got = Extractor::new().extract(&hir);
 /// // All literals returned are "inexact" because none of them reach the
 /// // match state.
@@ -129,11 +128,10 @@ use crate::hir::{self, Hir};
 /// ```
 /// use regex_syntax::{
 ///     hir::literal::{Extractor, ExtractKind, Literal, Seq},
-///     Parser,
+///     parse,
 /// };
 ///
-/// let hir = Parser::new().parse(r"foo|[A-Z]+bar")?;
-///
+/// let hir = parse(r"foo|[A-Z]+bar")?;
 /// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir);
 /// // Since 'foo' gets to a match state, it is considered exact. But 'bar'
 /// // does not because of the '[A-Z]+', and thus is marked inexact.
@@ -237,9 +235,9 @@ impl Extractor {
     /// for character classes being turned into literal sequences.
     ///
     /// ```
-    /// use regex_syntax::{hir::literal::{Extractor, Seq}, Parser};
+    /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse};
     ///
-    /// let hir = Parser::new().parse(r"[0-9]")?;
+    /// let hir = parse(r"[0-9]")?;
     ///
     /// let got = Extractor::new().extract(&hir);
     /// let expected = Seq::new([
@@ -274,9 +272,9 @@ impl Extractor {
     /// This shows how to decrease the limit and compares it with the default.
     ///
     /// ```
-    /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser};
+    /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
     ///
-    /// let hir = Parser::new().parse(r"(abc){8}")?;
+    /// let hir = parse(r"(abc){8}")?;
     ///
     /// let got = Extractor::new().extract(&hir);
     /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]);
@@ -311,9 +309,9 @@ impl Extractor {
     /// This shows how to decrease the limit and compares it with the default.
     ///
     /// ```
-    /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser};
+    /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
     ///
-    /// let hir = Parser::new().parse(r"(abc){2}{2}{2}")?;
+    /// let hir = parse(r"(abc){2}{2}{2}")?;
     ///
     /// let got = Extractor::new().extract(&hir);
     /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]);
@@ -353,9 +351,9 @@ impl Extractor {
     /// sequence returned.
     ///
     /// ```
-    /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, Parser};
+    /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
     ///
-    /// let hir = Parser::new().parse(r"[ab]{2}{2}")?;
+    /// let hir = parse(r"[ab]{2}{2}")?;
     ///
     /// let got = Extractor::new().extract(&hir);
     /// let expected = Seq::new([
diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index ae361f48a..4102bfec5 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -1,5 +1,20 @@
 /*!
-Defines a high-level intermediate representation for regular expressions.
+Defines a high-level intermediate (HIR) representation for regular expressions.
+
+The HIR is represented by the [`Hir`] type, and it is principally constructed via
+[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
+may use the smart constructors defined on `Hir` to build their own by hand. The
+smart constructors simultaneously simplify and "optimize" the HIR, and are also
+the same routines used by translation.
+
+Most regex engines only have an HIR like this, and usually construct it
+directly from the concrete syntax. This crate however first parses the
+concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
+as mentioned above. It's done this way to facilitate better error reporting,
+and to have a structured representation of a regex that faithfully represents
+its concrete syntax. Namely, while an `Hir` value can be converted back to an
+equivalent regex pattern string, it is unlikely to look like the original due
+to its simplified structure.
 */
 
 use core::{char, cmp};
@@ -122,6 +137,12 @@ impl core::fmt::Display for ErrorKind {
 
 /// A high-level intermediate representation (HIR) for a regular expression.
 ///
+/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
+/// An `HirKind` indicates what kind of regular expression it is (a literal,
+/// a repetition, a look-around assertion, etc.), whereas a `Properties`
+/// describes various facts about the regular expression. For example, whether
+/// it matches UTF-8 or if it matches the empty string.
+///
 /// The HIR of a regular expression represents an intermediate step between
 /// its abstract syntax (a structured description of the concrete syntax) and
 /// an actual regex matcher. The purpose of HIR is to make regular expressions
@@ -133,24 +154,39 @@ impl core::fmt::Display for ErrorKind {
 /// it is handled automatically by the translator (e.g., by translating
 /// `(?i:A)` to `[aA]`).
 ///
-/// If the HIR was produced by a translator that disallows invalid UTF-8, then
-/// the HIR is guaranteed to match UTF-8 exclusively.
-///
-/// This type defines its own destructor that uses constant stack space and
-/// heap space proportional to the size of the HIR.
-///
 /// The specific type of an HIR expression can be accessed via its `kind`
 /// or `into_kind` methods. This extra level of indirection exists for two
 /// reasons:
 ///
-/// 1. Construction of an HIR expression *must* use the constructor methods
-///    on this `Hir` type instead of building the `HirKind` values directly.
-///    This permits construction to enforce invariants like "concatenations
-///    always consist of two or more sub-expressions."
+/// 1. Construction of an HIR expression *must* use the constructor methods on
+/// this `Hir` type instead of building the `HirKind` values directly. This
+/// permits construction to enforce invariants like "concatenations always
+/// consist of two or more sub-expressions."
 /// 2. Every HIR expression contains attributes that are defined inductively,
-///    and can be computed cheaply during the construction process. For
-///    example, one such attribute is whether the expression must match at the
-///    beginning of the haystack.
+/// and can be computed cheaply during the construction process. For example,
+/// one such attribute is whether the expression must match at the beginning of
+/// the haystack.
+///
+/// In particular, if you have an `HirKind` value, then there is intentionally
+/// no way to build an `Hir` value from it. You instead need to do case
+/// analysis on the `HirKind` value and build the `Hir` value using its smart
+/// constructors.
+///
+/// # UTF-8
+///
+/// If the HIR was produced by a translator with
+/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
+/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
+/// matches.
+///
+/// For empty matches, those can occur at any position. It is the
+/// responsibility of the regex engine to determine whether empty matches are
+/// permitted between the code units of a single codepoint.
+///
+/// # Stack space
+///
+/// This type defines its own destructor that uses constant stack space and
+/// heap space proportional to the size of the HIR.
 ///
 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
 /// expression pattern string, and uses constant stack space and heap space
@@ -169,38 +205,6 @@ pub struct Hir {
     props: Properties,
 }
 
-/// The kind of an arbitrary `Hir` expression.
-#[derive(Clone, Debug, Eq, PartialEq)]
-pub enum HirKind {
-    /// The empty regular expression, which matches everything, including the
-    /// empty string.
-    Empty,
-    /// A literal string that matches exactly these bytes.
-    Literal(Literal),
-    /// A single character class that matches any of the characters in the
-    /// class. A class can either consist of Unicode scalar values as
-    /// characters, or it can use bytes.
-    Class(Class),
-    /// A look-around assertion. A look-around match always has zero length.
-    Look(Look),
-    /// A repetition operation applied to a child expression.
-    Repetition(Repetition),
-    /// A capturing group, which contains a child expression.
-    Capture(Capture),
-    /// A concatenation of expressions. A concatenation always has at least two
-    /// child expressions.
-    ///
-    /// A concatenation matches only if each of its child expression matches
-    /// one after the other.
-    Concat(Vec<Hir>),
-    /// An alternation of expressions. An alternation always has at least two
-    /// child expressions.
-    ///
-    /// An alternation matches only if at least one of its child expression
-    /// matches. If multiple expressions match, then the leftmost is preferred.
-    Alternation(Vec<Hir>),
-}
-
 /// Methods for accessing the underlying `HirKind` and `Properties`.
 impl Hir {
     /// Returns a reference to the underlying HIR kind.
@@ -233,7 +237,7 @@ impl Hir {
 
 /// Smart constructors for HIR values.
 ///
-/// These constructors are called "smart" because they inductive work or
+/// These constructors are called "smart" because they do inductive work or
 /// simplifications. For example, calling `Hir::repetition` with a repetition
 /// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
 /// since it is equivalent to an empty regex. Another example is calling
@@ -252,8 +256,9 @@ impl Hir {
         Hir { kind: HirKind::Empty, props }
     }
 
-    /// Returns an HIR expression that can never match anything. That is, the
-    /// set of strings in the language described by the HIR returned is `0`.
+    /// Returns an HIR expression that can never match anything. That is,
+    /// the size of the set of strings in the language described by the HIR
+    /// returned is `0`.
     ///
     /// This is distinct from [`Hir::empty`] in that the empty string matches
     /// the HIR returned by `Hir::empty`. That is, the set of strings in the
@@ -278,9 +283,41 @@ impl Hir {
 
     /// Creates a literal HIR expression.
     ///
-    /// If the given literal has a `Byte` variant with an ASCII byte, then this
-    /// method panics. This enforces the invariant that `Byte` variants are
-    /// only used to express matching of invalid UTF-8.
+    /// This accepts anything that can be converted into a `Box<[u8]>`.
+    ///
+    /// Note that there is no mechanism for storing a `char` or a `Box<str>`
+    /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
+    /// any HIR node) matches valid UTF-8 exclusively can be queried via
+    /// [`Properties::is_utf8`].
+    ///
+    /// # Example
+    ///
+    /// This example shows that concatenations of `Literal` HIR values will
+    /// automatically get flattened and combined together. So for example, even
+    /// if you concat multiple `Literal` values that are themselves not valid
+    /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
+    /// how "smart" Hir's smart constructors are.
+    ///
+    /// ```
+    /// use regex_syntax::hir::{Hir, HirKind, Literal};
+    ///
+    /// let literals = vec![
+    ///     Hir::literal([0xE2]),
+    ///     Hir::literal([0x98]),
+    ///     Hir::literal([0x83]),
+    /// ];
+    /// // Each literal, on its own, is invalid UTF-8.
+    /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
+    ///
+    /// let concat = Hir::concat(literals);
+    /// // But the concatenation is valid UTF-8!
+    /// assert!(concat.properties().is_utf8());
+    ///
+    /// // And also notice that the literals have been concatenated into a
+    /// // single `Literal`, to the point where there is no explicit `Concat`!
+    /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
+    /// assert_eq!(&expected, concat.kind());
+    /// ```
     #[inline]
     pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
         let bytes = lit.into();
@@ -293,7 +330,11 @@ impl Hir {
         Hir { kind: HirKind::Literal(lit), props }
     }
 
-    /// Creates a class HIR expression.
+    /// Creates a class HIR expression. The class may either be defined over
+    /// ranges of Unicode codepoints or ranges of raw byte values.
+    ///
+    /// Note that an empty class is permitted. An empty class is equivalent to
+    /// `Hir::fail()`.
     #[inline]
     pub fn class(class: Class) -> Hir {
         if class.is_empty() {
@@ -330,6 +371,12 @@ impl Hir {
     }
 
     /// Creates a capture HIR expression.
+    ///
+    /// Note that there is no explicit HIR value for a non-capturing group.
+    /// Since a non-capturing group only exists to override precedence in the
+    /// concrete syntax and since an HIR already does its own grouping based on
+    /// what is parsed, there is no need to explicitly represent non-capturing
+    /// groups in the HIR.
     #[inline]
     pub fn capture(capture: Capture) -> Hir {
         let props = Properties::capture(&capture);
@@ -338,8 +385,32 @@ impl Hir {
 
     /// Returns the concatenation of the given expressions.
     ///
-    /// This flattens the concatenation as appropriate.
-    pub fn concat(hirs: Vec<Hir>) -> Hir {
+    /// This attempts to flatten and simplify the concatenation as appropriate.
+    ///
+    /// # Example
+    ///
+    /// This shows a simple example of basic flattening of both concatenations
+    /// and literals.
+    ///
+    /// ```
+    /// use regex_syntax::hir::Hir;
+    ///
+    /// let hir = Hir::concat(vec![
+    ///     Hir::concat(vec![
+    ///         Hir::literal([b'a']),
+    ///         Hir::literal([b'b']),
+    ///         Hir::literal([b'c']),
+    ///     ]),
+    ///     Hir::concat(vec![
+    ///         Hir::literal([b'x']),
+    ///         Hir::literal([b'y']),
+    ///         Hir::literal([b'z']),
+    ///     ]),
+    /// ]);
+    /// let expected = Hir::literal("abcxyz".as_bytes());
+    /// assert_eq!(expected, hir);
+    /// ```
+    pub fn concat(subs: Vec<Hir>) -> Hir {
         // We rebuild the concatenation by simplifying it. Would be nice to do
         // it in place, but that seems a little tricky?
         let mut new = vec![];
@@ -348,8 +419,8 @@ impl Hir {
         // to 'prior_lit', and whenever we see anything else, we first take
         // any bytes in 'prior_lit' and add it to the 'new' concatenation.
         let mut prior_lit: Option<Vec<u8>> = None;
-        for hir in hirs {
-            let (kind, props) = hir.into_parts();
+        for sub in subs {
+            let (kind, props) = sub.into_parts();
             match kind {
                 HirKind::Literal(Literal(bytes)) => {
                     if let Some(ref mut prior_bytes) = prior_lit {
@@ -362,9 +433,9 @@ impl Hir {
                 // concat. We only need to do this one level deep since
                 // Hir::concat is the only way to build concatenations, and so
                 // flattening happens inductively.
-                HirKind::Concat(hirs2) => {
-                    for hir2 in hirs2 {
-                        let (kind2, props2) = hir2.into_parts();
+                HirKind::Concat(subs2) => {
+                    for sub2 in subs2 {
+                        let (kind2, props2) = sub2.into_parts();
                         match kind2 {
                             HirKind::Literal(Literal(bytes)) => {
                                 if let Some(ref mut prior_bytes) = prior_lit {
@@ -406,17 +477,82 @@ impl Hir {
 
     /// Returns the alternation of the given expressions.
     ///
-    /// This flattens the alternation as appropriate.
-    pub fn alternation(hirs: Vec<Hir>) -> Hir {
+    /// This flattens and simplifies the alternation as appropriate. This may
+    /// include factoring out common prefixes or even rewriting the alternation
+    /// as a character class.
+    ///
+    /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
+    /// is not possible for one to write an empty alternation, or even an
+    /// alternation with a single sub-expression, in the concrete syntax of a
+    /// regex.)
+    ///
+    /// # Example
+    ///
+    /// This is a simple example showing how an alternation might get
+    /// simplified.
+    ///
+    /// ```
+    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
+    ///
+    /// let hir = Hir::alternation(vec![
+    ///     Hir::literal([b'a']),
+    ///     Hir::literal([b'b']),
+    ///     Hir::literal([b'c']),
+    ///     Hir::literal([b'd']),
+    ///     Hir::literal([b'e']),
+    ///     Hir::literal([b'f']),
+    /// ]);
+    /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
+    ///     ClassUnicodeRange::new('a', 'f'),
+    /// ])));
+    /// assert_eq!(expected, hir);
+    /// ```
+    ///
+    /// And another example showing how common prefixes might get factored
+    /// out.
+    ///
+    /// ```
+    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
+    ///
+    /// let hir = Hir::alternation(vec![
+    ///     Hir::concat(vec![
+    ///         Hir::literal("abc".as_bytes()),
+    ///         Hir::class(Class::Unicode(ClassUnicode::new([
+    ///             ClassUnicodeRange::new('A', 'Z'),
+    ///         ]))),
+    ///     ]),
+    ///     Hir::concat(vec![
+    ///         Hir::literal("abc".as_bytes()),
+    ///         Hir::class(Class::Unicode(ClassUnicode::new([
+    ///             ClassUnicodeRange::new('a', 'z'),
+    ///         ]))),
+    ///     ]),
+    /// ]);
+    /// let expected = Hir::concat(vec![
+    ///     Hir::literal("abc".as_bytes()),
+    ///     Hir::alternation(vec![
+    ///         Hir::class(Class::Unicode(ClassUnicode::new([
+    ///             ClassUnicodeRange::new('A', 'Z'),
+    ///         ]))),
+    ///         Hir::class(Class::Unicode(ClassUnicode::new([
+    ///             ClassUnicodeRange::new('a', 'z'),
+    ///         ]))),
+    ///     ]),
+    /// ]);
+    /// assert_eq!(expected, hir);
+    /// ```
+    ///
+    /// Note that these sorts of simplifications are not guaranteed.
+    pub fn alternation(subs: Vec<Hir>) -> Hir {
         // We rebuild the alternation by simplifying it. We proceed similarly
         // as the concatenation case. But in this case, there's no literal
         // simplification happening. We're just flattening alternations.
         let mut new = vec![];
-        for hir in hirs {
-            let (kind, props) = hir.into_parts();
+        for sub in subs {
+            let (kind, props) = sub.into_parts();
             match kind {
-                HirKind::Alternation(hirs2) => {
-                    new.extend(hirs2);
+                HirKind::Alternation(subs2) => {
+                    new.extend(subs2);
                 }
                 kind => {
                     new.push(Hir { kind, props });
@@ -478,9 +614,21 @@ impl Hir {
     /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
     /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
     ///
+    /// # Example
+    ///
     /// Note that this is a convenience routine for constructing the correct
     /// character class based on the value of `Dot`. There is no explicit "dot"
     /// HIR value. It is just an abbreviation for a common character class.
+    ///
+    /// ```
+    /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
+    ///
+    /// let hir = Hir::dot(Dot::AnyByte);
+    /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
+    ///     ClassBytesRange::new(0x00, 0xFF),
+    /// ])));
+    /// assert_eq!(expected, hir);
+    /// ```
     #[inline]
     pub fn dot(dot: Dot) -> Hir {
         match dot {
@@ -524,31 +672,65 @@ impl Hir {
     }
 }
 
-impl HirKind {
-    /// Return true if and only if this HIR is the empty regular expression.
+/// The underlying kind of an arbitrary [`Hir`] expression.
+///
+/// An `HirKind` is principally useful for doing case analysis on the type
+/// of a regular expression. If you're looking to build new `Hir` values,
+/// then you _must_ use the smart constructors defined on `Hir`, like
+/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
+/// not expose any way of building an `Hir` directly from an `HirKind`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum HirKind {
+    /// The empty regular expression, which matches everything, including the
+    /// empty string.
+    Empty,
+    /// A literal string that matches exactly these bytes.
+    Literal(Literal),
+    /// A single character class that matches any of the characters in the
+    /// class. A class can either consist of Unicode scalar values as
+    /// characters, or it can use bytes.
     ///
-    /// Note that this is not defined inductively. That is, it only tests if
-    /// this kind is the `Empty` variant. To get the inductive definition, use
-    /// the `is_match_empty` method on [`Hir`].
-    pub fn is_empty(&self) -> bool {
-        match *self {
-            HirKind::Empty => true,
-            _ => false,
-        }
-    }
+    /// A class may be empty, in which case it matches nothing.
+    Class(Class),
+    /// A look-around assertion. A look-around match always has zero length.
+    Look(Look),
+    /// A repetition operation applied to a sub-expression.
+    Repetition(Repetition),
+    /// A capturing group, which contains a sub-expression.
+    Capture(Capture),
+    /// A concatenation of expressions.
+    ///
+    /// A concatenation matches only if each of its sub-expressions match one
+    /// after the other.
+    ///
+    /// Concatenations are guaranteed by `Hir`'s smart constructors to always
+    /// have at least two sub-expressions.
+    Concat(Vec<Hir>),
+    /// An alternation of expressions.
+    ///
+    /// An alternation matches only if at least one of its sub-expressions
+    /// match. If multiple sub-expressions match, then the leftmost is
+    /// preferred.
+    ///
+    /// Alternations are guaranteed by `Hir`'s smart constructors to always
+    /// have at least two sub-expressions.
+    Alternation(Vec<Hir>),
+}
+
+impl HirKind {
+    /// Returns a slice of this kind's sub-expressions, if any.
+    pub fn subs(&self) -> &[Hir] {
+        use core::slice::from_ref;
 
-    /// Returns true if and only if this kind has any (including possibly
-    /// empty) subexpressions.
-    pub fn has_subexprs(&self) -> bool {
         match *self {
             HirKind::Empty
             | HirKind::Literal(_)
             | HirKind::Class(_)
-            | HirKind::Look(_) => false,
-            HirKind::Capture(_)
-            | HirKind::Repetition(_)
-            | HirKind::Concat(_)
-            | HirKind::Alternation(_) => true,
+            | HirKind::Look(_) => &[],
+            HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
+            HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
+            HirKind::Concat(ref subs) => subs,
+            HirKind::Alternation(ref subs) => subs,
         }
     }
 }
@@ -573,10 +755,14 @@ impl core::fmt::Display for Hir {
 
 /// The high-level intermediate representation of a literal.
 ///
-/// A literal corresponds to a single character, where a character is either
-/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
-/// are preferred whenever possible. In particular, a `Byte` variant is only
-/// ever produced when it could match invalid UTF-8.
+/// A literal corresponds to `0` or more bytes that should be matched
+/// literally. The smart constructors defined on `Hir` will automatically
+/// concatenate adjacent literals into one literal, and will even automatically
+/// replace empty literals with `Hir::empty()`.
+///
+/// Note that despite a literal being represented by a sequence of bytes, its
+/// `Debug` implementation will attempt to print it as a normal string. (That
+/// is, not a sequence of decimal numbers.)
 #[derive(Clone, Eq, PartialEq)]
 pub struct Literal(pub Box<[u8]>);
 
@@ -681,7 +867,7 @@ impl Class {
     pub fn is_utf8(&self) -> bool {
         match *self {
             Class::Unicode(_) => true,
-            Class::Bytes(ref x) => x.is_all_ascii(),
+            Class::Bytes(ref x) => x.is_ascii(),
         }
     }
 
@@ -699,25 +885,25 @@ impl Class {
     /// minimum length, if any.
     ///
     /// ```
-    /// use regex_syntax::{hir::Properties, Parser};
+    /// use regex_syntax::{hir::Properties, parse};
     ///
     /// // The empty string has a min length of 0.
-    /// let hir = Parser::new().parse(r"")?;
+    /// let hir = parse(r"")?;
     /// assert_eq!(Some(0), hir.properties().minimum_len());
     /// // As do other types of regexes that only match the empty string.
-    /// let hir = Parser::new().parse(r"^$\b\B")?;
+    /// let hir = parse(r"^$\b\B")?;
     /// assert_eq!(Some(0), hir.properties().minimum_len());
     /// // A regex that can match the empty string but match more is still 0.
-    /// let hir = Parser::new().parse(r"a*")?;
+    /// let hir = parse(r"a*")?;
     /// assert_eq!(Some(0), hir.properties().minimum_len());
     /// // A regex that matches nothing has no minimum defined.
-    /// let hir = Parser::new().parse(r"[a&&b]")?;
+    /// let hir = parse(r"[a&&b]")?;
     /// assert_eq!(None, hir.properties().minimum_len());
     /// // Character classes usually have a minimum length of 1.
-    /// let hir = Parser::new().parse(r"\w")?;
+    /// let hir = parse(r"\w")?;
     /// assert_eq!(Some(1), hir.properties().minimum_len());
     /// // But sometimes Unicode classes might be bigger!
-    /// let hir = Parser::new().parse(r"\p{Cyrillic}")?;
+    /// let hir = parse(r"\p{Cyrillic}")?;
     /// assert_eq!(Some(2), hir.properties().minimum_len());
     ///
     /// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -743,28 +929,28 @@ impl Class {
     /// maximum length, if any.
     ///
     /// ```
-    /// use regex_syntax::{hir::Properties, Parser};
+    /// use regex_syntax::{hir::Properties, parse};
     ///
     /// // The empty string has a max length of 0.
-    /// let hir = Parser::new().parse(r"")?;
+    /// let hir = parse(r"")?;
     /// assert_eq!(Some(0), hir.properties().maximum_len());
     /// // As do other types of regexes that only match the empty string.
-    /// let hir = Parser::new().parse(r"^$\b\B")?;
+    /// let hir = parse(r"^$\b\B")?;
     /// assert_eq!(Some(0), hir.properties().maximum_len());
     /// // A regex that matches nothing has no maximum defined.
-    /// let hir = Parser::new().parse(r"[a&&b]")?;
+    /// let hir = parse(r"[a&&b]")?;
     /// assert_eq!(None, hir.properties().maximum_len());
     /// // Bounded repeats work as you expect.
-    /// let hir = Parser::new().parse(r"x{2,10}")?;
+    /// let hir = parse(r"x{2,10}")?;
     /// assert_eq!(Some(10), hir.properties().maximum_len());
     /// // An unbounded repeat means there is no maximum.
-    /// let hir = Parser::new().parse(r"x{2,}")?;
+    /// let hir = parse(r"x{2,}")?;
     /// assert_eq!(None, hir.properties().maximum_len());
     /// // With Unicode enabled, \w can match up to 4 bytes!
-    /// let hir = Parser::new().parse(r"\w")?;
+    /// let hir = parse(r"\w")?;
     /// assert_eq!(Some(4), hir.properties().maximum_len());
     /// // Without Unicode enabled, \w matches at most 1 byte.
-    /// let hir = Parser::new().parse(r"(?-u)\w")?;
+    /// let hir = parse(r"(?-u)\w")?;
     /// assert_eq!(Some(1), hir.properties().maximum_len());
     ///
     /// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -810,7 +996,8 @@ impl ClassUnicode {
     /// Create a new class from a sequence of ranges.
     ///
     /// The given ranges do not need to be in any specific order, and ranges
-    /// may overlap.
+    /// may overlap. Ranges will automatically be sorted into a canonical
+    /// non-overlapping order.
     pub fn new<I>(ranges: I) -> ClassUnicode
     where
         I: IntoIterator<Item = ClassUnicodeRange>,
@@ -819,6 +1006,9 @@ impl ClassUnicode {
     }
 
     /// Create a new class with no ranges.
+    ///
+    /// An empty class matches nothing. That is, it is equivalent to
+    /// [`Hir::fail`].
     pub fn empty() -> ClassUnicode {
         ClassUnicode::new(vec![])
     }
@@ -914,7 +1104,7 @@ impl ClassUnicode {
     /// Returns true if and only if this character class will either match
     /// nothing or only ASCII bytes. Stated differently, this returns false
     /// if and only if this class contains a non-ASCII codepoint.
-    pub fn is_all_ascii(&self) -> bool {
+    pub fn is_ascii(&self) -> bool {
         self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
     }
 
@@ -955,7 +1145,7 @@ impl ClassUnicode {
     /// If this class consists of only ASCII ranges, then return its
     /// corresponding and equivalent byte class.
     pub fn to_byte_class(&self) -> Option<ClassBytes> {
-        if !self.is_all_ascii() {
+        if !self.is_ascii() {
             return None;
         }
         Some(ClassBytes::new(self.ranges().iter().map(|r| {
@@ -1115,7 +1305,8 @@ impl ClassBytes {
     /// Create a new class from a sequence of ranges.
     ///
     /// The given ranges do not need to be in any specific order, and ranges
-    /// may overlap.
+    /// may overlap. Ranges will automatically be sorted into a canonical
+    /// non-overlapping order.
     pub fn new<I>(ranges: I) -> ClassBytes
     where
         I: IntoIterator<Item = ClassBytesRange>,
@@ -1124,6 +1315,9 @@ impl ClassBytes {
     }
 
     /// Create a new class with no ranges.
+    ///
+    /// An empty class matches nothing. That is, it is equivalent to
+    /// [`Hir::fail`].
     pub fn empty() -> ClassBytes {
         ClassBytes::new(vec![])
     }
@@ -1193,7 +1387,7 @@ impl ClassBytes {
     /// Returns true if and only if this character class will either match
     /// nothing or only ASCII bytes. Stated differently, this returns false
     /// if and only if this class contains a non-ASCII byte.
-    pub fn is_all_ascii(&self) -> bool {
+    pub fn is_ascii(&self) -> bool {
         self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
     }
 
@@ -1238,7 +1432,7 @@ impl ClassBytes {
     /// If this class consists of only ASCII ranges, then return its
     /// corresponding and equivalent Unicode class.
     pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
-        if !self.is_all_ascii() {
+        if !self.is_ascii() {
             return None;
         }
         Some(ClassUnicode::new(self.ranges().iter().map(|r| {
@@ -1369,82 +1563,111 @@ impl core::fmt::Debug for ClassBytesRange {
 pub enum Look {
     /// Match the beginning of text. Specifically, this matches at the starting
     /// position of the input.
-    Start,
+    Start = 1 << 0,
     /// Match the end of text. Specifically, this matches at the ending
     /// position of the input.
-    End,
+    End = 1 << 1,
     /// Match the beginning of a line or the beginning of text. Specifically,
     /// this matches at the starting position of the input, or at the position
     /// immediately following a `\n` character.
-    StartLF,
+    StartLF = 1 << 2,
     /// Match the end of a line or the end of text. Specifically, this matches
     /// at the end position of the input, or at the position immediately
     /// preceding a `\n` character.
-    EndLF,
+    EndLF = 1 << 3,
     /// Match the beginning of a line or the beginning of text. Specifically,
     /// this matches at the starting position of the input, or at the position
     /// immediately following either a `\r` or `\n` character, but never after
     /// a `\r` when a `\n` follows.
-    StartCRLF,
+    StartCRLF = 1 << 4,
     /// Match the end of a line or the end of text. Specifically, this matches
     /// at the end position of the input, or at the position immediately
     /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
     /// precedes it.
-    EndCRLF,
+    EndCRLF = 1 << 5,
     /// Match an ASCII-only word boundary. That is, this matches a position
     /// where the left adjacent character and right adjacent character
     /// correspond to a word and non-word or a non-word and word character.
-    WordAscii,
+    WordAscii = 1 << 6,
     /// Match an ASCII-only negation of a word boundary.
-    WordAsciiNegate,
+    WordAsciiNegate = 1 << 7,
     /// Match a Unicode-aware word boundary. That is, this matches a position
     /// where the left adjacent character and right adjacent character
     /// correspond to a word and non-word or a non-word and word character.
-    WordUnicode,
+    WordUnicode = 1 << 8,
     /// Match a Unicode-aware negation of a word boundary.
-    WordUnicodeNegate,
+    WordUnicodeNegate = 1 << 9,
 }
 
 impl Look {
-    fn from_repr(repr: u16) -> Option<Look> {
-        match repr {
-            0 => Some(Look::Start),
-            1 => Some(Look::End),
-            2 => Some(Look::StartLF),
-            3 => Some(Look::EndLF),
-            4 => Some(Look::StartCRLF),
-            5 => Some(Look::EndCRLF),
-            6 => Some(Look::WordAscii),
-            7 => Some(Look::WordAsciiNegate),
-            8 => Some(Look::WordUnicode),
-            9 => Some(Look::WordUnicodeNegate),
-            _ => None,
+    /// Flip the look-around assertion to its equivalent for reverse searches.
+    /// For example, `StartLF` gets translated to `EndLF`.
+    ///
+    /// Some assertions, such as `WordUnicode`, remain the same since they
+    /// match the same positions regardless of the direction of the search.
+    #[inline]
+    pub const fn reversed(self) -> Look {
+        match self {
+            Look::Start => Look::End,
+            Look::End => Look::Start,
+            Look::StartLF => Look::EndLF,
+            Look::EndLF => Look::StartLF,
+            Look::StartCRLF => Look::EndCRLF,
+            Look::EndCRLF => Look::StartCRLF,
+            Look::WordAscii => Look::WordAscii,
+            Look::WordAsciiNegate => Look::WordAsciiNegate,
+            Look::WordUnicode => Look::WordUnicode,
+            Look::WordUnicodeNegate => Look::WordUnicodeNegate,
         }
     }
 
-    fn as_repr(&self) -> u16 {
-        match *self {
-            Look::Start => 0,
-            Look::End => 1,
-            Look::StartLF => 2,
-            Look::EndLF => 3,
-            Look::StartCRLF => 4,
-            Look::EndCRLF => 5,
-            Look::WordAscii => 6,
-            Look::WordAsciiNegate => 7,
-            Look::WordUnicode => 8,
-            Look::WordUnicodeNegate => 9,
+    /// Return the underlying representation of this look-around enumeration
+    /// as an integer. Giving the return value to the [`Look::from_repr`]
+    /// constructor is guaranteed to return the same look-around variant that
+    /// one started with within a semver compatible release of this crate.
+    #[inline]
+    pub const fn as_repr(self) -> u16 {
+        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
+        // actual int.
+        self as u16
+    }
+
+    /// Given the underlying representation of a `Look` value, return the
+    /// corresponding `Look` value if the representation is valid. Otherwise
+    /// `None` is returned.
+    #[inline]
+    pub const fn from_repr(repr: u16) -> Option<Look> {
+        match repr {
+            0b00_0000_0001 => Some(Look::Start),
+            0b00_0000_0010 => Some(Look::End),
+            0b00_0000_0100 => Some(Look::StartLF),
+            0b00_0000_1000 => Some(Look::EndLF),
+            0b00_0001_0000 => Some(Look::StartCRLF),
+            0b00_0010_0000 => Some(Look::EndCRLF),
+            0b00_0100_0000 => Some(Look::WordAscii),
+            0b00_1000_0000 => Some(Look::WordAsciiNegate),
+            0b01_0000_0000 => Some(Look::WordUnicode),
+            0b10_0000_0000 => Some(Look::WordUnicodeNegate),
+            _ => None,
         }
     }
 
-    fn as_char(self) -> char {
+    /// Returns a convenient single codepoint representation of this
+    /// look-around assertion. Each assertion is guaranteed to be represented
+    /// by a distinct character.
+    ///
+    /// This is useful for representing a look-around assertion in
+    /// human-friendly but succinct output intended for a programmer working
+    /// on regex internals.
+    #[inline]
+    pub const fn as_char(self) -> char {
         match self {
             Look::Start => 'A',
             Look::End => 'z',
             Look::StartLF => '^',
             Look::EndLF => '$',
-            Look::StartCRLF => '^',
-            Look::EndCRLF => '$',
+            Look::StartCRLF => 'r',
+            Look::EndCRLF => 'R',
             Look::WordAscii => 'b',
             Look::WordAsciiNegate => 'B',
             Look::WordUnicode => '𝛃',
@@ -1482,6 +1705,9 @@ pub struct Repetition {
     ///
     /// Note that special cases like `?`, `+` and `*` all get translated into
     /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
+    ///
+    /// When `min` is zero, this expression can match the empty string
+    /// regardless of what its sub-expression is.
     pub min: u32,
     /// The maximum range of the repetition.
     ///
@@ -1502,23 +1728,6 @@ pub struct Repetition {
     pub sub: Box,
 }
 
-impl Repetition {
-    /// Returns true if and only if this repetition operator makes it possible
-    /// to match the empty string.
-    ///
-    /// Note that this is not defined inductively. For example, while `a*`
-    /// will report `true`, `()+` will not, even though `()` matches the empty
-    /// string and one or more occurrences of something that matches the
-    /// empty string will always match the empty string. In order to get the
-    /// inductive definition, see the corresponding method on [`Hir`].
-    ///
-    /// This returns true in precisely the cases that [`Repetition::min`]
-    /// is equal to `0`.
-    pub fn is_match_empty(&self) -> bool {
-        self.min == 0
-    }
-}
-
 /// A type describing the different flavors of `.`.
 ///
 /// This type is meant to be used with [`Hir::dot`], which is a convenience
@@ -1564,8 +1773,10 @@ impl Drop for Hir {
             | HirKind::Literal(_)
             | HirKind::Class(_)
             | HirKind::Look(_) => return,
-            HirKind::Capture(ref x) if !x.sub.kind.has_subexprs() => return,
-            HirKind::Repetition(ref x) if !x.sub.kind.has_subexprs() => return,
+            HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
+            HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
+                return
+            }
             HirKind::Concat(ref x) if x.is_empty() => return,
             HirKind::Alternation(ref x) if x.is_empty() => return,
             _ => {}
@@ -1601,7 +1812,8 @@ impl Drop for Hir {
 /// computed inductively on an HIR value. Properties are defined for all
 /// HIR values.
 ///
-/// All methods on a `Properties` value take constant time.
+/// All methods on a `Properties` value take constant time and are meant to
+/// be cheap to call.
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct Properties(Box<PropertiesI>);
 
@@ -1635,6 +1847,7 @@ impl Properties {
     /// `None` is returned when there is no minimum length. This occurs in
     /// precisely the cases where the HIR matches nothing. i.e., The language
     /// the regex matches is empty. An example of such a regex is `\P{any}`.
+    #[inline]
     pub fn minimum_len(&self) -> Option<usize> {
         self.0.minimum_len
     }
@@ -1649,12 +1862,14 @@ impl Properties {
     /// occurs when the HIR matches nothing or when there is no upper bound on
     /// the length of matching strings. Example of such regexes are `\P{any}`
     /// (matches nothing) and `a+` (has no upper bound).
+    #[inline]
     pub fn maximum_len(&self) -> Option<usize> {
         self.0.maximum_len
     }
 
     /// Returns a set of all look-around assertions that appear at least once
     /// in this HIR value.
+    #[inline]
     pub fn look_set(&self) -> LookSet {
         self.0.look_set
     }
@@ -1665,6 +1880,7 @@ impl Properties {
     ///
     /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
     /// if and only if the HIR is fully anchored at the start.
+    #[inline]
     pub fn look_set_prefix(&self) -> LookSet {
         self.0.look_set_prefix
     }
@@ -1676,6 +1892,7 @@ impl Properties {
     ///
     /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
     /// if and only if the HIR is fully anchored at the end.
+    #[inline]
     pub fn look_set_suffix(&self) -> LookSet {
         self.0.look_set_suffix
     }
@@ -1684,7 +1901,8 @@ impl Properties {
     /// valid UTF-8.
     ///
     /// When this returns false, then it is possible for this HIR expression to
-    /// match invalid UTF-8.
+    /// match invalid UTF-8, including by matching between the code units of
+    /// a single UTF-8 encoded codepoint.
     ///
     /// Note that this returns true even when the corresponding HIR can match
     /// the empty string. Since an empty string can technically appear between
@@ -1693,6 +1911,57 @@ impl Properties {
     /// However, it is generally assumed that such empty matches are handled
     /// specially by the search routine if it is absolutely required that
     /// matches not split a codepoint.
+    ///
+    /// # Example
+    ///
+    /// This code example shows the UTF-8 property of a variety of patterns.
+    ///
+    /// ```
+    /// use regex_syntax::{ParserBuilder, parse};
+    ///
+    /// // Examples of 'is_utf8() == true'.
+    /// assert!(parse(r"a")?.properties().is_utf8());
+    /// assert!(parse(r"[^a]")?.properties().is_utf8());
+    /// assert!(parse(r".")?.properties().is_utf8());
+    /// assert!(parse(r"\W")?.properties().is_utf8());
+    /// assert!(parse(r"\b")?.properties().is_utf8());
+    /// assert!(parse(r"\B")?.properties().is_utf8());
+    /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
+    /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
+    /// // Unicode mode is enabled by default, and in
+    /// // that mode, all \x hex escapes are treated as
+    /// // codepoints. So this actually matches the UTF-8
+    /// // encoding of U+00FF.
+    /// assert!(parse(r"\xFF")?.properties().is_utf8());
+    ///
+    /// // Now we show examples of 'is_utf8() == false'.
+    /// // The only way to do this is to force the parser
+    /// // to permit invalid UTF-8, otherwise all of these
+    /// // would fail to parse!
+    /// let parse = |pattern| {
+    ///     ParserBuilder::new().utf8(false).build().parse(pattern)
+    /// };
+    /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
+    /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
+    /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
+    /// // Conversely to the equivalent example above,
+    /// // when Unicode mode is disabled, \x hex escapes
+    /// // are treated as their raw byte values.
+    /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
+    /// // Note that just because we disabled UTF-8 in the
+    /// // parser doesn't mean we still can't use Unicode.
+    /// // It is enabled by default, so \xFF is still
+    /// // equivalent to matching the UTF-8 encoding of
+    /// // U+00FF by default.
+    /// assert!(parse(r"\xFF")?.properties().is_utf8());
+    /// // Even though we use raw bytes that individually
+    /// // are not valid UTF-8, when combined together, the
+    /// // overall expression *does* match valid UTF-8!
+    /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
     pub fn is_utf8(&self) -> bool {
         self.0.utf8
     }
@@ -1702,18 +1971,33 @@ impl Properties {
     ///
     /// Note that this does not include the implicit capturing group
     /// corresponding to the entire match that is typically included by regex
-    /// engines. So for example, this method will return `0` for `a` and `1`
-    /// for `(a)`.
+    /// engines.
+    ///
+    /// # Example
+    ///
+    /// This method will return `0` for `a` and `1` for `(a)`:
+    ///
+    /// ```
+    /// use regex_syntax::parse;
+    ///
+    /// assert_eq!(0, parse("a")?.properties().captures_len());
+    /// assert_eq!(1, parse("(a)")?.properties().captures_len());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
     pub fn captures_len(&self) -> usize {
         self.0.captures_len
     }
 
-    /// Return true if and only if this HIR is a simple literal. This is only
-    /// true when this HIR expression is either itself a `Literal` or a
+    /// Return true if and only if this HIR is a simple literal. This is
+    /// only true when this HIR expression is either itself a `Literal` or a
     /// concatenation of only `Literal`s.
     ///
-    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
-    /// `` are not (even though that contain sub-expressions that are literals).
+    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
+    /// the empty string are not (even though they contain sub-expressions that
+    /// are literals).
+    #[inline]
     pub fn is_literal(&self) -> bool {
         self.0.literal
     }
@@ -1726,6 +2010,7 @@ impl Properties {
     /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
     /// literals, but `f+`, `(foo)`, `foo()`, ``
     /// are not (even though that contain sub-expressions that are literals).
+    #[inline]
     pub fn is_alternation_literal(&self) -> bool {
         self.0.alternation_literal
     }
@@ -1746,17 +2031,17 @@ impl Properties {
     /// like the minimum and maximum lengths.
     ///
     /// ```
-    /// use regex_syntax::{hir::Properties, Parser};
+    /// use regex_syntax::{hir::Properties, parse};
     ///
-    /// let hir1 = Parser::new().parse("ab?c?")?;
+    /// let hir1 = parse("ab?c?")?;
     /// assert_eq!(Some(1), hir1.properties().minimum_len());
     /// assert_eq!(Some(3), hir1.properties().maximum_len());
     ///
-    /// let hir2 = Parser::new().parse(r"[a&&b]")?;
+    /// let hir2 = parse(r"[a&&b]")?;
     /// assert_eq!(None, hir2.properties().minimum_len());
     /// assert_eq!(None, hir2.properties().maximum_len());
     ///
-    /// let hir3 = Parser::new().parse(r"wxy?z?")?;
+    /// let hir3 = parse(r"wxy?z?")?;
     /// assert_eq!(Some(2), hir3.properties().minimum_len());
     /// assert_eq!(Some(4), hir3.properties().maximum_len());
     ///
@@ -1776,17 +2061,17 @@ impl Properties {
     /// unaffected:
     ///
     /// ```
-    /// use regex_syntax::{hir::Properties, Parser};
+    /// use regex_syntax::{hir::Properties, parse};
     ///
-    /// let hir1 = Parser::new().parse("ab?c?")?;
+    /// let hir1 = parse("ab?c?")?;
     /// assert_eq!(Some(1), hir1.properties().minimum_len());
     /// assert_eq!(Some(3), hir1.properties().maximum_len());
     ///
-    /// let hir2 = Parser::new().parse(r"a+")?;
+    /// let hir2 = parse(r"a+")?;
     /// assert_eq!(Some(1), hir2.properties().minimum_len());
     /// assert_eq!(None, hir2.properties().maximum_len());
     ///
-    /// let hir3 = Parser::new().parse(r"wxy?z?")?;
+    /// let hir3 = parse(r"wxy?z?")?;
     /// assert_eq!(Some(2), hir3.properties().minimum_len());
     /// assert_eq!(Some(4), hir3.properties().maximum_len());
     ///
@@ -1834,9 +2119,9 @@ impl Properties {
         // Handle properties that need to visit every child hir.
         for prop in it {
             let p = prop.borrow();
-            props.look_set.union(p.look_set());
-            props.look_set_prefix.intersect(p.look_set_prefix());
-            props.look_set_suffix.intersect(p.look_set_suffix());
+            props.look_set.set_union(p.look_set());
+            props.look_set_prefix.set_intersect(p.look_set_prefix());
+            props.look_set_suffix.set_intersect(p.look_set_suffix());
             props.utf8 = props.utf8 && p.is_utf8();
             props.captures_len =
                 props.captures_len.saturating_add(p.captures_len());
@@ -1935,29 +2220,26 @@ impl Properties {
 
     /// Create a new set of HIR properties for a look-around assertion.
     fn look(look: Look) -> Properties {
-        use self::Look::*;
-
-        let utf8 = match look {
-            Start | End | StartLF | EndLF | WordAscii | WordUnicode
-            | WordUnicodeNegate => true,
-            // FIXME: Negated ASCII word boundaries can match invalid UTF-8.
-            // But why is this 'false' when 'HirKind::Empty' is true? After
-            // all, isn't WordAsciiNegate just a subset of HirKind::Empty? It
-            // seems to me that if we handle HirKind::Empty correctly even when
-            // it splits a codepoint, then we should be able to automatically
-            // handle WordAsciiNegate correctly too...
-            //
-            // For now, this returns 'false' because that's what it did before.
-            // But we should revisit this before the next release.
-            WordAsciiNegate => false,
-        };
         let inner = PropertiesI {
             minimum_len: Some(0),
             maximum_len: Some(0),
             look_set: LookSet::singleton(look),
             look_set_prefix: LookSet::singleton(look),
             look_set_suffix: LookSet::singleton(look),
-            utf8,
+            // This requires a little explanation. Basically, we don't consider
+            // matching an empty string to be equivalent to matching invalid
+            // UTF-8, even though technically matching every empty string will
+            // split the UTF-8 encoding of a single codepoint when treating a
+            // UTF-8 encoded string as a sequence of bytes. Our defense here is
+            // that in such a case, a codepoint should logically be treated as
+            // the fundamental atom for matching, and thus the only valid match
+            // points are between codepoints and not bytes.
+            //
+            // More practically, this is true here because it's also true
+            // for 'Hir::empty()', otherwise something like 'a*' would be
+            // considered to match invalid UTF-8. That in turn makes this
+            // property borderline useless.
+            utf8: true,
             captures_len: 0,
             literal: false,
             alternation_literal: false,
@@ -1989,7 +2271,10 @@ impl Properties {
             literal: false,
             alternation_literal: false,
         };
-        if !rep.is_match_empty() {
+        // If the repetition operator can match the empty string, then its
+        // lookset prefix and suffixes themselves remain empty since they are
+        // no longer required to match.
+        if rep.min > 0 {
             inner.look_set_prefix = p.look_set_prefix();
             inner.look_set_suffix = p.look_set_suffix();
         }
@@ -2027,7 +2312,7 @@ impl Properties {
         // Handle properties that need to visit every child hir.
         for x in concat.iter() {
             let p = x.properties();
-            props.look_set.union(p.look_set());
+            props.look_set.set_union(p.look_set());
             props.utf8 = props.utf8 && p.is_utf8();
             props.captures_len =
                 props.captures_len.saturating_add(p.captures_len());
@@ -2051,7 +2336,7 @@ impl Properties {
         // child exprs until one matches more than the empty string.
         let mut it = concat.iter();
         while let Some(x) = it.next() {
-            props.look_set_prefix.union(x.properties().look_set_prefix());
+            props.look_set_prefix.set_union(x.properties().look_set_prefix());
             if x.properties().maximum_len().map_or(true, |x| x > 0) {
                 break;
             }
@@ -2059,7 +2344,7 @@ impl Properties {
         // Same thing for the suffix properties, but in reverse.
         let mut it = concat.iter().rev();
         while let Some(x) = it.next() {
-            props.look_set_suffix.union(x.properties().look_set_suffix());
+            props.look_set_suffix.set_union(x.properties().look_set_suffix());
             if x.properties().maximum_len().map_or(true, |x| x > 0) {
                 break;
             }
@@ -2079,7 +2364,16 @@ impl Properties {
 /// example, an [`Hir`] provides properties that return `LookSet`s.
 #[derive(Clone, Copy, Default, Eq, PartialEq)]
 pub struct LookSet {
-    bits: u16,
+    /// The underlying representation of this set is exposed to make it possible
+    /// to store it somewhere efficiently. The representation is that
+    /// of a bitset, where each assertion occupies bit `i` where `i =
+    /// Look::as_repr()`.
+    ///
+    /// Note that users of this internal representation must permit the full
+    /// range of `u16` values to be represented. For example, even if the
+    /// current implementation only makes use of the 10 least significant bits,
+    /// it may use more bits in a future semver compatible release.
+    pub bits: u16,
 }
 
 impl LookSet {
@@ -2103,14 +2397,12 @@ impl LookSet {
     /// one look-around assertions.
     #[inline]
     pub fn singleton(look: Look) -> LookSet {
-        let mut set = LookSet::empty();
-        set.insert(look);
-        set
+        LookSet::empty().insert(look)
     }
 
     /// Returns the total number of look-around assertions in this set.
     #[inline]
-    pub fn len(&self) -> usize {
+    pub fn len(self) -> usize {
         // OK because max value always fits in a u8, which in turn always
         // fits in a usize, regardless of target.
         usize::try_from(self.bits.count_ones()).unwrap()
@@ -2118,43 +2410,70 @@ impl LookSet {
 
     /// Returns true if and only if this set is empty.
     #[inline]
-    pub fn is_empty(&self) -> bool {
+    pub fn is_empty(self) -> bool {
         self.len() == 0
     }
 
-    /// Insert the given look-around assertions into this set. If the assertion
-    /// is already in the set, then this is a no-op.
+    /// Returns true if and only if the given look-around assertion is in this
+    /// set.
     #[inline]
-    pub fn insert(&mut self, look: Look) {
-        self.bits |= 1 << look.as_repr();
+    pub fn contains(self, look: Look) -> bool {
+        self.bits & look.as_repr() != 0
     }
 
-    /// Remove the given look-around assertion from this set. If it wasn't
-    /// previously in the set, then this is a no-op.
+    /// Returns true if and only if this set contains any anchor assertions.
+    /// This includes both "start/end of haystack" and "start/end of line."
     #[inline]
-    pub fn remove(&mut self, look: Look) {
-        self.bits &= !(1 << look.as_repr());
+    pub fn contains_anchor(&self) -> bool {
+        self.contains_anchor_haystack() || self.contains_anchor_line()
     }
 
-    /// Returns true if and only if the given look-around assertion is in this
-    /// set.
+    /// Returns true if and only if this set contains any "start/end of
+    /// haystack" anchors. This doesn't include "start/end of line" anchors.
+    #[inline]
+    pub fn contains_anchor_haystack(&self) -> bool {
+        self.contains(Look::Start) || self.contains(Look::End)
+    }
+
+    /// Returns true if and only if this set contains any "start/end of line"
+    /// anchors. This doesn't include "start/end of haystack" anchors. This
+    /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
+    #[inline]
+    pub fn contains_anchor_line(&self) -> bool {
+        self.contains(Look::StartLF)
+            || self.contains(Look::EndLF)
+            || self.contains(Look::StartCRLF)
+            || self.contains(Look::EndCRLF)
+    }
+
+    /// Returns true if and only if this set contains any "start/end of line"
+    /// anchors that only treat `\n` as line terminators. This does not include
+    /// haystack anchors or CRLF aware line anchors.
+    #[inline]
+    pub fn contains_anchor_lf(&self) -> bool {
+        self.contains(Look::StartLF) || self.contains(Look::EndLF)
+    }
+
+    /// Returns true if and only if this set contains any "start/end of line"
+    /// anchors that are CRLF-aware. This doesn't include "start/end of
+    /// haystack" or "start/end of line-feed" anchors.
     #[inline]
-    pub fn contains(&self, look: Look) -> bool {
-        self.bits & (1 << look.as_repr()) != 0
+    pub fn contains_anchor_crlf(&self) -> bool {
+        self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
     }
 
     /// Returns true if and only if this set contains any word boundary or
     /// negated word boundary assertions. This include both Unicode and ASCII
     /// word boundaries.
     #[inline]
-    pub fn contains_word(&self) -> bool {
+    pub fn contains_word(self) -> bool {
         self.contains_word_unicode() || self.contains_word_ascii()
     }
 
     /// Returns true if and only if this set contains any Unicode word boundary
     /// or negated Unicode word boundary assertions.
     #[inline]
-    pub fn contains_word_unicode(&self) -> bool {
+    pub fn contains_word_unicode(self) -> bool {
         self.contains(Look::WordUnicode)
             || self.contains(Look::WordUnicodeNegate)
     }
@@ -2162,26 +2481,109 @@ impl LookSet {
     /// Returns true if and only if this set contains any ASCII word boundary
     /// or negated ASCII word boundary assertions.
     #[inline]
-    pub fn contains_word_ascii(&self) -> bool {
+    pub fn contains_word_ascii(self) -> bool {
         self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
     }
 
-    /// Modifies this set to be the union of itself and the set given.
+    /// Returns an iterator over all of the look-around assertions in this set.
     #[inline]
-    pub fn union(&mut self, other: LookSet) {
-        self.bits |= other.bits;
+    pub fn iter(self) -> LookSetIter {
+        LookSetIter { set: self }
     }
 
-    /// Modifies this set to be the intersection of itself and the set given.
+    /// Return a new set that is equivalent to the original, but with the given
+    /// assertion added to it. If the assertion is already in the set, then the
+    /// returned set is equivalent to the original.
     #[inline]
-    pub fn intersect(&mut self, other: LookSet) {
-        self.bits &= other.bits;
+    pub fn insert(self, look: Look) -> LookSet {
+        LookSet { bits: self.bits | look.as_repr() }
     }
 
-    /// Returns an iterator over all of the look-around assertions in this set.
+    /// Updates this set in place with the result of inserting the given
+    /// assertion into this set.
     #[inline]
-    pub fn iter(self) -> LookSetIter {
-        LookSetIter { set: self }
+    pub fn set_insert(&mut self, look: Look) {
+        *self = self.insert(look);
+    }
+
+    /// Return a new set that is equivalent to the original, but with the given
+    /// assertion removed from it. If the assertion is not in the set, then the
+    /// returned set is equivalent to the original.
+    #[inline]
+    pub fn remove(self, look: Look) -> LookSet {
+        LookSet { bits: self.bits & !look.as_repr() }
+    }
+
+    /// Updates this set in place with the result of removing the given
+    /// assertion from this set.
+    #[inline]
+    pub fn set_remove(&mut self, look: Look) {
+        *self = self.remove(look);
+    }
+
+    /// Returns a new set that is the result of subtracting the given set from
+    /// this set.
+    #[inline]
+    pub fn subtract(self, other: LookSet) -> LookSet {
+        LookSet { bits: self.bits & !other.bits }
+    }
+
+    /// Updates this set in place with the result of subtracting the given set
+    /// from this set.
+    #[inline]
+    pub fn set_subtract(&mut self, other: LookSet) {
+        *self = self.subtract(other);
+    }
+
+    /// Returns a new set that is the union of this and the one given.
+    #[inline]
+    pub fn union(self, other: LookSet) -> LookSet {
+        LookSet { bits: self.bits | other.bits }
+    }
+
+    /// Updates this set in place with the result of unioning it with the one
+    /// given.
+    #[inline]
+    pub fn set_union(&mut self, other: LookSet) {
+        *self = self.union(other);
+    }
+
+    /// Returns a new set that is the intersection of this and the one given.
+    #[inline]
+    pub fn intersect(self, other: LookSet) -> LookSet {
+        LookSet { bits: self.bits & other.bits }
+    }
+
+    /// Updates this set in place with the result of intersecting it with the
+    /// one given.
+    #[inline]
+    pub fn set_intersect(&mut self, other: LookSet) {
+        *self = self.intersect(other);
+    }
+
+    /// Return a `LookSet` from the slice given as a native endian 16-bit
+    /// integer.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `slice.len() < 2`.
+    #[inline]
+    pub fn read_repr(slice: &[u8]) -> LookSet {
+        let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
+        LookSet { bits }
+    }
+
+    /// Write a `LookSet` as a native endian 16-bit integer to the beginning
+    /// of the slice given.
+    ///
+    /// # Panics
+    ///
+    /// This panics if `slice.len() < 2`.
+    #[inline]
+    pub fn write_repr(self, slice: &mut [u8]) {
+        let raw = self.bits.to_ne_bytes();
+        slice[0] = raw[0];
+        slice[1] = raw[1];
     }
 }
 
@@ -2210,11 +2612,14 @@ impl Iterator for LookSetIter {
 
     #[inline]
     fn next(&mut self) -> Option<Look> {
+        if self.set.is_empty() {
+            return None;
+        }
         // We'll never have more than u8::MAX distinct look-around assertions,
         // so 'repr' will always fit into a u16.
         let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
-        let look = Look::from_repr(repr)?;
-        self.set.remove(look);
+        let look = Look::from_repr(1 << repr)?;
+        self.set = self.set.remove(look);
         Some(look)
     }
 }
@@ -3113,7 +3518,7 @@ mod tests {
                     props: Properties::empty(),
                 };
             }
-            assert!(!expr.kind.is_empty());
+            assert!(!matches!(*expr.kind(), HirKind::Empty));
         };
 
         // We run our test on a thread with a small stack size so we can
@@ -3128,4 +3533,31 @@ mod tests {
             .join()
             .unwrap();
     }
+
+    #[test]
+    fn look_set_iter() {
+        let set = LookSet::empty();
+        assert_eq!(0, set.iter().count());
+
+        let set = LookSet::full();
+        assert_eq!(10, set.iter().count());
+
+        let set =
+            LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
+        assert_eq!(2, set.iter().count());
+
+        let set = LookSet::empty().insert(Look::StartLF);
+        assert_eq!(1, set.iter().count());
+
+        let set = LookSet::empty().insert(Look::WordAsciiNegate);
+        assert_eq!(1, set.iter().count());
+    }
+
+    #[test]
+    fn look_set_debug() {
+        let res = format!("{:?}", LookSet::empty());
+        assert_eq!("∅", res);
+        let res = format!("{:?}", LookSet::full());
+        assert_eq!("Az^$rRbB𝛃𝚩", res);
+    }
 }
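
As a usage sketch of the reworked look-around API in this hunk (assuming the
regex-syntax 0.7 names introduced above: `Look::as_repr`, `Look::from_repr`,
and the by-value `LookSet` methods with their in-place `set_*` variants; this
is illustrative only, not code from the patch):

```
use regex_syntax::hir::{Look, LookSet};

// Each look-around's repr is now a distinct bit, so it doubles as a bitset
// member and round-trips through from_repr.
let repr = Look::WordAscii.as_repr();
assert_eq!(0b00_0100_0000, repr);
assert_eq!(Some(Look::WordAscii), Look::from_repr(repr));
assert_eq!(None, Look::from_repr(0b11));

// LookSet is now used by value: insert/union return new sets, while the
// set_* variants mutate in place.
let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
assert!(set.contains_anchor_lf());
assert!(set.contains_word_unicode());

let mut prefix = LookSet::singleton(Look::Start);
prefix.set_union(set);
assert_eq!(3, prefix.iter().count());
```
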
diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs
index c1ebf85c2..81ae9b898 100644
--- a/regex-syntax/src/hir/translate.rs
+++ b/regex-syntax/src/hir/translate.rs
@@ -9,7 +9,7 @@ use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
 use crate::{
     ast::{self, Ast, Span, Visitor},
     either::Either,
-    hir::{self, Error, ErrorKind, Hir},
+    hir::{self, Error, ErrorKind, Hir, HirKind},
     unicode::{self, ClassQuery},
 };
 
@@ -425,7 +425,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
             Ast::Concat(_) => {
                 let mut exprs = vec![];
                 while let Some(expr) = self.pop_concat_expr() {
-                    if !expr.kind().is_empty() {
+                    if !matches!(*expr.kind(), HirKind::Empty) {
                         exprs.push(expr);
                     }
                 }
@@ -899,21 +899,11 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
             } else {
                 hir::Look::WordAscii
             }),
-            ast::AssertionKind::NotWordBoundary => {
-                Hir::look(if unicode {
-                    hir::Look::WordUnicodeNegate
-                } else {
-                    // It is possible for negated ASCII word boundaries to
-                    // match at invalid UTF-8 boundaries, even when searching
-                    // valid UTF-8.
-                    if self.trans().utf8 {
-                        return Err(
-                            self.error(asst.span, ErrorKind::InvalidUtf8)
-                        );
-                    }
-                    hir::Look::WordAsciiNegate
-                })
-            }
+            ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
+                hir::Look::WordUnicodeNegate
+            } else {
+                hir::Look::WordAsciiNegate
+            }),
         })
     }
 
@@ -1055,7 +1045,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         // Negating a Perl byte class is likely to cause it to match invalid
         // UTF-8. That's only OK if the translator is configured to allow such
         // things.
-        if self.trans().utf8 && !class.is_all_ascii() {
+        if self.trans().utf8 && !class.is_ascii() {
             return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
         }
         Ok(class)
@@ -1123,7 +1113,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
         if negated {
             class.negate();
         }
-        if self.trans().utf8 && !class.is_all_ascii() {
+        if self.trans().utf8 && !class.is_ascii() {
             return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
         }
         Ok(())
@@ -1796,18 +1786,7 @@ mod tests {
         assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode));
         assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate));
         assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii));
-        assert_eq!(t_bytes(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
-
-        assert_eq!(
-            t_err(r"(?-u)\B"),
-            TestError {
-                kind: hir::ErrorKind::InvalidUtf8,
-                span: Span::new(
-                    Position::new(5, 1, 6),
-                    Position::new(7, 1, 8)
-                ),
-            }
-        );
+        assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
     }
 
     #[test]
@@ -3199,21 +3178,21 @@ mod tests {
         assert!(props_bytes(r"\b").is_utf8());
         assert!(props_bytes(r"\B").is_utf8());
         assert!(props_bytes(r"(?-u)\b").is_utf8());
+        assert!(props_bytes(r"(?-u)\B").is_utf8());
 
         // Negative examples.
         assert!(!props_bytes(r"(?-u)\xFF").is_utf8());
         assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8());
         assert!(!props_bytes(r"(?-u)[^a]").is_utf8());
         assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8());
-        assert!(!props_bytes(r"(?-u)\B").is_utf8());
     }
 
     #[test]
     fn analysis_captures_len() {
         assert_eq!(0, props(r"a").captures_len());
         assert_eq!(0, props(r"(?:a)").captures_len());
-        assert_eq!(0, props(r"(?i:a)").captures_len());
-        assert_eq!(0, props(r"(?i)a").captures_len());
+        assert_eq!(0, props(r"(?i-u:a)").captures_len());
+        assert_eq!(0, props(r"(?i-u)a").captures_len());
         assert_eq!(1, props(r"(a)").captures_len());
         assert_eq!(1, props(r"(?Pa)").captures_len());
         assert_eq!(1, props(r"()").captures_len());
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
index c56f9d1ff..10540cab5 100644
--- a/regex-syntax/src/lib.rs
+++ b/regex-syntax/src/lib.rs
@@ -29,7 +29,8 @@ These two types come with conversion routines:
 
 As a convenience, the above two conversion routines are combined into one via
 the top-level [`Parser`] type. This `Parser` will first convert your pattern to
-an `Ast` and then convert the `Ast` to an `Hir`.
+an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as a top-level
+[`parse`] free function.
 
 
 # Example
@@ -37,14 +38,14 @@ an `Ast` and then convert the `Ast` to an `Hir`.
 This example shows how to parse a pattern string into its HIR:
 
 ```
-use regex_syntax::Parser;
-use regex_syntax::hir::Hir;
+use regex_syntax::{hir::Hir, parse};
 
-let hir = Parser::new().parse("a|b").unwrap();
+let hir = parse("a|b")?;
 assert_eq!(hir, Hir::alternation(vec![
     Hir::literal("a".as_bytes()),
     Hir::literal("b".as_bytes()),
 ]));
+# Ok::<(), Box<dyn std::error::Error>>(())
 ```
 
 
@@ -170,8 +171,8 @@ extern crate std;
 extern crate alloc;
 
 pub use crate::{
-    error::{Error, Result},
-    parser::{Parser, ParserBuilder},
+    error::Error,
+    parser::{parse, Parser, ParserBuilder},
     unicode::UnicodeWordError,
 };
 
diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
index 2851cda33..2e7a2bb80 100644
--- a/regex-syntax/src/parser.rs
+++ b/regex-syntax/src/parser.rs
@@ -1,4 +1,18 @@
-use crate::{ast, hir, Result};
+use crate::{ast, hir, Error};
+
+/// A convenience routine for parsing a regex using default options.
+///
+/// This is equivalent to `Parser::new().parse(pattern)`.
+///
+/// If you need to set non-default options, then use a [`ParserBuilder`].
+///
+/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically
+/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator
+/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then
+/// you should use an [`ast::parse::Parser`].
+pub fn parse(pattern: &str) -> Result<hir::Hir, Error> {
+    Parser::new().parse(pattern)
+}
 
 /// A builder for a regular expression parser.
 ///
@@ -207,7 +221,7 @@ impl Parser {
 
     /// Parse the regular expression into a high level intermediate
     /// representation.
-    pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> {
+    pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> {
         let ast = self.ast.parse(pattern)?;
         let hir = self.hir.translate(pattern, &ast)?;
         Ok(hir)
diff --git a/regex-syntax/test b/regex-syntax/test
index 50854d2b3..a4d6cfaba 100755
--- a/regex-syntax/test
+++ b/regex-syntax/test
@@ -18,7 +18,7 @@ features=(
     unicode-segment
 )
 for f in "${features[@]}"; do
-    echo "===== FEATURE: $f ==="
+    echo "=== FEATURE: $f ==="
     # We only run library tests because I couldn't figure out how to easily
     # make doc tests run in 'no_std' mode. In particular, without the Error
     # trait, using '?' in doc tests seems tricky.
diff --git a/src/compile.rs b/src/compile.rs
index 20eebf0ed..c29196a72 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -290,7 +290,7 @@ impl Compiler {
                 if self.compiled.uses_bytes() {
                     self.c_class_bytes(cls.ranges())
                 } else {
-                    assert!(cls.is_all_ascii());
+                    assert!(cls.is_ascii());
                     let mut char_ranges = vec![];
                     for r in cls.iter() {
                         let (s, e) = (r.start() as char, r.end() as char);
diff --git a/src/exec.rs b/src/exec.rs
index b183d24d1..e36e367ba 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -252,7 +252,19 @@ impl ExecBuilder {
             let expr =
                 parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
             let props = expr.properties();
-            bytes = bytes || !props.is_utf8();
+            // This used to just check whether the HIR matched valid UTF-8
+            // or not, but in regex-syntax 0.7, we changed our definition of
+            // "matches valid UTF-8" to exclude zero-width matches. And in
+            // particular, previously, we considered WordAsciiNegate (that
+            // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our
+            // matcher engines were built under this assumption and fixing
+            // them is not worth it with the imminent plan to switch over to
+            // regex-automata. So for now, we retain the previous behavior by
+            // just explicitly treating the presence of a negated ASCII word
+            // boundary as forcing us to use a byte oriented automaton.
+            bytes = bytes
+                || !props.is_utf8()
+                || props.look_set().contains(Look::WordAsciiNegate);
 
             if cfg!(feature = "perf-literal") {
                 if !props.look_set_prefix().contains(Look::Start)

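The net effect of the `(?-u)\B` change in this patch can be sketched with the
public regex-syntax API shown above (illustrative only, not part of the
patch):

```
use regex_syntax::{hir::Look, parse};

// Previously a parse error for UTF-8 mode patterns; it now translates to the
// WordAsciiNegate assertion and the HIR still reports is_utf8() == true.
let hir = parse(r"(?-u)\B").unwrap();
assert!(hir.properties().is_utf8());
assert!(hir.properties().look_set().contains(Look::WordAsciiNegate));
```
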
From fbdc4a9ba1a47b4574e277f16206397d8ddd5d3b Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Fri, 3 Mar 2023 09:32:22 -0500
Subject: [PATCH 58/79] syntax: permit most no-op escape sequences

This resolves a long-standing (but somewhat minor) complaint that folks
have with the regex crate: it does not permit escaping punctuation
characters in cases where those characters do not need to be escaped. So
things like \/, \" and \! would result in parse errors. Most other regex
engines permit these, even in cases where they aren't needed.

I had been against doing this for future evolution purposes, but it's
incredibly unlikely that we're ever going to add a new meta character to
the syntax. I literally cannot think of any conceivable future in which
that might happen.

However, we do continue to ban escapes for [0-9A-Za-z<>], because it is
conceivable that we might add new escape sequences for those characters.
(And 0-9 are already banned by virtue of them looking too much like
backreferences, which aren't supported.) For example, we could add
\Q...\E literal syntax. Or \< and \> as start and end word boundaries,
as found in POSIX regex engines.

Fixes #501
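
For a concrete sense of the new rules, here is a small sketch using the
top-level `parse` routine added earlier in this series (illustrative only,
not part of the patch):

```
use regex_syntax::parse;

// Superfluous escapes of punctuation are now accepted...
assert!(parse(r"\/").is_ok());
assert!(parse(r"\!").is_ok());
assert!(parse(r"\%").is_ok());
// ...but escaping ASCII alpha-numerics, '<' and '>' remains an error, so the
// syntax can still grow constructs like \Q...\E or \< and \> later.
assert!(parse(r"\e").is_err());
assert!(parse(r"\<").is_err());
assert!(parse(r"\>").is_err());
```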
---
 regex-syntax/src/ast/mod.rs   |   9 ++-
 regex-syntax/src/ast/parse.rs | 133 +++++++++++++++++++++-------------
 regex-syntax/src/ast/print.rs |   2 +-
 regex-syntax/src/lib.rs       | 102 ++++++++++++++++++++++++--
 4 files changed, 185 insertions(+), 61 deletions(-)

diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs
index 9be867c56..faabca2c1 100644
--- a/regex-syntax/src/ast/mod.rs
+++ b/regex-syntax/src/ast/mod.rs
@@ -588,9 +588,12 @@ impl Literal {
 pub enum LiteralKind {
     /// The literal is written verbatim, e.g., `a` or `☃`.
     Verbatim,
-    /// The literal is written as an escape because it is punctuation, e.g.,
-    /// `\*` or `\[`.
-    Punctuation,
+    /// The literal is written as an escape because it is otherwise a special
+    /// regex meta character, e.g., `\*` or `\[`.
+    Meta,
+    /// The literal is written as an escape despite the fact that the escape is
+    /// unnecessary, e.g., `\%` or `\/`.
+    Superfluous,
     /// The literal is written as an octal escape, e.g., `\141`.
     Octal,
     /// The literal is written as a hex code with a fixed number of digits
diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index 93452cb18..901250f61 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -18,7 +18,7 @@ use alloc::{
 use crate::{
     ast::{self, Ast, Position, Span},
     either::Either,
-    is_meta_character,
+    is_escapeable_character, is_meta_character,
 };
 
 type Result<T> = core::result::Result<T, ast::Error>;
@@ -1495,7 +1495,14 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
         if is_meta_character(c) {
             return Ok(Primitive::Literal(ast::Literal {
                 span,
-                kind: ast::LiteralKind::Punctuation,
+                kind: ast::LiteralKind::Meta,
+                c,
+            }));
+        }
+        if is_escapeable_character(c) {
+            return Ok(Primitive::Literal(ast::Literal {
+                span,
+                kind: ast::LiteralKind::Superfluous,
                 c,
             }));
         }
@@ -1513,9 +1520,6 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
             'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
             'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
             'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
-            ' ' if self.ignore_whitespace() => {
-                special(ast::SpecialLiteralKind::Space, ' ')
-            }
             'A' => Ok(Primitive::Assertion(ast::Assertion {
                 span,
                 kind: ast::AssertionKind::StartText,
@@ -2420,13 +2424,9 @@ mod tests {
         lit_with(c, span(start..start + c.len_utf8()))
     }
 
-    /// Create a punctuation literal starting at the given position.
-    fn punct_lit(c: char, span: Span) -> Ast {
-        Ast::Literal(ast::Literal {
-            span,
-            kind: ast::LiteralKind::Punctuation,
-            c,
-        })
+    /// Create a meta literal starting at the given position.
+    fn meta_lit(c: char, span: Span) -> Ast {
+        Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
     }
 
     /// Create a verbatim literal with the given span.
@@ -2710,24 +2710,24 @@ bar
             Ok(concat(
                 0..36,
                 vec![
-                    punct_lit('\\', span(0..2)),
-                    punct_lit('.', span(2..4)),
-                    punct_lit('+', span(4..6)),
-                    punct_lit('*', span(6..8)),
-                    punct_lit('?', span(8..10)),
-                    punct_lit('(', span(10..12)),
-                    punct_lit(')', span(12..14)),
-                    punct_lit('|', span(14..16)),
-                    punct_lit('[', span(16..18)),
-                    punct_lit(']', span(18..20)),
-                    punct_lit('{', span(20..22)),
-                    punct_lit('}', span(22..24)),
-                    punct_lit('^', span(24..26)),
-                    punct_lit('$', span(26..28)),
-                    punct_lit('#', span(28..30)),
-                    punct_lit('&', span(30..32)),
-                    punct_lit('-', span(32..34)),
-                    punct_lit('~', span(34..36)),
+                    meta_lit('\\', span(0..2)),
+                    meta_lit('.', span(2..4)),
+                    meta_lit('+', span(4..6)),
+                    meta_lit('*', span(6..8)),
+                    meta_lit('?', span(8..10)),
+                    meta_lit('(', span(10..12)),
+                    meta_lit(')', span(12..14)),
+                    meta_lit('|', span(14..16)),
+                    meta_lit('[', span(16..18)),
+                    meta_lit(']', span(18..20)),
+                    meta_lit('{', span(20..22)),
+                    meta_lit('}', span(22..24)),
+                    meta_lit('^', span(24..26)),
+                    meta_lit('$', span(26..28)),
+                    meta_lit('#', span(28..30)),
+                    meta_lit('&', span(30..32)),
+                    meta_lit('-', span(32..34)),
+                    meta_lit('~', span(34..36)),
                 ]
             ))
         );
@@ -2879,23 +2879,12 @@ bar
                     flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                     Ast::Literal(ast::Literal {
                         span: span_range(pat, 4..6),
-                        kind: ast::LiteralKind::Special(
-                            ast::SpecialLiteralKind::Space
-                        ),
+                        kind: ast::LiteralKind::Superfluous,
                         c: ' ',
                     }),
                 ]
             ))
         );
-        // ... but only when `x` mode is enabled.
-        let pat = r"\ ";
-        assert_eq!(
-            parser(pat).parse().unwrap_err(),
-            TestError {
-                span: span_range(pat, 0..2),
-                kind: ast::ErrorKind::EscapeUnrecognized,
-            }
-        );
     }
 
     #[test]
@@ -4246,7 +4235,7 @@ bar
             parser(r"\|").parse_primitive(),
             Ok(Primitive::Literal(ast::Literal {
                 span: span(0..2),
-                kind: ast::LiteralKind::Punctuation,
+                kind: ast::LiteralKind::Meta,
                 c: '|',
             }))
         );
@@ -4297,11 +4286,26 @@ bar
             }))
         );
 
+        // We also support superfluous escapes in most cases now too.
+        for c in ['!', '@', '%', '"', '\'', '/', ' '] {
+            let pat = format!(r"\{}", c);
+            assert_eq!(
+                parser(&pat).parse_primitive(),
+                Ok(Primitive::Literal(ast::Literal {
+                    span: span(0..2),
+                    kind: ast::LiteralKind::Superfluous,
+                    c,
+                }))
+            );
+        }
+
+        // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This
+        // gives flexibility for future evolution.
         assert_eq!(
-            parser(r"\").parse_escape().unwrap_err(),
+            parser(r"\e").parse_escape().unwrap_err(),
             TestError {
-                span: span(0..1),
-                kind: ast::ErrorKind::EscapeUnexpectedEof,
+                span: span(0..2),
+                kind: ast::ErrorKind::EscapeUnrecognized,
             }
         );
         assert_eq!(
@@ -4311,6 +4315,31 @@ bar
                 kind: ast::ErrorKind::EscapeUnrecognized,
             }
         );
+        // But also, < and > are banned, so that we may evolve them into
+        // start/end word boundary assertions. (Not sure if we will...)
+        assert_eq!(
+            parser(r"\<").parse_escape().unwrap_err(),
+            TestError {
+                span: span(0..2),
+                kind: ast::ErrorKind::EscapeUnrecognized,
+            }
+        );
+        assert_eq!(
+            parser(r"\>").parse_escape().unwrap_err(),
+            TestError {
+                span: span(0..2),
+                kind: ast::ErrorKind::EscapeUnrecognized,
+            }
+        );
+
+        // An unfinished escape is illegal.
+        assert_eq!(
+            parser(r"\").parse_escape().unwrap_err(),
+            TestError {
+                span: span(0..1),
+                kind: ast::ErrorKind::EscapeUnexpectedEof,
+            }
+        );
     }
 
     #[test]
@@ -4907,7 +4936,7 @@ bar
                         lit(span(1..2), 'a'),
                         ast::ClassSetItem::Literal(ast::Literal {
                             span: span(2..4),
-                            kind: ast::LiteralKind::Punctuation,
+                            kind: ast::LiteralKind::Meta,
                             c: ']',
                         }),
                     ]
@@ -4925,7 +4954,7 @@ bar
                         lit(span(1..2), 'a'),
                         ast::ClassSetItem::Literal(ast::Literal {
                             span: span(2..4),
-                            kind: ast::LiteralKind::Punctuation,
+                            kind: ast::LiteralKind::Meta,
                             c: '-',
                         }),
                         lit(span(4..5), 'z'),
@@ -5117,7 +5146,7 @@ bar
                     span(1..6),
                     itemset(ast::ClassSetItem::Literal(ast::Literal {
                         span: span(1..3),
-                        kind: ast::LiteralKind::Punctuation,
+                        kind: ast::LiteralKind::Meta,
                         c: '^',
                     })),
                     itemset(lit(span(5..6), '^')),
@@ -5133,7 +5162,7 @@ bar
                     span(1..6),
                     itemset(ast::ClassSetItem::Literal(ast::Literal {
                         span: span(1..3),
-                        kind: ast::LiteralKind::Punctuation,
+                        kind: ast::LiteralKind::Meta,
                         c: '&',
                     })),
                     itemset(lit(span(5..6), '&')),
@@ -5198,7 +5227,7 @@ bar
                         lit(span(1..2), ']'),
                         ast::ClassSetItem::Literal(ast::Literal {
                             span: span(2..4),
-                            kind: ast::LiteralKind::Punctuation,
+                            kind: ast::LiteralKind::Meta,
                             c: '[',
                         }),
                     ]
@@ -5216,7 +5245,7 @@ bar
                         kind: itemset(ast::ClassSetItem::Literal(
                             ast::Literal {
                                 span: span(1..3),
-                                kind: ast::LiteralKind::Punctuation,
+                                kind: ast::LiteralKind::Meta,
                                 c: '[',
                             }
                         )),
diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs
index 40f967cfa..86a87e143 100644
--- a/regex-syntax/src/ast/print.rs
+++ b/regex-syntax/src/ast/print.rs
@@ -216,7 +216,7 @@ impl Writer {
 
         match ast.kind {
             Verbatim => self.wtr.write_char(ast.c),
-            Punctuation => write!(self.wtr, r"\{}", ast.c),
+            Meta | Superfluous => write!(self.wtr, r"\{}", ast.c),
             Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
             HexFixed(ast::HexLiteralKind::X) => {
                 write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs
index 10540cab5..4953641d7 100644
--- a/regex-syntax/src/lib.rs
+++ b/regex-syntax/src/lib.rs
@@ -215,13 +215,43 @@ pub fn escape_into(text: &str, buf: &mut String) {
 
 /// Returns true if the given character has significance in a regex.
 ///
-/// These are the only characters that are allowed to be escaped, with one
-/// exception: an ASCII space character may be escaped when extended mode (with
-/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns
-/// `false`.
+/// Generally speaking, these are the only characters which _must_ be escaped
+/// in order to match their literal meaning. For example, to match a literal
+/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
+/// example, `-` is treated as a meta character because of its significance
+/// for writing ranges inside of character classes, but the regex `-` will
+/// match a literal `-` because `-` has no special meaning outside of character
+/// classes.
+///
+/// In order to determine whether a character may be escaped at all, the
+/// [`is_escapeable_character`] routine should be used. The difference between
+/// `is_meta_character` and `is_escapeable_character` is that the latter will
+/// return true for some characters that are _not_ meta characters. For
+/// example, `%` and `\%` both match a literal `%` in all contexts. In other
+/// words, `is_escapeable_character` includes "superfluous" escapes.
 ///
 /// Note that the set of characters for which this function returns `true` or
-/// `false` is fixed and won't change in a semver compatible release.
+/// `false` is fixed and won't change in a semver compatible release. (In this
+/// case, "semver compatible release" actually refers to the `regex` crate
+/// itself, since reducing or expanding the set of meta characters would be a
+/// breaking change for not just `regex-syntax` but also `regex` itself.)
+///
+/// # Example
+///
+/// ```
+/// use regex_syntax::is_meta_character;
+///
+/// assert!(is_meta_character('?'));
+/// assert!(is_meta_character('-'));
+/// assert!(is_meta_character('&'));
+/// assert!(is_meta_character('#'));
+///
+/// assert!(!is_meta_character('%'));
+/// assert!(!is_meta_character('/'));
+/// assert!(!is_meta_character('!'));
+/// assert!(!is_meta_character('"'));
+/// assert!(!is_meta_character('e'));
+/// ```
 pub fn is_meta_character(c: char) -> bool {
     match c {
         '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{'
@@ -230,6 +260,68 @@ pub fn is_meta_character(c: char) -> bool {
     }
 }
 
+/// Returns true if the given character can be escaped in a regex.
+///
+/// This returns true in all cases that `is_meta_character` returns true, but
+/// also returns true in some cases where `is_meta_character` returns false.
+/// For example, `%` is not a meta character, but it is escapeable. That is,
+/// `%` and `\%` both match a literal `%` in all contexts.
+///
+/// The purpose of this routine is to provide knowledge about what characters
+/// may be escaped. Namely, most regex engines permit "superfluous" escapes
+/// where characters without any special significance may be escaped even
+/// though there is no actual _need_ to do so.
+///
+/// This will return false for some characters. For example, `e` is not
+/// escapeable. Therefore, `\e` will either result in a parse error (which is
+/// true today), or it could backwards compatibly evolve into a new construct
+/// with its own meaning. Indeed, that is the purpose of banning _some_
+/// superfluous escapes: it provides a way to evolve the syntax in a compatible
+/// manner.
+///
+/// # Example
+///
+/// ```
+/// use regex_syntax::is_escapeable_character;
+///
+/// assert!(is_escapeable_character('?'));
+/// assert!(is_escapeable_character('-'));
+/// assert!(is_escapeable_character('&'));
+/// assert!(is_escapeable_character('#'));
+/// assert!(is_escapeable_character('%'));
+/// assert!(is_escapeable_character('/'));
+/// assert!(is_escapeable_character('!'));
+/// assert!(is_escapeable_character('"'));
+///
+/// assert!(!is_escapeable_character('e'));
+/// ```
+pub fn is_escapeable_character(c: char) -> bool {
+    // Certainly escapeable if it's a meta character.
+    if is_meta_character(c) {
+        return true;
+    }
+    // Any character that isn't ASCII is definitely not escapeable. There's
+    // no real need to allow things like \☃ right?
+    if !c.is_ascii() {
+        return false;
+    }
+    // Otherwise, we basically say that everything is escapeable unless it's a
+    // letter or digit. Things like \3 are either octal (when enabled) or an
+    // error, and we should keep it that way. Otherwise, letters are reserved
+    // for adding new syntax in a backwards compatible way.
+    match c {
+        '0'..='9' | 'A'..='Z' | 'a'..='z' => false,
+        // While not currently supported, we keep these as not escapeable to
+        // give us some flexibility with respect to supporting the \< and
+        // \> word boundary assertions in the future. By rejecting them as
+        // escapeable, \< and \> will result in a parse error. Thus, we can
+        // turn them into something else in the future without it being a
+        // backwards incompatible change.
+        '<' | '>' => false,
+        _ => true,
+    }
+}
+
 /// Returns true if and only if the given character is a Unicode word
 /// character.
 ///

From 0732763a780978cdc782df4e1e2a3efe8275c224 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Fri, 3 Mar 2023 12:29:44 -0500
Subject: [PATCH 59/79] syntax: allow Unicode in capture names

This changes the rules for capture names to be much less restrictive.
Namely, the requirements are now:

1. Must begin with an `_` or any alphabetic codepoint.
2. After the first codepoint, the name may contain any sequence of
   alpha-numeric codepoints along with `_`, `.`, `[` and `]`.

Closes #595
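
A brief sketch of the new naming rules using the top-level `parse` routine
(illustrative only, not part of the patch):

```
use regex_syntax::parse;

// Names may now contain, and start with, non-ASCII alphabetic codepoints.
assert!(parse(r"(?P<名字>a)").is_ok());
assert!(parse(r"(?P<a¾>a)").is_ok());
// But a name must still start with '_' or an alphabetic codepoint, and every
// later codepoint must be alpha-numeric or one of '_', '.', '[' and ']'.
assert!(parse(r"(?P<5a>a)").is_err());
assert!(parse(r"(?P<☃>a)").is_err());
```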
---
 regex-syntax/src/ast/parse.rs | 113 ++++++++++++++++++++++++++++++++--
 src/expand.rs                 |  10 ++-
 src/lib.rs                    |  10 ++-
 3 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs
index 901250f61..533766a86 100644
--- a/regex-syntax/src/ast/parse.rs
+++ b/regex-syntax/src/ast/parse.rs
@@ -109,11 +109,11 @@ fn is_hex(c: char) -> bool {
 /// If `first` is true, then `c` is treated as the first character in the
 /// group name (which must be alphabetic or underscore).
 fn is_capture_char(c: char, first: bool) -> bool {
-    c == '_'
-        || (!first
-            && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
-        || ('A' <= c && c <= 'Z')
-        || ('a' <= c && c <= 'z')
+    if first {
+        c == '_' || c.is_alphabetic()
+    } else {
+        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
+    }
 }
 
 /// A builder for a regular expression parser.
@@ -3910,6 +3910,55 @@ bar
             }))
         );
 
+        assert_eq!(
+            parser("(?P)").parse(),
+            Ok(Ast::Group(ast::Group {
+                span: Span::new(
+                    Position::new(0, 1, 1),
+                    Position::new(9, 1, 9),
+                ),
+                kind: ast::GroupKind::CaptureName {
+                    starts_with_p: true,
+                    name: ast::CaptureName {
+                        span: Span::new(
+                            Position::new(4, 1, 5),
+                            Position::new(7, 1, 7),
+                        ),
+                        name: s("a¾"),
+                        index: 1,
+                    }
+                },
+                ast: Box::new(Ast::Empty(Span::new(
+                    Position::new(8, 1, 8),
+                    Position::new(8, 1, 8),
+                ))),
+            }))
+        );
+        assert_eq!(
+            parser("(?P<名字>)").parse(),
+            Ok(Ast::Group(ast::Group {
+                span: Span::new(
+                    Position::new(0, 1, 1),
+                    Position::new(12, 1, 9),
+                ),
+                kind: ast::GroupKind::CaptureName {
+                    starts_with_p: true,
+                    name: ast::CaptureName {
+                        span: Span::new(
+                            Position::new(4, 1, 5),
+                            Position::new(10, 1, 7),
+                        ),
+                        name: s("名字"),
+                        index: 1,
+                    }
+                },
+                ast: Box::new(Ast::Empty(Span::new(
+                    Position::new(11, 1, 8),
+                    Position::new(11, 1, 8),
+                ))),
+            }))
+        );
+
         assert_eq!(
             parser("(?P<").parse().unwrap_err(),
             TestError {
@@ -3968,6 +4017,60 @@ bar
                 },
             }
         );
+        assert_eq!(
+            parser("(?P<5>)").parse().unwrap_err(),
+            TestError {
+                span: span(4..5),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<5a>)").parse().unwrap_err(),
+            TestError {
+                span: span(4..5),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<¾>)").parse().unwrap_err(),
+            TestError {
+                span: Span::new(
+                    Position::new(4, 1, 5),
+                    Position::new(6, 1, 6),
+                ),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<¾a>)").parse().unwrap_err(),
+            TestError {
+                span: Span::new(
+                    Position::new(4, 1, 5),
+                    Position::new(6, 1, 6),
+                ),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P<☃>)").parse().unwrap_err(),
+            TestError {
+                span: Span::new(
+                    Position::new(4, 1, 5),
+                    Position::new(7, 1, 6),
+                ),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
+        assert_eq!(
+            parser("(?P)").parse().unwrap_err(),
+            TestError {
+                span: Span::new(
+                    Position::new(5, 1, 6),
+                    Position::new(8, 1, 7),
+                ),
+                kind: ast::ErrorKind::GroupNameInvalid,
+            }
+        );
     }
 
     #[test]
diff --git a/src/expand.rs b/src/expand.rs
index 67b514926..98fafc949 100644
--- a/src/expand.rs
+++ b/src/expand.rs
@@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
     })
 }
 
-/// Returns true if and only if the given byte is allowed in a capture name.
+/// Returns true if and only if the given byte is allowed in a capture name
+/// written in non-brace form.
 fn is_valid_cap_letter(b: u8) -> bool {
     match b {
         b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
@@ -236,4 +237,11 @@ mod tests {
     find!(find_cap_ref17, "$x_$y", c!("x_", 3));
     find!(find_cap_ref18, "${#}", c!("#", 4));
     find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
+    find!(find_cap_ref20, "${¾}", c!("¾", 5));
+    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
+    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
+    find!(find_cap_ref23, "${☃}", c!("☃", 6));
+    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
+    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
+    find!(find_cap_ref26, "${名字}", c!("名字", 9));
 }
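
As an illustrative sketch (the pattern and input below are invented for
this note, not taken from the patch), the braced `${name}` replacement
form can now refer to a Unicode-named group:

    use regex::Regex;

    // `replace` expands `${名字}` using the identically named capture group.
    let re = Regex::new(r"(?P<名字>[a-z]+)").unwrap();
    let out = re.replace("ferris", "hello ${名字}");
    assert_eq!("hello ferris", out);
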
diff --git a/src/lib.rs b/src/lib.rs
index 1de347861..042d243f8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -360,13 +360,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
 
 
 (exp)          numbered capture group (indexed by opening parenthesis)
-(?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
-(?<name>exp)   named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?P<name>exp)  named (also numbered) capture group (names must be alpha-numeric)
+(?<name>exp)   named (also numbered) capture group (names must be alpha-numeric)
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)
 
+Capture group names must be any sequence of alpha-numeric Unicode codepoints, +in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or +an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic` +Unicode property, while numeric codepoints correspond to the union of the +`Decimal_Number`, `Letter_Number` and `Other_Number` general categories. + Flags are each a single character. For example, `(?x)` sets the flag `x` and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets From 8a0bf38b03c79bd74a3819b345115894dca40930 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 4 Mar 2023 13:40:51 -0500 Subject: [PATCH 60/79] api: add new 'Regex::static_captures_len' method This adds a new routine for computing the static number of capture groups that will appear in every match. If the number of groups is not invariant across all matches, then there is no static capture length. This is meant to help implement higher level convenience APIs for extracting capture groups, such as the one described in #824. We may wind up including such APIs in the regex crate itself, but this commit stops short of that. Instead, we just add this new property which should permit those APIs to exist outside of this crate for now. Closes #908 --- regex-syntax/src/hir/mod.rs | 80 +++++++++++++++++++++++++++++++ regex-syntax/src/hir/translate.rs | 35 ++++++++++++++ src/compile.rs | 2 + src/exec.rs | 6 +++ src/prog.rs | 4 ++ src/re_bytes.rs | 40 ++++++++++++++++ src/re_unicode.rs | 40 ++++++++++++++++ 7 files changed, 207 insertions(+) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 4102bfec5..495f489f1 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1833,6 +1833,7 @@ struct PropertiesI { look_set_suffix: LookSet, utf8: bool, captures_len: usize, + static_captures_len: Option, literal: bool, alternation_literal: bool, } @@ -1990,6 +1991,44 @@ impl Properties { self.0.captures_len } + /// Returns the total number of explicit capturing groups that appear in + /// every possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex_syntax::parse; + /// + /// let len = |pattern| { + /// parse(pattern).map(|h| h.properties().static_captures_len()) + /// }; + /// + /// assert_eq!(Some(0), len("a")?); + /// assert_eq!(Some(1), len("(a)")?); + /// assert_eq!(Some(1), len("(a)|(b)")?); + /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(1), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option { + self.0.static_captures_len + } + /// Return true if and only if this HIR is a simple literal. This is /// only true when this HIR expression is either itself a `Literal` or a /// concatenation of only `Literal`s. 
@@ -2100,6 +2139,13 @@ impl Properties { } else { LookSet::full() }; + // And also, an empty alternate means we have 0 static capture groups, + // but we otherwise start with the number corresponding to the first + // alternate. If any subsequent alternate has a different number of + // static capture groups, then we overall have a variation and not a + // static number of groups. + let static_captures_len = + it.peek().and_then(|p| p.borrow().static_captures_len()); // The base case is an empty alternation, which matches nothing. // Note though that empty alternations aren't possible, because the // Hir::alternation smart constructor rewrites those as empty character @@ -2112,6 +2158,7 @@ impl Properties { look_set_suffix: fix, utf8: true, captures_len: 0, + static_captures_len, literal: false, alternation_literal: true, }; @@ -2125,6 +2172,9 @@ impl Properties { props.utf8 = props.utf8 && p.is_utf8(); props.captures_len = props.captures_len.saturating_add(p.captures_len()); + if props.static_captures_len != p.static_captures_len() { + props.static_captures_len = None; + } props.alternation_literal = props.alternation_literal && p.is_alternation_literal(); if !min_poisoned { @@ -2180,6 +2230,7 @@ impl Properties { // since it too can match the empty string. utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2196,6 +2247,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: core::str::from_utf8(&lit.0).is_ok(), captures_len: 0, + static_captures_len: Some(0), literal: true, alternation_literal: true, }; @@ -2212,6 +2264,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: class.is_utf8(), captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2241,6 +2294,7 @@ impl Properties { // property borderline useless. utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2268,6 +2322,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: p.is_utf8(), captures_len: p.captures_len(), + static_captures_len: p.static_captures_len(), literal: false, alternation_literal: false, }; @@ -2278,6 +2333,23 @@ impl Properties { inner.look_set_prefix = p.look_set_prefix(); inner.look_set_suffix = p.look_set_suffix(); } + // If the static captures len of the sub-expression is not known or is + // zero, then it automatically propagates to the repetition, regardless + // of the repetition. Otherwise, it might change, but only when the + // repetition can match 0 times. + if rep.min == 0 + && inner.static_captures_len.map_or(false, |len| len > 0) + { + // If we require a match 0 times, then our captures len is + // guaranteed to be zero. Otherwise, if we *can* match the empty + // string, then it's impossible to know how many captures will be + // in the resulting match. 
+ if rep.max == Some(0) { + inner.static_captures_len = Some(0); + } else { + inner.static_captures_len = None; + } + } Properties(Box::new(inner)) } @@ -2286,6 +2358,9 @@ impl Properties { let p = capture.sub.properties(); Properties(Box::new(PropertiesI { captures_len: p.captures_len().saturating_add(1), + static_captures_len: p + .static_captures_len() + .map(|len| len.saturating_add(1)), literal: false, alternation_literal: false, ..*p.0.clone() @@ -2306,6 +2381,7 @@ impl Properties { look_set_suffix: LookSet::empty(), utf8: true, captures_len: 0, + static_captures_len: Some(0), literal: true, alternation_literal: true, }; @@ -2316,6 +2392,10 @@ impl Properties { props.utf8 = props.utf8 && p.is_utf8(); props.captures_len = props.captures_len.saturating_add(p.captures_len()); + props.static_captures_len = p + .static_captures_len() + .and_then(|len1| Some((len1, props.static_captures_len?))) + .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); props.literal = props.literal && p.is_literal(); props.alternation_literal = props.alternation_literal && p.is_alternation_literal(); diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 81ae9b898..766e19c07 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3204,6 +3204,41 @@ mod tests { assert_eq!(1, props(r"([a&&b])").captures_len()); } + #[test] + fn analysis_static_captures_len() { + let len = |pattern| props(pattern).static_captures_len(); + assert_eq!(Some(0), len(r"")); + assert_eq!(Some(0), len(r"foo|bar")); + assert_eq!(None, len(r"(foo)|bar")); + assert_eq!(None, len(r"foo|(bar)")); + assert_eq!(Some(1), len(r"(foo|bar)")); + assert_eq!(Some(1), len(r"(a|b|c|d|e|f)")); + assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)")); + assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)")); + assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)")); + assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()")); + assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)")); + assert_eq!(None, len(r"(a)(b)(extra)?")); + assert_eq!(Some(1), len(r"(foo)|(bar)")); + assert_eq!(Some(2), len(r"(foo)(bar)")); + assert_eq!(Some(2), len(r"(foo)+(bar)")); + assert_eq!(None, len(r"(foo)*(bar)")); + assert_eq!(Some(0), len(r"(foo)?{0}")); + assert_eq!(None, len(r"(foo)?{1}")); + assert_eq!(Some(1), len(r"(foo){1}")); + assert_eq!(Some(1), len(r"(foo){1,}")); + assert_eq!(Some(1), len(r"(foo){1,}?")); + assert_eq!(None, len(r"(foo){1,}??")); + assert_eq!(None, len(r"(foo){0,}")); + assert_eq!(Some(1), len(r"(foo)(?:bar)")); + assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))")); + assert_eq!(Some(2), len(r"(?Pfoo)(?:bar)(bal|loon)")); + assert_eq!( + Some(2), + len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#) + ); + } + #[test] fn analysis_is_all_assertions() { // Positive examples. diff --git a/src/compile.rs b/src/compile.rs index c29196a72..c6eebcc35 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -161,6 +161,8 @@ impl Compiler { self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); + self.compiled.static_captures_len = + expr.properties().static_captures_len(); self.compile_finish() } diff --git a/src/exec.rs b/src/exec.rs index e36e367ba..778a39d4c 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -1361,6 +1361,12 @@ impl Exec { pub fn capture_name_idx(&self) -> &Arc> { &self.ro.nfa.capture_name_idx } + + /// If the number of capture groups in every match is always the same, then + /// return that number. Otherwise return `None`. 
+ pub fn static_captures_len(&self) -> Option { + self.ro.nfa.static_captures_len + } } impl Clone for Exec { diff --git a/src/prog.rs b/src/prog.rs index c211f71d8..100862cf1 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -27,6 +27,9 @@ pub struct Program { pub captures: Vec>, /// Pointers to all named capture groups into `captures`. pub capture_name_idx: Arc>, + /// If the number of capture groups is the same for all possible matches, + /// then this is that number. + pub static_captures_len: Option, /// A pointer to the start instruction. This can vary depending on how /// the program was compiled. For example, programs for use with the DFA /// engine have a `.*?` inserted at the beginning of unanchored regular @@ -83,6 +86,7 @@ impl Program { matches: vec![], captures: vec![], capture_name_idx: Arc::new(HashMap::new()), + static_captures_len: None, start: 0, byte_classes: vec![0; 256], only_utf8: true, diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 07e9f98ac..b8f9738e8 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -667,6 +667,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 197510ea0..0e7fc70a4 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -725,6 +725,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. 
+ /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { From cf345530e9ed7d766f5faa13cdf9f40f8c76cd29 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 5 Mar 2023 23:06:24 -0500 Subject: [PATCH 61/79] syntax: rename 'captures_len' to 'explicit_captures_len' And do the same for 'static_captures_len'. The motivation for this is that the top-level Regex API had equivalently named methods 'captures_len' and 'static_captures_len', except those included the implicit group and were therefore always 1 more than the same APIs on Hir. We distinguish them by renaming the routines on HIR. --- regex-syntax/src/hir/mod.rs | 88 +++++++++++++++++-------------- regex-syntax/src/hir/translate.rs | 28 +++++----- src/compile.rs | 2 +- 3 files changed, 63 insertions(+), 55 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 495f489f1..6f3b08fe3 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1832,8 +1832,8 @@ struct PropertiesI { look_set_prefix: LookSet, look_set_suffix: LookSet, utf8: bool, - captures_len: usize, - static_captures_len: Option, + explicit_captures_len: usize, + static_explicit_captures_len: Option, literal: bool, alternation_literal: bool, } @@ -1981,14 +1981,14 @@ impl Properties { /// ``` /// use regex_syntax::parse; /// - /// assert_eq!(0, parse("a")?.properties().captures_len()); - /// assert_eq!(1, parse("(a)")?.properties().captures_len()); + /// assert_eq!(0, parse("a")?.properties().explicit_captures_len()); + /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] - pub fn captures_len(&self) -> usize { - self.0.captures_len + pub fn explicit_captures_len(&self) -> usize { + self.0.explicit_captures_len } /// Returns the total number of explicit capturing groups that appear in @@ -2010,7 +2010,9 @@ impl Properties { /// use regex_syntax::parse; /// /// let len = |pattern| { - /// parse(pattern).map(|h| h.properties().static_captures_len()) + /// parse(pattern).map(|h| { + /// h.properties().static_explicit_captures_len() + /// }) /// }; /// /// assert_eq!(Some(0), len("a")?); @@ -2025,8 +2027,8 @@ impl Properties { /// # Ok::<(), Box>(()) /// ``` #[inline] - pub fn static_captures_len(&self) -> Option { - self.0.static_captures_len + pub fn static_explicit_captures_len(&self) -> Option { + self.0.static_explicit_captures_len } /// Return true if and only if this HIR is a simple literal. This is @@ -2144,8 +2146,8 @@ impl Properties { // alternate. 
If any subsequent alternate has a different number of // static capture groups, then we overall have a variation and not a // static number of groups. - let static_captures_len = - it.peek().and_then(|p| p.borrow().static_captures_len()); + let static_explicit_captures_len = + it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); // The base case is an empty alternation, which matches nothing. // Note though that empty alternations aren't possible, because the // Hir::alternation smart constructor rewrites those as empty character @@ -2157,8 +2159,8 @@ impl Properties { look_set_prefix: fix, look_set_suffix: fix, utf8: true, - captures_len: 0, - static_captures_len, + explicit_captures_len: 0, + static_explicit_captures_len, literal: false, alternation_literal: true, }; @@ -2170,10 +2172,13 @@ impl Properties { props.look_set_prefix.set_intersect(p.look_set_prefix()); props.look_set_suffix.set_intersect(p.look_set_suffix()); props.utf8 = props.utf8 && p.is_utf8(); - props.captures_len = - props.captures_len.saturating_add(p.captures_len()); - if props.static_captures_len != p.static_captures_len() { - props.static_captures_len = None; + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + if props.static_explicit_captures_len + != p.static_explicit_captures_len() + { + props.static_explicit_captures_len = None; } props.alternation_literal = props.alternation_literal && p.is_alternation_literal(); @@ -2229,8 +2234,8 @@ impl Properties { // were false, for example, then 'a*' would also need to be false // since it too can match the empty string. utf8: true, - captures_len: 0, - static_captures_len: Some(0), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2246,8 +2251,8 @@ impl Properties { look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), utf8: core::str::from_utf8(&lit.0).is_ok(), - captures_len: 0, - static_captures_len: Some(0), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), literal: true, alternation_literal: true, }; @@ -2263,8 +2268,8 @@ impl Properties { look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), utf8: class.is_utf8(), - captures_len: 0, - static_captures_len: Some(0), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2293,8 +2298,8 @@ impl Properties { // considered to match invalid UTF-8. That in turn makes this // property borderline useless. utf8: true, - captures_len: 0, - static_captures_len: Some(0), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), literal: false, alternation_literal: false, }; @@ -2321,8 +2326,8 @@ impl Properties { look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), utf8: p.is_utf8(), - captures_len: p.captures_len(), - static_captures_len: p.static_captures_len(), + explicit_captures_len: p.explicit_captures_len(), + static_explicit_captures_len: p.static_explicit_captures_len(), literal: false, alternation_literal: false, }; @@ -2338,16 +2343,16 @@ impl Properties { // of the repetition. Otherwise, it might change, but only when the // repetition can match 0 times. if rep.min == 0 - && inner.static_captures_len.map_or(false, |len| len > 0) + && inner.static_explicit_captures_len.map_or(false, |len| len > 0) { // If we require a match 0 times, then our captures len is // guaranteed to be zero. 
Otherwise, if we *can* match the empty // string, then it's impossible to know how many captures will be // in the resulting match. if rep.max == Some(0) { - inner.static_captures_len = Some(0); + inner.static_explicit_captures_len = Some(0); } else { - inner.static_captures_len = None; + inner.static_explicit_captures_len = None; } } Properties(Box::new(inner)) @@ -2357,9 +2362,9 @@ impl Properties { fn capture(capture: &Capture) -> Properties { let p = capture.sub.properties(); Properties(Box::new(PropertiesI { - captures_len: p.captures_len().saturating_add(1), - static_captures_len: p - .static_captures_len() + explicit_captures_len: p.explicit_captures_len().saturating_add(1), + static_explicit_captures_len: p + .static_explicit_captures_len() .map(|len| len.saturating_add(1)), literal: false, alternation_literal: false, @@ -2380,8 +2385,8 @@ impl Properties { look_set_prefix: LookSet::empty(), look_set_suffix: LookSet::empty(), utf8: true, - captures_len: 0, - static_captures_len: Some(0), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), literal: true, alternation_literal: true, }; @@ -2390,11 +2395,14 @@ impl Properties { let p = x.properties(); props.look_set.set_union(p.look_set()); props.utf8 = props.utf8 && p.is_utf8(); - props.captures_len = - props.captures_len.saturating_add(p.captures_len()); - props.static_captures_len = p - .static_captures_len() - .and_then(|len1| Some((len1, props.static_captures_len?))) + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + props.static_explicit_captures_len = p + .static_explicit_captures_len() + .and_then(|len1| { + Some((len1, props.static_explicit_captures_len?)) + }) .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); props.literal = props.literal && p.is_literal(); props.alternation_literal = diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 766e19c07..24774ddea 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -3189,24 +3189,24 @@ mod tests { #[test] fn analysis_captures_len() { - assert_eq!(0, props(r"a").captures_len()); - assert_eq!(0, props(r"(?:a)").captures_len()); - assert_eq!(0, props(r"(?i-u:a)").captures_len()); - assert_eq!(0, props(r"(?i-u)a").captures_len()); - assert_eq!(1, props(r"(a)").captures_len()); - assert_eq!(1, props(r"(?Pa)").captures_len()); - assert_eq!(1, props(r"()").captures_len()); - assert_eq!(1, props(r"()a").captures_len()); - assert_eq!(1, props(r"(a)+").captures_len()); - assert_eq!(2, props(r"(a)(b)").captures_len()); - assert_eq!(2, props(r"(a)|(b)").captures_len()); - assert_eq!(2, props(r"((a))").captures_len()); - assert_eq!(1, props(r"([a&&b])").captures_len()); + assert_eq!(0, props(r"a").explicit_captures_len()); + assert_eq!(0, props(r"(?:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u)a").explicit_captures_len()); + assert_eq!(1, props(r"(a)").explicit_captures_len()); + assert_eq!(1, props(r"(?Pa)").explicit_captures_len()); + assert_eq!(1, props(r"()").explicit_captures_len()); + assert_eq!(1, props(r"()a").explicit_captures_len()); + assert_eq!(1, props(r"(a)+").explicit_captures_len()); + assert_eq!(2, props(r"(a)(b)").explicit_captures_len()); + assert_eq!(2, props(r"(a)|(b)").explicit_captures_len()); + assert_eq!(2, props(r"((a))").explicit_captures_len()); + assert_eq!(1, props(r"([a&&b])").explicit_captures_len()); } #[test] fn 
analysis_static_captures_len() { - let len = |pattern| props(pattern).static_captures_len(); + let len = |pattern| props(pattern).static_explicit_captures_len(); assert_eq!(Some(0), len(r"")); assert_eq!(Some(0), len(r"foo|bar")); assert_eq!(None, len(r"(foo)|bar")); diff --git a/src/compile.rs b/src/compile.rs index c6eebcc35..0030cfb10 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -162,7 +162,7 @@ impl Compiler { self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); self.compiled.static_captures_len = - expr.properties().static_captures_len(); + expr.properties().static_explicit_captures_len(); self.compile_finish() } From 7212a03c1036354b4fdf14ac371ee69d0714d4af Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 5 Mar 2023 21:05:11 -0500 Subject: [PATCH 62/79] syntax: optimize case folding It turns out that it's not too hard to get HIR translation to run pretty slowly with some carefully crafted regexes. For example: (?i:[[:^space:]------------------------------------------------------------------------]) This regex is actually a [:^space:] class that has an empty class subtracted from it 36 times. For each subtraction, the resulting class--despite it not having changed---goes through Unicode case folding again. This in turn slows things way down. We introduce a fairly basic optimization that basically keeps track of whether an interval set has been folded or not. The idea was taken from PR #893, but was tweaked slightly. The magic of how it works is that if two interval sets have already been folded, then they retain that property after any of the set operations: negation, union, difference, intersection and symmetric difference. So case folding should generally only need to be run once for each "base" class, but then not again as operations are performed. Some benchmarks were added to rebar (which isn't public yet at time of writing). Closes #893 --- regex-syntax/src/hir/interval.rs | 67 ++++++++++++++++++++++++++++++- regex-syntax/src/hir/translate.rs | 2 +- 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index fbe772ea4..4efcf1e4b 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -30,9 +30,38 @@ use crate::unicode; // // Tests on this are relegated to the public API of HIR in src/hir.rs. -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug)] pub struct IntervalSet { + /// A sorted set of non-overlapping ranges. ranges: Vec, + /// While not required at all for correctness, we keep track of whether an + /// interval set has been case folded or not. This helps us avoid doing + /// redundant work if, for example, a set has already been cased folded. + /// And note that whether a set is folded or not is preserved through + /// all of the pairwise set operations. That is, if both interval sets + /// have been case folded, then any of difference, union, intersection or + /// symmetric difference all produce a case folded set. + /// + /// Note that when this is true, it *must* be the case that the set is case + /// folded. But when it's false, the set *may* be case folded. In other + /// words, we only set this to true when we know it to be case, but we're + /// okay with it being false if it would otherwise be costly to determine + /// whether it should be true. This means code cannot assume that a false + /// value necessarily indicates that the set is not case folded. 
+ /// + /// Bottom line: this is a performance optimization. + folded: bool, +} + +impl Eq for IntervalSet {} + +// We implement PartialEq manually so that we don't consider the set's internal +// 'folded' property to be part of its identity. The 'folded' property is +// strictly an optimization. +impl PartialEq for IntervalSet { + fn eq(&self, other: &IntervalSet) -> bool { + self.ranges.eq(&other.ranges) + } } impl IntervalSet { @@ -42,7 +71,10 @@ impl IntervalSet { /// The given ranges do not need to be in any specific order, and ranges /// may overlap. pub fn new>(intervals: T) -> IntervalSet { - let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; + let ranges: Vec = intervals.into_iter().collect(); + // An empty set is case folded. + let folded = ranges.is_empty(); + let mut set = IntervalSet { ranges, folded }; set.canonicalize(); set } @@ -53,6 +85,10 @@ impl IntervalSet { // it preserves canonicalization. self.ranges.push(interval); self.canonicalize(); + // We don't know whether the new interval added here is considered + // case folded, so we conservatively assume that the entire set is + // no longer case folded if it was previously. + self.folded = false; } /// Return an iterator over all intervals in this set. @@ -77,6 +113,9 @@ impl IntervalSet { /// This returns an error if the necessary case mapping data is not /// available. pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { + if self.folded { + return Ok(()); + } let len = self.ranges.len(); for i in 0..len { let range = self.ranges[i]; @@ -86,14 +125,19 @@ impl IntervalSet { } } self.canonicalize(); + self.folded = true; Ok(()) } /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet) { + if other.ranges.is_empty() { + return; + } // This could almost certainly be done more efficiently. self.ranges.extend(&other.ranges); self.canonicalize(); + self.folded = self.folded && other.folded; } /// Intersect this set with the given set, in place. @@ -103,6 +147,8 @@ impl IntervalSet { } if other.ranges.is_empty() { self.ranges.clear(); + // An empty set is case folded. + self.folded = true; return; } @@ -132,6 +178,7 @@ impl IntervalSet { } } self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; } /// Subtract the given set from this set, in place. @@ -224,6 +271,7 @@ impl IntervalSet { a += 1; } self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. @@ -249,6 +297,8 @@ impl IntervalSet { if self.ranges.is_empty() { let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); self.ranges.push(I::create(min, max)); + // The set containing everything must case folded. + self.folded = true; return; } @@ -274,6 +324,19 @@ impl IntervalSet { self.ranges.push(I::create(lower, I::Bound::max_value())); } self.ranges.drain(..drain_end); + // We don't need to update whether this set is folded or not, because + // it is conservatively preserved through negation. Namely, if a set + // is not folded, then it is possible that its negation is folded, for + // example, [^☃]. But we're fine with assuming that the set is not + // folded in that case. (`folded` permits false negatives but not false + // positives.) + // + // But what about when a set is folded, is its negation also + // necessarily folded? Yes. 
Because if a set is folded, then for every + // character in the set, it necessarily included its equivalence class + // of case folded characters. Negating it in turn means that all + // equivalence classes in the set are negated, and any equivalence + // class that was previously not in the set is now entirely in the set. } /// Converts this set into a canonical ordering. diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 24774ddea..204653543 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1083,7 +1083,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { class: &mut hir::ClassUnicode, ) -> Result<()> { // Note that we must apply case folding before negation! - // Consider `(?i)[^x]`. If we applied negation field, then + // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. if self.flags().case_insensitive() { From 0dd585368b41e24e470cf935b63debd7268c5c63 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 5 Mar 2023 21:52:45 -0500 Subject: [PATCH 63/79] syntax: drop some Result type aliases I'm overall coming around to the opinion that these tend to make the code harder to read. So I've been steadily dropping the Result aliases. --- regex-syntax/src/ast/parse.rs | 2 +- regex-syntax/src/hir/translate.rs | 2 +- regex-syntax/src/unicode.rs | 136 +++++++++++++++--------------- 3 files changed, 70 insertions(+), 70 deletions(-) diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 533766a86..9cf64e9ec 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -171,7 +171,7 @@ impl ParserBuilder { /// constant stack space and moving the call stack to the heap), other /// crates may. /// - /// This limit is not checked until the entire Ast is parsed. Therefore, + /// This limit is not checked until the entire AST is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since this parser diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 204653543..3df9d1f8d 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -1058,7 +1058,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn convert_unicode_class_error( &self, span: &Span, - result: unicode::Result, + result: core::result::Result, ) -> Result { result.map_err(|err| { let sp = span.clone(); diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 1689681fa..a87fa23c6 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -5,9 +5,6 @@ use alloc::{ use crate::hir; -/// A type alias for errors specific to Unicode handling of classes. -pub type Result = core::result::Result; - /// An inclusive range of codepoints from a generated file (hence the static /// lifetime). type Range = &'static [(char, char)]; @@ -25,9 +22,6 @@ pub enum Error { PerlClassNotFound, } -/// A type alias for errors specific to Unicode case folding. -pub type FoldResult = core::result::Result; - /// An error that occurs when Unicode-aware simple case folding fails. /// /// This error can occur when the case mapping tables necessary for Unicode @@ -83,14 +77,12 @@ impl core::fmt::Display for UnicodeWordError { /// This returns an error if the Unicode case folding tables are not available. 
pub fn simple_fold( c: char, -) -> FoldResult, Option>> -{ +) -> Result, Option>, CaseFoldError> { #[cfg(not(feature = "unicode-case"))] fn imp( _: char, - ) -> FoldResult< - core::result::Result, Option>, - > { + ) -> Result, Option>, CaseFoldError> + { use core::option::IntoIter; Err::, _>, _>(CaseFoldError(())) } @@ -98,9 +90,8 @@ pub fn simple_fold( #[cfg(feature = "unicode-case")] fn imp( c: char, - ) -> FoldResult< - core::result::Result, Option>, - > { + ) -> Result, Option>, CaseFoldError> + { use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; Ok(CASE_FOLDING_SIMPLE @@ -128,14 +119,14 @@ pub fn simple_fold( pub fn contains_simple_case_mapping( start: char, end: char, -) -> FoldResult { +) -> Result { #[cfg(not(feature = "unicode-case"))] - fn imp(_: char, _: char) -> FoldResult { + fn imp(_: char, _: char) -> Result { Err(CaseFoldError(())) } #[cfg(feature = "unicode-case")] - fn imp(start: char, end: char) -> FoldResult { + fn imp(start: char, end: char) -> Result { use core::cmp::Ordering; use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; @@ -192,7 +183,7 @@ pub enum ClassQuery<'a> { } impl<'a> ClassQuery<'a> { - fn canonicalize(&self) -> Result { + fn canonicalize(&self) -> Result { match *self { ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), ClassQuery::Binary(name) => self.canonical_binary(name), @@ -241,7 +232,10 @@ impl<'a> ClassQuery<'a> { } } - fn canonical_binary(&self, name: &str) -> Result { + fn canonical_binary( + &self, + name: &str, + ) -> Result { let norm = symbolic_name_normalize(name); // This is a special case where 'cf' refers to the 'Format' general @@ -302,7 +296,7 @@ enum CanonicalClassQuery { /// Looks up a Unicode class given a query. If one doesn't exist, then /// `None` is returned. -pub fn class(query: ClassQuery<'_>) -> Result { +pub fn class(query: ClassQuery<'_>) -> Result { use self::CanonicalClassQuery::*; match query.canonicalize()? { @@ -339,14 +333,14 @@ pub fn class(query: ClassQuery<'_>) -> Result { /// Returns a Unicode aware class for \w. /// /// This returns an error if the data is not available for \w. -pub fn perl_word() -> Result { +pub fn perl_word() -> Result { #[cfg(not(feature = "unicode-perl"))] - fn imp() -> Result { + fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(feature = "unicode-perl")] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::perl_word::PERL_WORD; Ok(hir_class(PERL_WORD)) } @@ -357,20 +351,20 @@ pub fn perl_word() -> Result { /// Returns a Unicode aware class for \s. /// /// This returns an error if the data is not available for \s. -pub fn perl_space() -> Result { +pub fn perl_space() -> Result { #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] - fn imp() -> Result { + fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::perl_space::WHITE_SPACE; Ok(hir_class(WHITE_SPACE)) } #[cfg(feature = "unicode-bool")] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::property_bool::WHITE_SPACE; Ok(hir_class(WHITE_SPACE)) } @@ -381,20 +375,20 @@ pub fn perl_space() -> Result { /// Returns a Unicode aware class for \d. /// /// This returns an error if the data is not available for \d. 
-pub fn perl_digit() -> Result { +pub fn perl_digit() -> Result { #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] - fn imp() -> Result { + fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; Ok(hir_class(DECIMAL_NUMBER)) } #[cfg(feature = "unicode-gencat")] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::general_category::DECIMAL_NUMBER; Ok(hir_class(DECIMAL_NUMBER)) } @@ -414,16 +408,14 @@ pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { /// Returns true only if the given codepoint is in the `\w` character class. /// /// If the `unicode-perl` feature is not enabled, then this returns an error. -pub fn is_word_character( - c: char, -) -> core::result::Result { +pub fn is_word_character(c: char) -> Result { #[cfg(not(feature = "unicode-perl"))] - fn imp(_: char) -> core::result::Result { + fn imp(_: char) -> Result { Err(UnicodeWordError(())) } #[cfg(feature = "unicode-perl")] - fn imp(c: char) -> core::result::Result { + fn imp(c: char) -> Result { use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD}; // MSRV(1.59): Use 'u8::try_from(c)' instead. @@ -455,7 +447,9 @@ pub fn is_word_character( /// value. type PropertyValues = &'static [(&'static str, &'static str)]; -fn canonical_gencat(normalized_value: &str) -> Result> { +fn canonical_gencat( + normalized_value: &str, +) -> Result, Error> { Ok(match normalized_value { "any" => Some("Any"), "assigned" => Some("Assigned"), @@ -467,7 +461,9 @@ fn canonical_gencat(normalized_value: &str) -> Result> { }) } -fn canonical_script(normalized_value: &str) -> Result> { +fn canonical_script( + normalized_value: &str, +) -> Result, Error> { let scripts = property_values("Script")?.unwrap(); Ok(canonical_value(scripts, normalized_value)) } @@ -480,7 +476,9 @@ fn canonical_script(normalized_value: &str) -> Result> { /// UAX44 LM3, which can be done using `symbolic_name_normalize`. /// /// If the property names data is not available, then an error is returned. -fn canonical_prop(normalized_name: &str) -> Result> { +fn canonical_prop( + normalized_name: &str, +) -> Result, Error> { #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", @@ -489,7 +487,7 @@ fn canonical_prop(normalized_name: &str) -> Result> { feature = "unicode-script", feature = "unicode-segment", )))] - fn imp(_: &str) -> Result> { + fn imp(_: &str) -> Result, Error> { Err(Error::PropertyNotFound) } @@ -501,7 +499,7 @@ fn canonical_prop(normalized_name: &str) -> Result> { feature = "unicode-script", feature = "unicode-segment", ))] - fn imp(name: &str) -> Result> { + fn imp(name: &str) -> Result, Error> { use crate::unicode_tables::property_names::PROPERTY_NAMES; Ok(PROPERTY_NAMES @@ -537,7 +535,7 @@ fn canonical_value( /// If the property values data is not available, then an error is returned. 
fn property_values( canonical_property_name: &'static str, -) -> Result> { +) -> Result, Error> { #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", @@ -546,7 +544,7 @@ fn property_values( feature = "unicode-script", feature = "unicode-segment", )))] - fn imp(_: &'static str) -> Result> { + fn imp(_: &'static str) -> Result, Error> { Err(Error::PropertyValueNotFound) } @@ -558,7 +556,7 @@ fn property_values( feature = "unicode-script", feature = "unicode-segment", ))] - fn imp(name: &'static str) -> Result> { + fn imp(name: &'static str) -> Result, Error> { use crate::unicode_tables::property_values::PROPERTY_VALUES; Ok(PROPERTY_VALUES @@ -589,15 +587,15 @@ fn property_set( /// /// If the given age value isn't valid or if the data isn't available, then an /// error is returned instead. -fn ages(canonical_age: &str) -> Result> { +fn ages(canonical_age: &str) -> Result, Error> { #[cfg(not(feature = "unicode-age"))] - fn imp(_: &str) -> Result> { + fn imp(_: &str) -> Result, Error> { use core::option::IntoIter; Err::, _>(Error::PropertyNotFound) } #[cfg(feature = "unicode-age")] - fn imp(canonical_age: &str) -> Result> { + fn imp(canonical_age: &str) -> Result, Error> { use crate::unicode_tables::age; const AGES: &[(&str, Range)] = &[ @@ -645,14 +643,14 @@ fn ages(canonical_age: &str) -> Result> { /// /// If the given general category could not be found, or if the general /// category data is not available, then an error is returned. -fn gencat(canonical_name: &'static str) -> Result { +fn gencat(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-gencat"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-gencat")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::general_category::BY_NAME; match name { "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), @@ -680,14 +678,14 @@ fn gencat(canonical_name: &'static str) -> Result { /// /// If the given script could not be found, or if the script data is not /// available, then an error is returned. -fn script(canonical_name: &'static str) -> Result { +fn script(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-script"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-script")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::script::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -705,14 +703,14 @@ fn script(canonical_name: &'static str) -> Result { /// not available, then an error is returned. fn script_extension( canonical_name: &'static str, -) -> Result { +) -> Result { #[cfg(not(feature = "unicode-script"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-script")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::script_extension::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -729,14 +727,16 @@ fn script_extension( /// /// If the given boolean property could not be found, or if the boolean /// property data is not available, then an error is returned. 
-fn bool_property(canonical_name: &'static str) -> Result { +fn bool_property( + canonical_name: &'static str, +) -> Result { #[cfg(not(feature = "unicode-bool"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-bool")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::property_bool::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -757,14 +757,14 @@ fn bool_property(canonical_name: &'static str) -> Result { /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. -fn gcb(canonical_name: &'static str) -> Result { +fn gcb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::grapheme_cluster_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -781,14 +781,14 @@ fn gcb(canonical_name: &'static str) -> Result { /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. -fn wb(canonical_name: &'static str) -> Result { +fn wb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::word_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -805,14 +805,14 @@ fn wb(canonical_name: &'static str) -> Result { /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. -fn sb(canonical_name: &'static str) -> Result { +fn sb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::sentence_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) From 627c997429f1071040c3c3fdbb28e5ad20ba0719 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 5 Mar 2023 22:59:52 -0500 Subject: [PATCH 64/79] syntax: refactor and optimize case folding This rewrites how Unicode simple case folding worked. Instead of just defining a single function and expecting callers to deal with the fallout, we know define a stateful type that "knows" about the structure of the case folding table. For example, it now knows enough to avoid binary search lookups in most cases. All we really have to do is require that callers lookup codepoints in sequence, which is perfectly fine for our use case. 
Ref #893 --- regex-syntax/src/hir/interval.rs | 2 +- regex-syntax/src/hir/mod.rs | 16 +-- regex-syntax/src/hir/translate.rs | 5 +- regex-syntax/src/unicode.rs | 214 ++++++++++++++++-------------- 4 files changed, 125 insertions(+), 112 deletions(-) diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 4efcf1e4b..e063390a8 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -131,7 +131,7 @@ impl IntervalSet { /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet) { - if other.ranges.is_empty() { + if other.ranges.is_empty() || self.ranges == other.ranges { return; } // This could almost certainly be done more efficiently. diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 6f3b08fe3..ce0ed4f4f 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1232,23 +1232,13 @@ impl Interval for ClassUnicodeRange { &self, ranges: &mut Vec, ) -> Result<(), unicode::CaseFoldError> { - if !unicode::contains_simple_case_mapping(self.start, self.end)? { + let mut folder = unicode::SimpleCaseFolder::new()?; + if !folder.overlaps(self.start, self.end) { return Ok(()); } let (start, end) = (u32::from(self.start), u32::from(self.end)); - let mut next_simple_cp = None; for cp in (start..=end).filter_map(char::from_u32) { - if next_simple_cp.map_or(false, |next| cp < next) { - continue; - } - let it = match unicode::simple_fold(cp)? { - Ok(it) => it, - Err(next) => { - next_simple_cp = next; - continue; - } - }; - for cp_folded in it { + for &cp_folded in folder.mapping(cp) { ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 3df9d1f8d..b22861fc7 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -824,8 +824,9 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } if self.flags().unicode() { // If case folding won't do anything, then don't bother trying. - let map = - unicode::contains_simple_case_mapping(c, c).map_err(|_| { + let map = unicode::SimpleCaseFolder::new() + .map(|f| f.overlaps(c, c)) + .map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; if !map { diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index a87fa23c6..91bd4b120 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -64,75 +64,122 @@ impl core::fmt::Display for UnicodeWordError { } } -/// Return an iterator over the equivalence class of simple case mappings -/// for the given codepoint. The equivalence class does not include the -/// given codepoint. -/// -/// If the equivalence class is empty, then this returns the next scalar -/// value that has a non-empty equivalence class, if it exists. If no such -/// scalar value exists, then `None` is returned. The point of this behavior -/// is to permit callers to avoid calling `simple_fold` more than they need -/// to, since there is some cost to fetching the equivalence class. -/// -/// This returns an error if the Unicode case folding tables are not available. -pub fn simple_fold( - c: char, -) -> Result, Option>, CaseFoldError> { - #[cfg(not(feature = "unicode-case"))] - fn imp( - _: char, - ) -> Result, Option>, CaseFoldError> - { - use core::option::IntoIter; - Err::, _>, _>(CaseFoldError(())) - } +/// A state oriented traverser of the simple case folding table. 
+/// +/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will +/// return an error if the underlying case folding table is unavailable. +/// +/// After construction, it is expected that callers will use +/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly +/// increasing order. For example, calling it on `b` and then on `a` is illegal +/// and will result in a panic. +/// +/// The main idea of this type is that it tries hard to make mapping lookups +/// fast by exploiting the structure of the underlying table, and the ordering +/// assumption enables this. +#[derive(Debug)] +pub struct SimpleCaseFolder { + /// The simple case fold table. It's a sorted association list, where the + /// keys are Unicode scalar values and the values are the corresponding + /// equivalence class (not including the key) of the "simple" case folded + /// Unicode scalar values. + table: &'static [(char, &'static [char])], + /// The last codepoint that was used for a lookup. + last: Option, + /// The index to the entry in `table` corresponding to the smallest key `k` + /// such that `k > k0`, where `k0` is the most recent key lookup. Note that + /// in particular, `k0` may not be in the table! + next: usize, +} - #[cfg(feature = "unicode-case")] - fn imp( - c: char, - ) -> Result, Option>, CaseFoldError> - { - use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - - Ok(CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |&(c1, _)| c1) - .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied()) - .map_err(|i| { - if i >= CASE_FOLDING_SIMPLE.len() { - None - } else { - Some(CASE_FOLDING_SIMPLE[i].0) - } - })) +impl SimpleCaseFolder { + /// Create a new simple case folder, returning an error if the underlying + /// case folding table is unavailable. + pub fn new() -> Result { + #[cfg(not(feature = "unicode-case"))] + { + Err(CaseFoldError(())) + } + #[cfg(feature = "unicode-case")] + { + Ok(SimpleCaseFolder { + table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, + last: None, + next: 0, + }) + } } - imp(c) -} - -/// Returns true if and only if the given (inclusive) range contains at least -/// one Unicode scalar value that has a non-empty non-trivial simple case -/// mapping. -/// -/// This function panics if `end < start`. -/// -/// This returns an error if the Unicode case folding tables are not available. -pub fn contains_simple_case_mapping( - start: char, - end: char, -) -> Result { - #[cfg(not(feature = "unicode-case"))] - fn imp(_: char, _: char) -> Result { - Err(CaseFoldError(())) + /// Return the equivalence class of case folded codepoints for the given + /// codepoint. The equivalence class returned never includes the codepoint + /// given. If the given codepoint has no case folded codepoints (i.e., + /// no entry in the underlying case folding table), then this returns an + /// empty slice. + /// + /// # Panics + /// + /// This panics when called with a `c` that is less than or equal to the + /// previous call. In other words, callers need to use this method with + /// strictly increasing values of `c`. 
+ pub fn mapping(&mut self, c: char) -> &'static [char] { + if let Some(last) = self.last { + assert!( + last < c, + "got codepoint U+{:X} which occurs before \ + last codepoint U+{:X}", + u32::from(c), + u32::from(last), + ); + } + self.last = Some(c); + if self.next >= self.table.len() { + return &[]; + } + let (k, v) = self.table[self.next]; + if k == c { + self.next += 1; + return v; + } + match self.get(c) { + Err(i) => { + self.next = i; + &[] + } + Ok(i) => { + // Since we require lookups to proceed + // in order, anything we find should be + // after whatever we thought might be + // next. Otherwise, the caller is either + // going out of order or we would have + // found our next key at 'self.next'. + assert!(i > self.next); + self.next = i + 1; + self.table[i].1 + } + } } - #[cfg(feature = "unicode-case")] - fn imp(start: char, end: char) -> Result { + /// Returns true if and only if the given range overlaps with any region + /// of the underlying case folding table. That is, when true, there exists + /// at least one codepoint in the inclusive range `[start, end]` that has + /// a non-trivial equivalence class of case folded codepoints. Conversely, + /// when this returns false, all codepoints in the range `[start, end]` + /// correspond to the trivial equivalence class of case folded codepoints, + /// i.e., itself. + /// + /// This is useful to call before iterating over the codepoints in the + /// range and looking up the mapping for each. If you know none of the + /// mappings will return anything, then you might be able to skip doing it + /// altogether. + /// + /// # Panics + /// + /// This panics when `end < start`. + pub fn overlaps(&self, start: char, end: char) -> bool { use core::cmp::Ordering; - use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - assert!(start <= end); - Ok(CASE_FOLDING_SIMPLE + self.table .binary_search_by(|&(c, _)| { if start <= c && c <= end { Ordering::Equal @@ -142,10 +189,15 @@ pub fn contains_simple_case_mapping( Ordering::Less } }) - .is_ok()) + .is_ok() } - imp(start, end) + /// Returns the index at which `c` occurs in the simple case fold table. If + /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < + /// c` and `table[i].0 > c`. + fn get(&self, c: char) -> Result { + self.table.binary_search_by_key(&c, |&(c1, _)| c1) + } } /// A query for finding a character class defined by Unicode. 
This supports @@ -897,20 +949,12 @@ mod tests { #[cfg(feature = "unicode-case")] fn simple_fold_ok(c: char) -> impl Iterator { - simple_fold(c).unwrap().unwrap() - } - - #[cfg(feature = "unicode-case")] - fn simple_fold_err(c: char) -> Option { - match simple_fold(c).unwrap() { - Ok(_) => unreachable!("simple_fold returned Ok iterator"), - Err(next) => next, - } + SimpleCaseFolder::new().unwrap().mapping(c).iter().copied() } #[cfg(feature = "unicode-case")] fn contains_case_map(start: char, end: char) -> bool { - contains_simple_case_mapping(start, end).unwrap() + SimpleCaseFolder::new().unwrap().overlaps(start, end) } #[test] @@ -936,26 +980,10 @@ mod tests { assert_eq!(xs, alloc::vec!['a']); } - #[test] - #[cfg(feature = "unicode-case")] - fn simple_fold_empty() { - assert_eq!(Some('A'), simple_fold_err('?')); - assert_eq!(Some('A'), simple_fold_err('@')); - assert_eq!(Some('a'), simple_fold_err('[')); - assert_eq!(Some('Ⰰ'), simple_fold_err('☃')); - } - - #[test] - #[cfg(feature = "unicode-case")] - fn simple_fold_max() { - assert_eq!(None, simple_fold_err('\u{10FFFE}')); - assert_eq!(None, simple_fold_err('\u{10FFFF}')); - } - #[test] #[cfg(not(feature = "unicode-case"))] fn simple_fold_disabled() { - assert!(simple_fold('a').is_err()); + assert!(SimpleCaseFolder::new().is_err()); } #[test] @@ -974,12 +1002,6 @@ mod tests { assert!(!contains_case_map('☃', '☃')); } - #[test] - #[cfg(not(feature = "unicode-case"))] - fn range_contains_disabled() { - assert!(contains_simple_case_mapping('a', 'a').is_err()); - } - #[test] #[cfg(feature = "unicode-gencat")] fn regression_466() { From 3c4961501c1bcd48f268ad1bf4aa9a53fdf0168d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 14 Mar 2023 22:37:52 -0400 Subject: [PATCH 65/79] syntax: improve Debug impl for Class Previously, classes would show up in the debug representation as very deeply nested things, making them more difficult to read than they need to be. This removes at least a few pretty redundant layers and uses a more compact range notation. --- regex-syntax/src/hir/mod.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index ce0ed4f4f..69c1dd403 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -787,7 +787,7 @@ impl core::fmt::Debug for Literal { /// the author of the regular expression to disable Unicode mode, which in turn /// impacts the semantics of case insensitive matching. For example, `(?i)k` /// and `(?i-u)k` will not match the same set of strings. -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Eq, PartialEq)] pub enum Class { /// A set of characters represented by Unicode scalar values. Unicode(ClassUnicode), @@ -986,6 +986,27 @@ impl Class { } } +impl core::fmt::Debug for Class { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::debug::Byte; + + let mut fmter = f.debug_set(); + match *self { + Class::Unicode(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(r.start..=r.end)); + } + } + Class::Bytes(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(Byte(r.start)..=Byte(r.end))); + } + } + } + fmter.finish() + } +} + /// A set of characters represented by Unicode scalar values. 
#[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassUnicode { From b8ab381b319a010a414ab08d326b2113f7d1f750 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 6 Mar 2023 09:39:21 -0500 Subject: [PATCH 66/79] bug: fix CaptureLocations::get to handle invalid offsets The contract of this function says that any invalid group offset should result in a return value of None. In general, it worked fine, unless the offset was so big that some internal multiplication overflowed. That could in turn produce an incorrect result or a panic. So we fix that here with checked arithmetic. Fixes #738, Fixes #950 --- src/re_bytes.rs | 21 +++++++++++++++++++++ src/re_trait.rs | 2 +- src/re_unicode.rs | 21 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index b8f9738e8..b86973d0b 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -896,6 +896,27 @@ impl<'r> FusedIterator for CaptureNames<'r> {} /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` #[derive(Clone, Debug)] pub struct CaptureLocations(re_trait::Locations); diff --git a/src/re_trait.rs b/src/re_trait.rs index d0c717df5..505810c84 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -20,7 +20,7 @@ impl Locations { /// not match anything. The positions returned are *always* byte indices /// with respect to the original string matched. pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); + let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?); match (self.0.get(s), self.0.get(e)) { (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), _ => None, diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 0e7fc70a4..41bd8ac09 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -906,6 +906,27 @@ impl<'r, 't> FusedIterator for SplitN<'r, 't> {} /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. 
+/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` #[derive(Clone, Debug)] pub struct CaptureLocations(re_trait::Locations); From e65ba17f132fffd2b942b5bd26fa7d844305feb9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 6 Mar 2023 10:00:32 -0500 Subject: [PATCH 67/79] doc: add wording about Unicode scalar values This makes it clearer that the regex engine works by *logically* treating a haystack as a sequence of codepoints. Or more specifically, Unicode scalar values. Fixes #854 --- src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 042d243f8..af9cea20d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -199,6 +199,8 @@ instead.) This implementation executes regular expressions **only** on valid UTF-8 while exposing match locations as byte indices into the search string. (To relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) +Conceptually, the regex engine works by matching a haystack as if it were a +sequence of Unicode scalar values. Only simple case folding is supported. Namely, when matching case-insensitively, the characters are first mapped using the "simple" case From d04ea102cba1b2a29eecf663d8f5ace59b5f0a3b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 6 Mar 2023 10:26:59 -0500 Subject: [PATCH 68/79] doc: add more explanation to 'CompiledTooBig' error The existing docs were pretty paltry, and it turns out we can be a bit more helpful for folks when they hit this error. Fixes #846 --- src/error.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/error.rs b/src/error.rs index 3e0ec7521..6c341f604 100644 --- a/src/error.rs +++ b/src/error.rs @@ -6,8 +6,26 @@ use std::iter::repeat; pub enum Error { /// A syntax error. Syntax(String), - /// The compiled program exceeded the set size limit. - /// The argument is the size limit imposed. + /// The compiled program exceeded the set size + /// limit. The argument is the size limit imposed by + /// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit). Even + /// when not configured explicitly, it defaults to a reasonable limit. + /// + /// If you're getting this error, it occurred because your regex has been + /// compiled to an intermediate state that is too big. It is important to + /// note that exceeding this limit does _not_ mean the regex is too big to + /// _work_, but rather, the regex is big enough that it may wind up being + /// surprisingly slow when used in a search. In other words, this error is + /// meant to be a practical heuristic for avoiding a performance footgun, + /// and especially so for the case where the regex pattern is coming from + /// an untrusted source. + /// + /// There are generally two ways to move forward if you hit this error. + /// The first is to find some way to use a smaller regex. The second is to + /// increase the size limit via `RegexBuilder::size_limit`. However, if + /// your regex pattern is not from a trusted source, then neither of these + /// approaches may be appropriate. Instead, you'll have to determine just + /// how big of a regex you want to allow. CompiledTooBig(usize), /// Hints that destructuring should not be exhaustive. 
/// From 07c453dc60125923c9c48c45669de5aa55362c18 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 6 Mar 2023 10:35:01 -0500 Subject: [PATCH 69/79] api: add Match::{is_empty, len} Adding these methods has almost no cost and they can be convenient to have in some cases. Closes #810 --- src/re_bytes.rs | 12 ++++++++++++ src/re_unicode.rs | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index b86973d0b..64c09a725 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -37,6 +37,18 @@ impl<'t> Match<'t> { self.end } + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. #[inline] diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 41bd8ac09..bee365e8d 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -45,6 +45,18 @@ impl<'t> Match<'t> { self.end } + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. #[inline] From be2afa1c05890a5d0d9fe222ce86a948621cb892 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 6 Mar 2023 10:41:20 -0500 Subject: [PATCH 70/79] doc: tweak docs for 'shortest_match' The name is somewhat unfortunate, but it's actually kind of difficult to capture the right semantics in the name. The key bit is that the function returns the offset at the point at which a match is known, and that point might vary depending on which internal regex engine was used. Fixes #747 --- src/re_bytes.rs | 9 ++++++++- src/re_unicode.rs | 17 ++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 64c09a725..7d488a95b 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -549,7 +549,14 @@ impl Regex { /// This method may have the same performance characteristics as /// `is_match`, except it provides an end location for a match. In /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match. + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. /// /// # Example /// diff --git a/src/re_unicode.rs b/src/re_unicode.rs index bee365e8d..1e8bd0453 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -607,7 +607,14 @@ impl Regex { /// This method may have the same performance characteristics as /// `is_match`, except it provides an end location for a match. In /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match. + /// of the leftmost-first match that you would find via `Regex::find`. 
+ /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. /// /// # Example /// @@ -627,12 +634,12 @@ impl Regex { self.shortest_match_at(text, 0) } - /// Returns the same as shortest_match, but starts the search at the given - /// offset. + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. /// /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. + /// context into consideration. For example, the `\A` anchor can only match + /// when `start == 0`. pub fn shortest_match_at( &self, text: &str, From 061dd6898f5ce6dae03b2caae72b37da5b850d04 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 6 Mar 2023 10:56:13 -0500 Subject: [PATCH 71/79] doc: clarify verbose mode This clarifies that `x` is "verbose mode," and that whitespace becomes insignificant everywhere, including in character classes. We also add guidance for how to insert a space: either escape it or use a hex literal. Fixes #660 --- src/lib.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index af9cea20d..5f92bc85a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -388,9 +388,13 @@ m multi-line mode: ^ and $ match begin/end of line s allow . to match \n U swap the meaning of x* and x*? u Unicode support (enabled by default) -x ignore whitespace and allow line comments (starting with `#`) +x verbose mode, ignores whitespace and allow line comments (starting with `#`)
+Note that in verbose mode, whitespace is ignored everywhere, including within +character classes. To insert whitespace, use its escaped form or a hex literal. +For example, `\ ` or `\x20` for an ASCII space. + Flags can be toggled within a pattern. Here's an example that matches case-insensitively for the first part but case-sensitively for the second part: From 67a60cfdf5aabec5e18ee16ee97dd858277d8c3b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 6 Mar 2023 11:00:03 -0500 Subject: [PATCH 72/79] doc: clarify meaning of SetMatches::len It is really unfortunate, but SetMatches::len and SetMatcher::iter().count() do not correspond go the same thing. It's not clear why I even added the SetMatches::len method in the first place, but it always returns the number of regexes in the set, and not the number of regexes that matched. We can't change the name (or remove the method) obviously, but we do add a warning to the docs. Fixes #625 --- src/re_set.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/re_set.rs b/src/re_set.rs index 92d475f7b..7c8253f0c 100644 --- a/src/re_set.rs +++ b/src/re_set.rs @@ -321,6 +321,11 @@ impl SetMatches { } /// The total number of regexes in the set that created these matches. + /// + /// **WARNING:** This always returns the same value as [`RegexSet::len`]. + /// In particular, it does *not* return the number of elements yielded by + /// [`SetMatches::iter`]. The only way to determine the total number of + /// matched regexes is to iterate over them. pub fn len(&self) -> usize { self.matches.len() } From f3de42b600e04928989a2e1aaeb427f9969e7070 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 6 Mar 2023 11:06:50 -0500 Subject: [PATCH 73/79] doc: add example that uses an alternation And we make it an interesting example, i.e., one that demonstrates preference order semantics. Closes #610 --- src/lib.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 5f92bc85a..97d304787 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -327,6 +327,25 @@ xy concatenation (x followed by y) x|y alternation (x or y, prefer x) +This example shows how an alternation works, and what it means to prefer a +branch in the alternation over subsequent branches. + +``` +use regex::Regex; + +let haystack = "samwise"; +// If 'samwise' comes first in our alternation, then it is +// preferred as a match, even if the regex engine could +// technically detect that 'sam' led to a match earlier. +let re = Regex::new(r"samwise|sam").unwrap(); +assert_eq!("samwise", re.find(haystack).unwrap().as_str()); +// But if 'sam' comes first, then it will match instead. +// In this case, it is impossible for 'samwise' to match +// because 'sam' is a prefix of it. +let re = Regex::new(r"sam|samwise").unwrap(); +assert_eq!("sam", re.find(haystack).unwrap().as_str()); +``` + ## Repetitions

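The `SetMatches::len` warning added in [PATCH 72/79] above is easy to trip over. Here is a minimal illustrative sketch of the difference it describes (the patterns and haystack are invented for the example):

```
use regex::RegexSet;

// Three patterns in the set, but only two of them match the haystack.
let set = RegexSet::new(&[r"\d+", r"[a-z]+", r"foo"]).unwrap();
let matches = set.matches("abc123");

// `len` reports how many regexes are in the set...
assert_eq!(3, matches.len());
// ...while counting the iterator reports how many actually matched.
assert_eq!(2, matches.iter().count());
```
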
From cdf6325d8ef49c5bf3a174ebc38f7970e39e1e3b Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Mon, 6 Mar 2023 11:17:22 -0500
Subject: [PATCH 74/79] api: add Regex::captures_at

This isn't *strictly* needed because of the existence of
Regex::captures_read_at, but it does fill out the singular missing
method. Namely, all other search routines have an *_at variant, so we
might as well add it for Regex::captures too.

Closes #547
---
 src/re_bytes.rs   | 26 ++++++++++++++++++++------
 src/re_unicode.rs | 26 ++++++++++++++++++++------
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/src/re_bytes.rs b/src/re_bytes.rs
index 7d488a95b..fcc9cd5bb 100644
--- a/src/re_bytes.rs
+++ b/src/re_bytes.rs
@@ -265,12 +265,7 @@ impl Regex {
     /// The `0`th capture group is always unnamed, so it must always be
     /// accessed with `get(0)` or `[0]`.
     pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
-        let mut locs = self.capture_locations();
-        self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
-            text,
-            locs: locs.0,
-            named_groups: self.0.capture_name_idx().clone(),
-        })
+        self.captures_at(text, 0)
     }
 
     /// Returns an iterator over all the non-overlapping capture groups matched
@@ -617,6 +612,25 @@ impl Regex {
             .map(|(s, e)| Match::new(text, s, e))
     }
 
+    /// Returns the same as [`Regex::captures`], but starts the search at the
+    /// given offset.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `start == 0`.
+    pub fn captures_at<'t>(
+        &self,
+        text: &'t [u8],
+        start: usize,
+    ) -> Option<Captures<'t>> {
+        let mut locs = self.capture_locations();
+        self.captures_read_at(&mut locs, text, start).map(move |_| Captures {
+            text,
+            locs: locs.0,
+            named_groups: self.0.capture_name_idx().clone(),
+        })
+    }
+
     /// This is like `captures`, but uses
     /// [`CaptureLocations`](struct.CaptureLocations.html)
     /// instead of
diff --git a/src/re_unicode.rs b/src/re_unicode.rs
index 1e8bd0453..296736080 100644
--- a/src/re_unicode.rs
+++ b/src/re_unicode.rs
@@ -321,12 +321,7 @@ impl Regex {
     /// The `0`th capture group is always unnamed, so it must always be
     /// accessed with `get(0)` or `[0]`.
     pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
-        let mut locs = self.capture_locations();
-        self.captures_read_at(&mut locs, text, 0).map(move |_| Captures {
-            text,
-            locs: locs.0,
-            named_groups: self.0.capture_name_idx().clone(),
-        })
+        self.captures_at(text, 0)
     }
 
     /// Returns an iterator over all the non-overlapping capture groups matched
@@ -675,6 +670,25 @@ impl Regex {
             .map(|(s, e)| Match::new(text, s, e))
     }
 
+    /// Returns the same as [`Regex::captures`], but starts the search at the
+    /// given offset.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `start == 0`.
+    pub fn captures_at<'t>(
+        &self,
+        text: &'t str,
+        start: usize,
+    ) -> Option<Captures<'t>> {
+        let mut locs = self.capture_locations();
+        self.captures_read_at(&mut locs, text, start).map(move |_| Captures {
+            text,
+            locs: locs.0,
+            named_groups: self.0.capture_name_idx().clone(),
+        })
+    }
+
     /// This is like `captures`, but uses
     /// [`CaptureLocations`](struct.CaptureLocations.html)
     /// instead of

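To make the "surrounding context" wording above concrete, here is a small sketch (pattern and haystack invented) contrasting `captures_at` with slicing the haystack first:

```
use regex::Regex;

let hay = "hello world";
let re = Regex::new(r"\Aworld").unwrap();

// Starting the search at offset 6 is not the same as searching `&hay[6..]`:
// the `\A` anchor still refers to the start of the full haystack, so this
// finds no match.
assert!(re.captures_at(hay, 6).is_none());
// Slicing first changes the context, so the anchor can match.
assert!(re.captures(&hay[6..]).is_some());
```
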
From 398843133b57c56820c1994b21d9589aa51ec7e0 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Mon, 6 Mar 2023 13:23:45 -0500
Subject: [PATCH 75/79] api: improve Debug impl for Match

This makes it so the Debug impl for Match only shows the actual matched
text. Otherwise, the Match shows the entire haystack, which is likely to
be misleading.

Fixes #514
---
 src/re_bytes.rs   | 20 +++++++++++++++++++-
 src/re_unicode.rs | 12 +++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/re_bytes.rs b/src/re_bytes.rs
index fcc9cd5bb..e3a3b019b 100644
--- a/src/re_bytes.rs
+++ b/src/re_bytes.rs
@@ -17,7 +17,7 @@ use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
 /// Match represents a single match of a regex in a haystack.
 ///
 /// The lifetime parameter `'t` refers to the lifetime of the matched text.
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy, Clone, Eq, PartialEq)]
 pub struct Match<'t> {
     text: &'t [u8],
     start: usize,
@@ -69,6 +69,24 @@ impl<'t> Match<'t> {
     }
 }
 
+impl<'t> std::fmt::Debug for Match<'t> {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let mut fmt = f.debug_struct("Match");
+        fmt.field("start", &self.start).field("end", &self.end);
+        if let Ok(s) = std::str::from_utf8(self.as_bytes()) {
+            fmt.field("bytes", &s);
+        } else {
+            // FIXME: It would be nice if this could be printed as a string
+            // with invalid UTF-8 replaced with hex escapes. An alloc would
+            // probably be okay if that makes it easier, but regex-automata does
+            // (at time of writing) have internal routines that do this. So
+            // maybe we should expose them.
+            fmt.field("bytes", &self.as_bytes());
+        }
+        fmt.finish()
+    }
+}
+
 impl<'t> From<Match<'t>> for Range<usize> {
     fn from(m: Match<'t>) -> Range<usize> {
         m.range()
diff --git a/src/re_unicode.rs b/src/re_unicode.rs
index 296736080..57689086d 100644
--- a/src/re_unicode.rs
+++ b/src/re_unicode.rs
@@ -25,7 +25,7 @@ pub fn escape(text: &str) -> String {
 /// Match represents a single match of a regex in a haystack.
 ///
 /// The lifetime parameter `'t` refers to the lifetime of the matched text.
-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy, Clone, Eq, PartialEq)]
 pub struct Match<'t> {
     text: &'t str,
     start: usize,
@@ -77,6 +77,16 @@ impl<'t> Match<'t> {
     }
 }
 
+impl<'t> std::fmt::Debug for Match<'t> {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        f.debug_struct("Match")
+            .field("start", &self.start)
+            .field("end", &self.end)
+            .field("string", &self.as_str())
+            .finish()
+    }
+}
+
 impl<'t> From<Match<'t>> for &'t str {
     fn from(m: Match<'t>) -> &'t str {
         m.as_str()

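A quick sketch of what the new `Debug` impl buys in practice (haystack invented; only the matched text and its span show up in the output, not the rest of the haystack):

```
use regex::Regex;

let re = Regex::new(r"\d{3}").unwrap();
let m = re.find("phone: 555-1234").unwrap();

// The Debug output names the span and the matched text only.
let dbg = format!("{:?}", m);
assert!(dbg.contains("555"));
assert!(!dbg.contains("phone"));
```
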
From 3f4bfa6f72ea95f039f8fe672bb42be903d81417 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Mon, 20 Mar 2023 22:06:28 -0400
Subject: [PATCH 76/79] syntax: add 'Repetition::with'

This is useful when doing structural recursion on a '&Hir' to produce a
new 'Hir' derived from it.
---
 regex-syntax/src/hir/mod.rs | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index 69c1dd403..36989cc2d 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -1739,6 +1739,19 @@ pub struct Repetition {
     pub sub: Box<Hir>,
 }
 
+impl Repetition {
+    /// Returns a new repetition with the same `min`, `max` and `greedy`
+    /// values, but with its sub-expression replaced with the one given.
+    pub fn with(&self, sub: Hir) -> Repetition {
+        Repetition {
+            min: self.min,
+            max: self.max,
+            greedy: self.greedy,
+            sub: Box::new(sub),
+        }
+    }
+}
+
 /// A type describing the different flavors of `.`.
 ///
 /// This type is meant to be used with [`Hir::dot`], which is a convenience

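As a rough sketch of the structural rewriting this enables (the pattern is invented and this assumes the regex-syntax 0.7 API used throughout this series):

```
use regex_syntax::hir::{Hir, HirKind};
use regex_syntax::Parser;

// Parse `a{2,5}` and rebuild the repetition around a different
// sub-expression, reusing its min/max/greedy settings via `with`.
let hir = Parser::new().parse(r"a{2,5}").unwrap();
if let HirKind::Repetition(rep) = hir.kind() {
    let rebuilt = rep.with(Hir::literal("b".as_bytes()));
    assert_eq!((2, Some(5), true), (rebuilt.min, rebuilt.max, rebuilt.greedy));
}
```
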
From e166658f41dbdec9ac05ca538937d4fd3aae4e7c Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Fri, 24 Mar 2023 10:58:25 -0400
Subject: [PATCH 77/79] syntax: add 'Properties::memory_usage'

Since it uses heap memory and because it's something you typically hang
on to in a regex engine, we expose a routine for computing heap memory.

We might consider doing this for other types in regex-syntax, but there
hasn't been a strong need for it yet.
---
 regex-syntax/src/hir/mod.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs
index 36989cc2d..a19808317 100644
--- a/regex-syntax/src/hir/mod.rs
+++ b/regex-syntax/src/hir/mod.rs
@@ -2080,6 +2080,13 @@ impl Properties {
         self.0.alternation_literal
     }
 
+    /// Returns the total amount of heap memory usage, in bytes, used by this
+    /// `Properties` value.
+    #[inline]
+    pub fn memory_usage(&self) -> usize {
+        core::mem::size_of::<PropertiesI>()
+    }
+
     /// Returns a new set of properties that corresponds to the union of the
     /// iterator of properties given.
     ///

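A tiny usage sketch (pattern invented); the exact number is an implementation detail, so it only checks that something non-zero is reported:

```
use regex_syntax::Parser;

let hir = Parser::new().parse(r"(foo|bar)+").unwrap();
// The HIR's precomputed properties now report their own heap usage.
assert!(hir.properties().memory_usage() > 0);
```
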
From a34c1a7a746f561a11b4c8f2a8e2ee66901c8bb9 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Sat, 15 Apr 2023 08:29:15 -0400
Subject: [PATCH 78/79] doc: tweak presentation of \pN syntax

The wording appears to be a little unclear, so we switch it around a
bit.

Fixes #975
---
 src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 97d304787..82c1b77ad 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -287,9 +287,9 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax).
 .             any character except new line (includes new line with s flag)
 \d            digit (\p{Nd})
 \D            not digit
-\pN           One-letter name Unicode character class
+\pX           Unicode character class identified by a one-letter name
 \p{Greek}     Unicode character class (general category or script)
-\PN           Negated one-letter name Unicode character class
+\PX           Negated Unicode character class identified by a one-letter name
 \P{Greek}     negated Unicode character class (general category or script)
 
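For instance, a small sketch of the one-letter form in action (haystack invented for illustration):

```
use regex::Regex;

// `\pL` is the one-letter form of `\p{L}`: any codepoint in the "letter"
// general category. `\PL` is its negation.
let re = Regex::new(r"\pL+").unwrap();
assert_eq!("Δδ", re.find("1Δδ2").unwrap().as_str());
```
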
From 82b0f0dacb2579db008633965af28006b5e09839 Mon Sep 17 00:00:00 2001
From: Andrew Gallant 
Date: Mon, 17 Apr 2023 13:55:36 -0400
Subject: [PATCH 79/79] changelog: add entry for regex 1.8

This will need to be updated again to add a date (maybe today?), but
this should cover everything from the commit log.
---
 CHANGELOG.md | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44274acac..a07999ede 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,120 @@
+1.8.0 (TBD)
+===========
+This is a sizeable release that will soon be followed by another sizeable
+release. Both of them combined will close over 40 existing issues and PRs.
+
+This first release, despite its size, essentially represents preparatory work
+for the second release, which will be even bigger. Namely, this release:
+
+* Increases the MSRV to Rust 1.60.0, which was released about 1 year ago.
+* Upgrades its dependency on `aho-corasick` to the recently released 1.0
+version.
+* Upgrades its dependency on `regex-syntax` to the simultaneously released
+`0.7` version. The changes to `regex-syntax` principally revolve around a
+rewrite of its literal extraction code and a number of simplifications and
+optimizations to its high-level intermediate representation (HIR).
+
+The second release, which will follow ~shortly after the release above, will
+contain a soup-to-nuts rewrite of every regex engine. This will be done by
+bringing [`regex-automata`](https://github.com/BurntSushi/regex-automata) into
+this repository, and then changing the `regex` crate to be nothing but an API
+shim layer on top of `regex-automata`'s API.
+
+These tandem releases are the culmination of about 3
+years of on-and-off work that [began in earnest in March
+2020](https://github.com/rust-lang/regex/issues/656).
+
+Because of the scale of changes involved in these releases, I would love to
+hear about your experience. Especially if you notice undocumented changes in
+behavior or performance changes (positive *or* negative).
+
+Most changes in the first release are listed below. For more details, please
+see the commit log, which reflects a linear and decently documented history
+of all changes.
+
+New features:
+
+* [FEATURE #501](https://github.com/rust-lang/regex/issues/501):
+Permit many more characters to be escaped, even if they have no significance.
+More specifically, any character except for `[0-9A-Za-z<>]` can now be
+escaped. Also, a new routine, `is_escapeable_character`, has been added to
+`regex-syntax` to query whether a character is escapeable or not.
+* [FEATURE #547](https://github.com/rust-lang/regex/issues/547):
+Add `Regex::captures_at`. This fills a hole in the API, but doesn't otherwise
+introduce any new expressive power.
+* [FEATURE #595](https://github.com/rust-lang/regex/issues/595):
+Capture group names are now Unicode-aware. They can now begin with either a `_`
+or any "alphabetic" codepoint. After the first codepoint, subsequent codepoints
+can be any sequence of alpha-numeric codepoints, along with `_`, `.`, `[` and
+`]`. Note that replacement syntax has not changed.
+* [FEATURE #810](https://github.com/rust-lang/regex/issues/810):
+Add `Match::is_empty` and `Match::len` APIs.
+* [FEATURE #905](https://github.com/rust-lang/regex/issues/905):
+Add an `impl Default for RegexSet`, with the default being the empty set.
+* [FEATURE #908](https://github.com/rust-lang/regex/issues/908):
+A new method, `Regex::static_captures_len`, has been added which returns the
+number of capture groups in the pattern if and only if every possible match
+always contains the same number of matching groups.
+* [FEATURE #955](https://github.com/rust-lang/regex/issues/955):
+Named captures can now be written as `(?<name>re)` in addition to
+`(?P<name>re)`.
+* FEATURE: `regex-syntax` now supports empty character classes.
+* FEATURE: `regex-syntax` now has an optional `std` feature. (This will come
+to `regex` in the second release.)
+* FEATURE: The `Hir` type in `regex-syntax` has had a number of simplifications
+made to it.
+* FEATURE: `regex-syntax` has support for a new `R` flag for enabling CRLF
+mode. This will be supported in `regex` proper in the second release.
+* FEATURE: `regex-syntax` now has proper support for "regex that never
+matches" via `Hir::fail()`.
+* FEATURE: The `hir::literal` module of `regex-syntax` has been completely
+re-worked. It now has more documentation, examples and advice.
+* FEATURE: The `allow_invalid_utf8` option in `regex-syntax` has been renamed
+to `utf8`, and the meaning of the boolean has been flipped.
+
+Performance improvements:
+
+Bug fixes:
+
+* [BUG #514](https://github.com/rust-lang/regex/issues/514):
+Improve `Debug` impl for `Match` so that it doesn't show the entire haystack.
+* BUGS [#516](https://github.com/rust-lang/regex/issues/516),
+[#731](https://github.com/rust-lang/regex/issues/731):
+Fix a number of issues with printing `Hir` values as regex patterns.
+* [BUG #610](https://github.com/rust-lang/regex/issues/610):
+Add explicit example of `foo|bar` in the regex syntax docs.
+* [BUG #625](https://github.com/rust-lang/regex/issues/625):
+Clarify that `SetMatches::len` does not (regrettably) refer to the number of
+matches in the set.
+* [BUG #660](https://github.com/rust-lang/regex/issues/660):
+Clarify "verbose mode" in regex syntax documentation.
+* BUG [#738](https://github.com/rust-lang/regex/issues/738),
+[#950](https://github.com/rust-lang/regex/issues/950):
+Fix `CaptureLocations::get` so that it never panics.
+* [BUG #747](https://github.com/rust-lang/regex/issues/747):
+Clarify documentation for `Regex::shortest_match`.
+* [BUG #835](https://github.com/rust-lang/regex/issues/835):
+Fix `\p{Sc}` so that it is equivalent to `\p{Currency_Symbol}`.
+* [BUG #846](https://github.com/rust-lang/regex/issues/846):
+Add more clarifying documentation to the `CompiledTooBig` error variant.
+* [BUG #854](https://github.com/rust-lang/regex/issues/854):
+Clarify that `regex::Regex` searches as if the haystack is a sequence of
+Unicode scalar values.
+* [BUG #884](https://github.com/rust-lang/regex/issues/884):
+Replace `__Nonexhaustive` variants with `#[non_exhaustive]` attribute.
+* [BUG #893](https://github.com/rust-lang/regex/pull/893):
+Optimize case folding since it can get quite slow in some pathological cases.
+* [BUG #895](https://github.com/rust-lang/regex/issues/895):
+Reject `(?-u:\W)` in `regex::Regex` APIs.
+* [BUG #942](https://github.com/rust-lang/regex/issues/942):
+Add a missing `void` keyword to indicate "no parameters" in C API.
+* [BUG #965](https://github.com/rust-lang/regex/issues/965):
+Fix `\p{Lc}` so that it is equivalent to `\p{Cased_Letter}`.
+* [BUG #975](https://github.com/rust-lang/regex/issues/975):
+Clarify documentation for `\pX` syntax.
+ + + 1.7.3 (2023-03-24) ================== This is a small release that fixes a bug in `Regex::shortest_match_at` that @@ -743,7 +860,7 @@ Bug gixes: ================== This release includes a ground-up rewrite of the regex-syntax crate, which has been in development for over a year. - +731 New features: * Error messages for invalid regexes have been greatly improved. You get these