diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5171d79363..bacb4b0873 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,7 +52,7 @@ jobs: include: - build: pinned os: ubuntu-latest - rust: 1.41.1 + rust: 1.60.0 - build: stable os: ubuntu-latest rust: stable @@ -159,11 +159,6 @@ jobs: cd regex-capi ./test - - if: matrix.build == 'nightly' - name: Compile regex-debug - run: | - ${{ env.CARGO }} build --verbose --manifest-path regex-debug/Cargo.toml $TARGET - - if: matrix.build == 'nightly' name: Run benchmarks as tests run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index 44274acac5..a07999edee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,120 @@ +1.8.0 (TBD) +=========== +This is a sizeable release that will be soon followed by another sizeable +release. Both of them combined will close over 40 existing issues and PRs. + +This first release, despite its size, essentially represents preparatory work +for the second release, which will be even bigger. Namely, this release: + +* Increases the MSRV to Rust 1.60.0, which was released about 1 year ago. +* Upgrades its dependency on `aho-corasick` to the recently released 1.0 +version. +* Upgrades its dependency on `regex-syntax` to the simultaneously released +`0.7` version. The changes to `regex-syntax` principally revolve around a +rewrite of its literal extraction code and a number of simplifications and +optimizations to its high-level intermediate representation (HIR). + +The second release, which will follow ~shortly after the release above, will +contain a soup-to-nuts rewrite of every regex engine. This will be done by +bringing [`regex-automata`](https://github.com/BurntSushi/regex-automata) into +this repository, and then changing the `regex` crate to be nothing but an API +shim layer on top of `regex-automata`'s API. 
+ +These tandem releases are the culmination of about 3 +years of on-and-off work that [began in earnest in March +2020](https://github.com/rust-lang/regex/issues/656). + +Because of the scale of changes involved in these releases, I would love to +hear about your experience. Especially if you notice undocumented changes in +behavior or performance changes (positive *or* negative). + +Most changes in the first release are listed below. For more details, please +see the commit log, which reflects a linear and decently documented history +of all changes. + +New features: + +* [FEATURE #501](https://github.com/rust-lang/regex/issues/501): +Permit many more characters to be escaped, even if they have no significance. +More specifically, any character except for `[0-9A-Za-z<>]` can now be +escaped. Also, a new routine, `is_escapeable_character`, has been added to +`regex-syntax` to query whether a character is escapeable or not. +* [FEATURE #547](https://github.com/rust-lang/regex/issues/547): +Add `Regex::captures_at`. This fills a hole in the API, but doesn't otherwise +introduce any new expressive power. +* [FEATURE #595](https://github.com/rust-lang/regex/issues/595): +Capture group names are now Unicode-aware. They can now begin with either a `_` +or any "alphabetic" codepoint. After the first codepoint, subsequent codepoints +can be any sequence of alpha-numeric codepoints, along with `_`, `.`, `[` and +`]`. Note that replacement syntax has not changed. +* [FEATURE #810](https://github.com/rust-lang/regex/issues/810): +Add `Match::is_empty` and `Match::len` APIs. +* [FEATURE #905](https://github.com/rust-lang/regex/issues/905): +Add an `impl Default for RegexSet`, with the default being the empty set. 
+* [FEATURE #908](https://github.com/rust-lang/regex/issues/908): +A new method, `Regex::static_captures_len`, has been added which returns the +number of capture groups in the pattern if and only if every possible match +always contains the same number of matching groups. +* [FEATURE #955](https://github.com/rust-lang/regex/issues/955): +Named captures can now be written as `(?<name>re)` in addition to +`(?P<name>re)`. +* FEATURE: `regex-syntax` now supports empty character classes. +* FEATURE: `regex-syntax` now has an optional `std` feature. (This will come +to `regex` in the second release.) +* FEATURE: The `Hir` type in `regex-syntax` has had a number of simplifications +made to it. +* FEATURE: `regex-syntax` has support for a new `R` flag for enabling CRLF +mode. This will be supported in `regex` proper in the second release. +* FEATURE: `regex-syntax` now has proper support for "regex that never +matches" via `Hir::fail()`. +* FEATURE: The `hir::literal` module of `regex-syntax` has been completely +re-worked. It now has more documentation, examples and advice. +* FEATURE: The `allow_invalid_utf8` option in `regex-syntax` has been renamed +to `utf8`, and the meaning of the boolean has been flipped. + +Performance improvements: + +Bug fixes: + +* [BUG #514](https://github.com/rust-lang/regex/issues/514): +Improve `Debug` impl for `Match` so that it doesn't show the entire haystack. +* BUGS [#516](https://github.com/rust-lang/regex/issues/516), +[#731](https://github.com/rust-lang/regex/issues/731): +Fix a number of issues with printing `Hir` values as regex patterns. +* [BUG #610](https://github.com/rust-lang/regex/issues/610): +Add explicit example of `foo|bar` in the regex syntax docs. +* [BUG #625](https://github.com/rust-lang/regex/issues/625): +Clarify that `SetMatches::len` does not (regrettably) refer to the number of +matches in the set. +* [BUG #660](https://github.com/rust-lang/regex/issues/660): +Clarify "verbose mode" in regex syntax documentation. 
+* BUG [#738](https://github.com/rust-lang/regex/issues/738), +[#950](https://github.com/rust-lang/regex/issues/950): +Fix `CaptureLocations::get` so that it never panics. +* [BUG #747](https://github.com/rust-lang/regex/issues/747): +Clarify documentation for `Regex::shortest_match`. +* [BUG #835](https://github.com/rust-lang/regex/issues/835): +Fix `\p{Sc}` so that it is equivalent to `\p{Currency_Symbol}`. +* [BUG #846](https://github.com/rust-lang/regex/issues/846): +Add more clarifying documentation to the `CompiledTooBig` error variant. +* [BUG #854](https://github.com/rust-lang/regex/issues/854): +Clarify that `regex::Regex` searches as if the haystack is a sequence of +Unicode scalar values. +* [BUG #884](https://github.com/rust-lang/regex/issues/884): +Replace `__Nonexhaustive` variants with `#[non_exhaustive]` attribute. +* [BUG #893](https://github.com/rust-lang/regex/pull/893): +Optimize case folding since it can get quite slow in some pathological cases. +* [BUG #895](https://github.com/rust-lang/regex/issues/895): +Reject `(?-u:\W)` in `regex::Regex` APIs. +* [BUG #942](https://github.com/rust-lang/regex/issues/942): +Add a missing `void` keyword to indicate "no parameters" in C API. +* [BUG #965](https://github.com/rust-lang/regex/issues/965): +Fix `\p{Lc}` so that it is equivalent to `\p{Cased_Letter}`. +* [BUG #975](https://github.com/rust-lang/regex/issues/975): +Clarify documentation for `\pX` syntax. + + + 1.7.3 (2023-03-24) ================== This is a small release that fixes a bug in `Regex::shortest_match_at` that @@ -743,7 +860,7 @@ Bug gixes: ================== This release includes a ground-up rewrite of the regex-syntax crate, which has been in development for over a year. - + New features: * Error messages for invalid regexes have been greatly improved. 
diff --git a/Cargo.toml b/Cargo.toml index 4c5bd1cc11..f4c70aa1a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,11 +14,12 @@ finite automata and guarantees linear time matching on all inputs. categories = ["text-processing"] autotests = false exclude = ["/scripts/*", "/.github/*"] -edition = "2018" +edition = "2021" +rust-version = "1.60.0" [workspace] members = [ - "bench", "regex-capi", "regex-debug", "regex-syntax", + "bench", "regex-capi", "regex-syntax", ] [lib] @@ -106,7 +107,7 @@ pattern = [] # For very fast prefix literal matching. [dependencies.aho-corasick] -version = "0.7.18" +version = "1.0.0" optional = true # For skipping along search text quickly when a leading byte is known. diff --git a/bench/log/10-last-frontier/rust-after-literal.log b/bench/log/10-last-frontier/rust-after-literal.log new file mode 100644 index 0000000000..c45b55cac7 --- /dev/null +++ b/bench/log/10-last-frontier/rust-after-literal.log @@ -0,0 +1,124 @@ + +running 119 tests +test misc::anchored_literal_long_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_long_non_match ... bench: 20 ns/iter (+/- 0) = 19500 MB/s +test misc::anchored_literal_short_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::anchored_literal_short_non_match ... bench: 20 ns/iter (+/- 0) = 1300 MB/s +test misc::easy0_1K ... bench: 51 ns/iter (+/- 2) = 20607 MB/s +test misc::easy0_1MB ... bench: 56 ns/iter (+/- 1) = 18725053 MB/s +test misc::easy0_32 ... bench: 51 ns/iter (+/- 0) = 1156 MB/s +test misc::easy0_32K ... bench: 53 ns/iter (+/- 1) = 618773 MB/s +test misc::easy1_1K ... bench: 41 ns/iter (+/- 0) = 25463 MB/s +test misc::easy1_1MB ... bench: 44 ns/iter (+/- 1) = 23831727 MB/s +test misc::easy1_32 ... bench: 40 ns/iter (+/- 1) = 1300 MB/s +test misc::easy1_32K ... bench: 40 ns/iter (+/- 1) = 819700 MB/s +test misc::hard_1K ... bench: 51 ns/iter (+/- 2) = 20607 MB/s +test misc::hard_1MB ... 
bench: 56 ns/iter (+/- 1) = 18725053 MB/s +test misc::hard_32 ... bench: 51 ns/iter (+/- 2) = 1156 MB/s +test misc::hard_32K ... bench: 51 ns/iter (+/- 1) = 643039 MB/s +test misc::is_match_set ... bench: 61 ns/iter (+/- 2) = 409 MB/s +test misc::literal ... bench: 13 ns/iter (+/- 0) = 3923 MB/s +test misc::long_needle1 ... bench: 3,242 ns/iter (+/- 79) = 30845 MB/s +test misc::long_needle2 ... bench: 350,572 ns/iter (+/- 6,860) = 285 MB/s +test misc::match_class ... bench: 62 ns/iter (+/- 6) = 1306 MB/s +test misc::match_class_in_range ... bench: 14 ns/iter (+/- 0) = 5785 MB/s +test misc::match_class_unicode ... bench: 259 ns/iter (+/- 15) = 621 MB/s +test misc::matches_set ... bench: 462 ns/iter (+/- 9) = 54 MB/s +test misc::medium_1K ... bench: 53 ns/iter (+/- 0) = 19849 MB/s +test misc::medium_1MB ... bench: 58 ns/iter (+/- 1) = 18079379 MB/s +test misc::medium_32 ... bench: 53 ns/iter (+/- 1) = 1132 MB/s +test misc::medium_32K ... bench: 53 ns/iter (+/- 1) = 618792 MB/s +test misc::no_exponential ... bench: 423 ns/iter (+/- 13) = 236 MB/s +test misc::not_literal ... bench: 89 ns/iter (+/- 0) = 573 MB/s +test misc::one_pass_long_prefix ... bench: 52 ns/iter (+/- 0) = 500 MB/s +test misc::one_pass_long_prefix_not ... bench: 52 ns/iter (+/- 1) = 500 MB/s +test misc::one_pass_short ... bench: 38 ns/iter (+/- 1) = 447 MB/s +test misc::one_pass_short_not ... bench: 41 ns/iter (+/- 1) = 414 MB/s +test misc::reallyhard2_1K ... bench: 81 ns/iter (+/- 1) = 12839 MB/s +test misc::reallyhard_1K ... bench: 1,592 ns/iter (+/- 1) = 660 MB/s +test misc::reallyhard_1MB ... bench: 1,575,822 ns/iter (+/- 39,203) = 665 MB/s +test misc::reallyhard_32 ... bench: 102 ns/iter (+/- 0) = 578 MB/s +test misc::reallyhard_32K ... bench: 49,328 ns/iter (+/- 2,598) = 664 MB/s +test misc::replace_all ... bench: 132 ns/iter (+/- 3) +test misc::reverse_suffix_no_quadratic ... bench: 4,171 ns/iter (+/- 134) = 1918 MB/s +test misc::short_haystack_1000000x ... 
bench: 132,251 ns/iter (+/- 729) = 60491 MB/s +test misc::short_haystack_100000x ... bench: 13,184 ns/iter (+/- 408) = 60680 MB/s +test misc::short_haystack_10000x ... bench: 6,036 ns/iter (+/- 167) = 13255 MB/s +test misc::short_haystack_1000x ... bench: 602 ns/iter (+/- 14) = 13307 MB/s +test misc::short_haystack_100x ... bench: 230 ns/iter (+/- 7) = 3526 MB/s +test misc::short_haystack_10x ... bench: 218 ns/iter (+/- 3) = 417 MB/s +test misc::short_haystack_1x ... bench: 210 ns/iter (+/- 8) = 90 MB/s +test misc::short_haystack_2x ... bench: 225 ns/iter (+/- 6) = 120 MB/s +test misc::short_haystack_3x ... bench: 211 ns/iter (+/- 8) = 165 MB/s +test misc::short_haystack_4x ... bench: 212 ns/iter (+/- 6) = 202 MB/s +test regexdna::find_new_lines ... bench: 12,245,066 ns/iter (+/- 117,141) = 415 MB/s +test regexdna::subst1 ... bench: 786,357 ns/iter (+/- 14,200) = 6464 MB/s +test regexdna::subst10 ... bench: 788,550 ns/iter (+/- 26,456) = 6446 MB/s +test regexdna::subst11 ... bench: 782,161 ns/iter (+/- 15,583) = 6499 MB/s +test regexdna::subst2 ... bench: 784,902 ns/iter (+/- 23,379) = 6476 MB/s +test regexdna::subst3 ... bench: 786,640 ns/iter (+/- 27,063) = 6462 MB/s +test regexdna::subst4 ... bench: 785,591 ns/iter (+/- 20,498) = 6470 MB/s +test regexdna::subst5 ... bench: 787,447 ns/iter (+/- 20,892) = 6455 MB/s +test regexdna::subst6 ... bench: 784,994 ns/iter (+/- 19,687) = 6475 MB/s +test regexdna::subst7 ... bench: 801,921 ns/iter (+/- 15,391) = 6339 MB/s +test regexdna::subst8 ... bench: 785,541 ns/iter (+/- 11,908) = 6471 MB/s +test regexdna::subst9 ... bench: 785,848 ns/iter (+/- 28,020) = 6468 MB/s +test regexdna::variant1 ... bench: 2,195,058 ns/iter (+/- 44,066) = 2315 MB/s +test regexdna::variant2 ... bench: 3,219,968 ns/iter (+/- 59,372) = 1578 MB/s +test regexdna::variant3 ... bench: 3,776,467 ns/iter (+/- 54,326) = 1346 MB/s +test regexdna::variant4 ... bench: 3,803,674 ns/iter (+/- 95,281) = 1336 MB/s +test regexdna::variant5 ... 
bench: 2,661,333 ns/iter (+/- 46,408) = 1910 MB/s +test regexdna::variant6 ... bench: 2,645,716 ns/iter (+/- 38,659) = 1921 MB/s +test regexdna::variant7 ... bench: 3,228,352 ns/iter (+/- 69,155) = 1574 MB/s +test regexdna::variant8 ... bench: 3,305,563 ns/iter (+/- 59,321) = 1537 MB/s +test regexdna::variant9 ... bench: 3,225,039 ns/iter (+/- 49,720) = 1576 MB/s +test rust_compile::compile_huge ... bench: 100,381 ns/iter (+/- 2,052) +test rust_compile::compile_huge_bytes ... bench: 5,899,989 ns/iter (+/- 114,363) +test rust_compile::compile_huge_full ... bench: 11,650,995 ns/iter (+/- 172,285) +test rust_compile::compile_simple ... bench: 4,082 ns/iter (+/- 88) +test rust_compile::compile_simple_bytes ... bench: 4,153 ns/iter (+/- 120) +test rust_compile::compile_simple_full ... bench: 20,414 ns/iter (+/- 1,860) +test rust_compile::compile_small ... bench: 9,114 ns/iter (+/- 216) +test rust_compile::compile_small_bytes ... bench: 183,049 ns/iter (+/- 9,917) +test rust_compile::compile_small_full ... bench: 361,291 ns/iter (+/- 11,045) +test sherlock::before_after_holmes ... bench: 907,103 ns/iter (+/- 12,165) = 655 MB/s +test sherlock::before_holmes ... bench: 62,501 ns/iter (+/- 1,880) = 9518 MB/s +test sherlock::everything_greedy ... bench: 2,062,116 ns/iter (+/- 41,900) = 288 MB/s +test sherlock::everything_greedy_nl ... bench: 894,529 ns/iter (+/- 38,723) = 665 MB/s +test sherlock::holmes_cochar_watson ... bench: 103,305 ns/iter (+/- 3,798) = 5758 MB/s +test sherlock::holmes_coword_watson ... bench: 479,423 ns/iter (+/- 13,924) = 1240 MB/s +test sherlock::ing_suffix ... bench: 318,300 ns/iter (+/- 6,846) = 1869 MB/s +test sherlock::ing_suffix_limited_space ... bench: 1,066,300 ns/iter (+/- 19,375) = 557 MB/s +test sherlock::letters ... bench: 21,777,358 ns/iter (+/- 230,478) = 27 MB/s +test sherlock::letters_lower ... bench: 21,152,019 ns/iter (+/- 203,617) = 28 MB/s +test sherlock::letters_upper ... 
bench: 1,777,626 ns/iter (+/- 26,243) = 334 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 897,509 ns/iter (+/- 24,983) = 662 MB/s +test sherlock::name_alt1 ... bench: 32,255 ns/iter (+/- 681) = 18444 MB/s +test sherlock::name_alt2 ... bench: 86,369 ns/iter (+/- 2,494) = 6888 MB/s +test sherlock::name_alt3 ... bench: 97,618 ns/iter (+/- 564) = 6094 MB/s +test sherlock::name_alt3_nocase ... bench: 944,848 ns/iter (+/- 31,039) = 629 MB/s +test sherlock::name_alt4 ... bench: 122,029 ns/iter (+/- 2,716) = 4875 MB/s +test sherlock::name_alt4_nocase ... bench: 225,544 ns/iter (+/- 5,783) = 2637 MB/s +test sherlock::name_alt5 ... bench: 91,897 ns/iter (+/- 3,796) = 6473 MB/s +test sherlock::name_alt5_nocase ... bench: 936,420 ns/iter (+/- 15,092) = 635 MB/s +test sherlock::name_holmes ... bench: 33,448 ns/iter (+/- 959) = 17786 MB/s +test sherlock::name_holmes_nocase ... bench: 115,864 ns/iter (+/- 1,645) = 5134 MB/s +test sherlock::name_sherlock ... bench: 22,474 ns/iter (+/- 674) = 26472 MB/s +test sherlock::name_sherlock_holmes ... bench: 22,184 ns/iter (+/- 497) = 26818 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 99,629 ns/iter (+/- 2,398) = 5971 MB/s +test sherlock::name_sherlock_nocase ... bench: 99,523 ns/iter (+/- 2,674) = 5977 MB/s +test sherlock::name_whitespace ... bench: 30,815 ns/iter (+/- 107) = 19306 MB/s +test sherlock::no_match_common ... bench: 19,661 ns/iter (+/- 656) = 30259 MB/s +test sherlock::no_match_really_common ... bench: 27,544 ns/iter (+/- 527) = 21599 MB/s +test sherlock::no_match_uncommon ... bench: 19,553 ns/iter (+/- 31) = 30426 MB/s +test sherlock::quotes ... bench: 369,144 ns/iter (+/- 45,316) = 1611 MB/s +test sherlock::repeated_class_negation ... bench: 68,838,857 ns/iter (+/- 330,544) = 8 MB/s +test sherlock::the_lower ... bench: 321,692 ns/iter (+/- 5,418) = 1849 MB/s +test sherlock::the_nocase ... bench: 507,936 ns/iter (+/- 3,080) = 1171 MB/s +test sherlock::the_upper ... 
bench: 43,705 ns/iter (+/- 788) = 13612 MB/s +test sherlock::the_whitespace ... bench: 819,179 ns/iter (+/- 20,071) = 726 MB/s +test sherlock::word_ending_n ... bench: 1,700,300 ns/iter (+/- 36,623) = 349 MB/s +test sherlock::words ... bench: 8,249,767 ns/iter (+/- 75,015) = 72 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 119 measured; 0 filtered out; finished in 111.55s + diff --git a/bench/log/10-last-frontier/rust-before-literal.log b/bench/log/10-last-frontier/rust-before-literal.log new file mode 100644 index 0000000000..98b3496ae9 --- /dev/null +++ b/bench/log/10-last-frontier/rust-before-literal.log @@ -0,0 +1,124 @@ + +running 119 tests +test misc::anchored_literal_long_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_long_non_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_short_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::anchored_literal_short_non_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::easy0_1K ... bench: 15 ns/iter (+/- 0) = 70066 MB/s +test misc::easy0_1MB ... bench: 21 ns/iter (+/- 0) = 49933476 MB/s +test misc::easy0_32 ... bench: 15 ns/iter (+/- 0) = 3933 MB/s +test misc::easy0_32K ... bench: 14 ns/iter (+/- 0) = 2342500 MB/s +test misc::easy1_1K ... bench: 40 ns/iter (+/- 1) = 26100 MB/s +test misc::easy1_1MB ... bench: 45 ns/iter (+/- 1) = 23302133 MB/s +test misc::easy1_32 ... bench: 40 ns/iter (+/- 5) = 1300 MB/s +test misc::easy1_32K ... bench: 40 ns/iter (+/- 1) = 819700 MB/s +test misc::hard_1K ... bench: 51 ns/iter (+/- 1) = 20607 MB/s +test misc::hard_1MB ... bench: 56 ns/iter (+/- 0) = 18725053 MB/s +test misc::hard_32 ... bench: 51 ns/iter (+/- 3) = 1156 MB/s +test misc::hard_32K ... bench: 51 ns/iter (+/- 1) = 643039 MB/s +test misc::is_match_set ... bench: 61 ns/iter (+/- 2) = 409 MB/s +test misc::literal ... bench: 13 ns/iter (+/- 0) = 3923 MB/s +test misc::long_needle1 ... 
bench: 3,259 ns/iter (+/- 86) = 30684 MB/s +test misc::long_needle2 ... bench: 350,722 ns/iter (+/- 6,984) = 285 MB/s +test misc::match_class ... bench: 60 ns/iter (+/- 1) = 1350 MB/s +test misc::match_class_in_range ... bench: 14 ns/iter (+/- 0) = 5785 MB/s +test misc::match_class_unicode ... bench: 255 ns/iter (+/- 0) = 631 MB/s +test misc::matches_set ... bench: 481 ns/iter (+/- 11) = 51 MB/s +test misc::medium_1K ... bench: 15 ns/iter (+/- 0) = 70133 MB/s +test misc::medium_1MB ... bench: 22 ns/iter (+/- 0) = 47663818 MB/s +test misc::medium_32 ... bench: 15 ns/iter (+/- 0) = 4000 MB/s +test misc::medium_32K ... bench: 15 ns/iter (+/- 0) = 2186400 MB/s +test misc::no_exponential ... bench: 442 ns/iter (+/- 13) = 226 MB/s +test misc::not_literal ... bench: 89 ns/iter (+/- 1) = 573 MB/s +test misc::one_pass_long_prefix ... bench: 54 ns/iter (+/- 1) = 481 MB/s +test misc::one_pass_long_prefix_not ... bench: 52 ns/iter (+/- 1) = 500 MB/s +test misc::one_pass_short ... bench: 39 ns/iter (+/- 0) = 435 MB/s +test misc::one_pass_short_not ... bench: 42 ns/iter (+/- 0) = 404 MB/s +test misc::reallyhard2_1K ... bench: 83 ns/iter (+/- 6) = 12530 MB/s +test misc::reallyhard_1K ... bench: 1,592 ns/iter (+/- 4) = 660 MB/s +test misc::reallyhard_1MB ... bench: 1,575,691 ns/iter (+/- 29,668) = 665 MB/s +test misc::reallyhard_32 ... bench: 101 ns/iter (+/- 5) = 584 MB/s +test misc::reallyhard_32K ... bench: 49,325 ns/iter (+/- 1,734) = 664 MB/s +test misc::replace_all ... bench: 134 ns/iter (+/- 2) +test misc::reverse_suffix_no_quadratic ... bench: 4,189 ns/iter (+/- 274) = 1909 MB/s +test misc::short_haystack_1000000x ... bench: 132,182 ns/iter (+/- 4,966) = 60522 MB/s +test misc::short_haystack_100000x ... bench: 13,344 ns/iter (+/- 275) = 59952 MB/s +test misc::short_haystack_10000x ... bench: 6,119 ns/iter (+/- 285) = 13075 MB/s +test misc::short_haystack_1000x ... bench: 617 ns/iter (+/- 15) = 12983 MB/s +test misc::short_haystack_100x ... 
bench: 230 ns/iter (+/- 7) = 3526 MB/s +test misc::short_haystack_10x ... bench: 207 ns/iter (+/- 8) = 439 MB/s +test misc::short_haystack_1x ... bench: 213 ns/iter (+/- 7) = 89 MB/s +test misc::short_haystack_2x ... bench: 206 ns/iter (+/- 6) = 131 MB/s +test misc::short_haystack_3x ... bench: 207 ns/iter (+/- 10) = 169 MB/s +test misc::short_haystack_4x ... bench: 208 ns/iter (+/- 7) = 206 MB/s +test regexdna::find_new_lines ... bench: 12,275,804 ns/iter (+/- 145,331) = 414 MB/s +test regexdna::subst1 ... bench: 793,517 ns/iter (+/- 44,203) = 6406 MB/s +test regexdna::subst10 ... bench: 794,922 ns/iter (+/- 23,459) = 6394 MB/s +test regexdna::subst11 ... bench: 790,525 ns/iter (+/- 23,010) = 6430 MB/s +test regexdna::subst2 ... bench: 790,637 ns/iter (+/- 17,962) = 6429 MB/s +test regexdna::subst3 ... bench: 793,559 ns/iter (+/- 17,575) = 6405 MB/s +test regexdna::subst4 ... bench: 792,738 ns/iter (+/- 15,237) = 6412 MB/s +test regexdna::subst5 ... bench: 795,060 ns/iter (+/- 26,172) = 6393 MB/s +test regexdna::subst6 ... bench: 792,357 ns/iter (+/- 15,067) = 6415 MB/s +test regexdna::subst7 ... bench: 797,006 ns/iter (+/- 27,928) = 6378 MB/s +test regexdna::subst8 ... bench: 790,603 ns/iter (+/- 22,754) = 6429 MB/s +test regexdna::subst9 ... bench: 793,055 ns/iter (+/- 13,202) = 6409 MB/s +test regexdna::variant1 ... bench: 2,204,304 ns/iter (+/- 50,669) = 2306 MB/s +test regexdna::variant2 ... bench: 3,224,798 ns/iter (+/- 45,705) = 1576 MB/s +test regexdna::variant3 ... bench: 3,802,774 ns/iter (+/- 86,530) = 1336 MB/s +test regexdna::variant4 ... bench: 3,805,916 ns/iter (+/- 69,737) = 1335 MB/s +test regexdna::variant5 ... bench: 2,662,373 ns/iter (+/- 61,259) = 1909 MB/s +test regexdna::variant6 ... bench: 2,654,072 ns/iter (+/- 51,095) = 1915 MB/s +test regexdna::variant7 ... bench: 3,232,369 ns/iter (+/- 67,147) = 1572 MB/s +test regexdna::variant8 ... bench: 3,311,225 ns/iter (+/- 66,086) = 1535 MB/s +test regexdna::variant9 ... 
bench: 3,241,601 ns/iter (+/- 68,394) = 1568 MB/s +test rust_compile::compile_huge ... bench: 100,955 ns/iter (+/- 2,466) +test rust_compile::compile_huge_bytes ... bench: 5,936,732 ns/iter (+/- 126,993) +test rust_compile::compile_huge_full ... bench: 11,880,838 ns/iter (+/- 211,387) +test rust_compile::compile_simple ... bench: 4,575 ns/iter (+/- 139) +test rust_compile::compile_simple_bytes ... bench: 4,653 ns/iter (+/- 122) +test rust_compile::compile_simple_full ... bench: 20,656 ns/iter (+/- 535) +test rust_compile::compile_small ... bench: 9,613 ns/iter (+/- 992) +test rust_compile::compile_small_bytes ... bench: 188,349 ns/iter (+/- 4,733) +test rust_compile::compile_small_full ... bench: 341,554 ns/iter (+/- 9,774) +test sherlock::before_after_holmes ... bench: 907,419 ns/iter (+/- 11,645) = 655 MB/s +test sherlock::before_holmes ... bench: 62,036 ns/iter (+/- 1,854) = 9590 MB/s +test sherlock::everything_greedy ... bench: 2,072,694 ns/iter (+/- 45,192) = 287 MB/s +test sherlock::everything_greedy_nl ... bench: 884,483 ns/iter (+/- 25,710) = 672 MB/s +test sherlock::holmes_cochar_watson ... bench: 103,873 ns/iter (+/- 1,310) = 5727 MB/s +test sherlock::holmes_coword_watson ... bench: 481,491 ns/iter (+/- 11,516) = 1235 MB/s +test sherlock::ing_suffix ... bench: 323,119 ns/iter (+/- 7,438) = 1841 MB/s +test sherlock::ing_suffix_limited_space ... bench: 1,067,293 ns/iter (+/- 18,661) = 557 MB/s +test sherlock::letters ... bench: 21,732,526 ns/iter (+/- 253,563) = 27 MB/s +test sherlock::letters_lower ... bench: 21,187,465 ns/iter (+/- 191,023) = 28 MB/s +test sherlock::letters_upper ... bench: 1,766,003 ns/iter (+/- 17,494) = 336 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 897,387 ns/iter (+/- 26,674) = 662 MB/s +test sherlock::name_alt1 ... bench: 34,183 ns/iter (+/- 885) = 17404 MB/s +test sherlock::name_alt2 ... bench: 87,151 ns/iter (+/- 2,139) = 6826 MB/s +test sherlock::name_alt3 ... 
bench: 99,293 ns/iter (+/- 1,938) = 5991 MB/s +test sherlock::name_alt3_nocase ... bench: 379,228 ns/iter (+/- 22,539) = 1568 MB/s +test sherlock::name_alt4 ... bench: 123,040 ns/iter (+/- 2,676) = 4835 MB/s +test sherlock::name_alt4_nocase ... bench: 186,045 ns/iter (+/- 403) = 3197 MB/s +test sherlock::name_alt5 ... bench: 91,679 ns/iter (+/- 2,543) = 6489 MB/s +test sherlock::name_alt5_nocase ... bench: 343,668 ns/iter (+/- 6,807) = 1731 MB/s +test sherlock::name_holmes ... bench: 33,802 ns/iter (+/- 936) = 17600 MB/s +test sherlock::name_holmes_nocase ... bench: 136,208 ns/iter (+/- 4,317) = 4367 MB/s +test sherlock::name_sherlock ... bench: 22,534 ns/iter (+/- 462) = 26401 MB/s +test sherlock::name_sherlock_holmes ... bench: 22,514 ns/iter (+/- 697) = 26425 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 97,796 ns/iter (+/- 2,037) = 6083 MB/s +test sherlock::name_sherlock_nocase ... bench: 95,809 ns/iter (+/- 1,538) = 6209 MB/s +test sherlock::name_whitespace ... bench: 30,959 ns/iter (+/- 968) = 19216 MB/s +test sherlock::no_match_common ... bench: 19,568 ns/iter (+/- 616) = 30403 MB/s +test sherlock::no_match_really_common ... bench: 26,273 ns/iter (+/- 1,143) = 22644 MB/s +test sherlock::no_match_uncommon ... bench: 19,643 ns/iter (+/- 496) = 30287 MB/s +test sherlock::quotes ... bench: 371,876 ns/iter (+/- 2,494) = 1599 MB/s +test sherlock::repeated_class_negation ... bench: 76,963,104 ns/iter (+/- 277,311) = 7 MB/s +test sherlock::the_lower ... bench: 331,250 ns/iter (+/- 8,588) = 1796 MB/s +test sherlock::the_nocase ... bench: 516,528 ns/iter (+/- 40,826) = 1151 MB/s +test sherlock::the_upper ... bench: 44,206 ns/iter (+/- 1,277) = 13458 MB/s +test sherlock::the_whitespace ... bench: 822,577 ns/iter (+/- 23,649) = 723 MB/s +test sherlock::word_ending_n ... bench: 1,685,110 ns/iter (+/- 34,615) = 353 MB/s +test sherlock::words ... bench: 8,333,499 ns/iter (+/- 152,757) = 71 MB/s + +test result: ok. 
0 passed; 0 failed; 0 ignored; 119 measured; 0 filtered out; finished in 124.94s + diff --git a/bench/log/10-last-frontier/rust-bytes-after-literal.log b/bench/log/10-last-frontier/rust-bytes-after-literal.log new file mode 100644 index 0000000000..470e09b9c8 --- /dev/null +++ b/bench/log/10-last-frontier/rust-bytes-after-literal.log @@ -0,0 +1,112 @@ + +running 107 tests +test misc::anchored_literal_long_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_long_non_match ... bench: 20 ns/iter (+/- 1) = 19500 MB/s +test misc::anchored_literal_short_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::anchored_literal_short_non_match ... bench: 20 ns/iter (+/- 0) = 1300 MB/s +test misc::easy0_1K ... bench: 54 ns/iter (+/- 2) = 19462 MB/s +test misc::easy0_1MB ... bench: 56 ns/iter (+/- 1) = 18725053 MB/s +test misc::easy0_32 ... bench: 51 ns/iter (+/- 1) = 1156 MB/s +test misc::easy0_32K ... bench: 51 ns/iter (+/- 2) = 643039 MB/s +test misc::easy1_1K ... bench: 41 ns/iter (+/- 1) = 25463 MB/s +test misc::easy1_1MB ... bench: 44 ns/iter (+/- 1) = 23831727 MB/s +test misc::easy1_32 ... bench: 40 ns/iter (+/- 2) = 1300 MB/s +test misc::easy1_32K ... bench: 40 ns/iter (+/- 1) = 819700 MB/s +test misc::hard_1K ... bench: 52 ns/iter (+/- 1) = 20211 MB/s +test misc::hard_1MB ... bench: 57 ns/iter (+/- 0) = 18396543 MB/s +test misc::hard_32 ... bench: 51 ns/iter (+/- 0) = 1156 MB/s +test misc::hard_32K ... bench: 51 ns/iter (+/- 3) = 643039 MB/s +test misc::is_match_set ... bench: 61 ns/iter (+/- 2) = 409 MB/s +test misc::literal ... bench: 14 ns/iter (+/- 0) = 3642 MB/s +test misc::long_needle1 ... bench: 3,249 ns/iter (+/- 87) = 30779 MB/s +test misc::long_needle2 ... bench: 350,559 ns/iter (+/- 7,154) = 285 MB/s +test misc::match_class ... bench: 61 ns/iter (+/- 4) = 1327 MB/s +test misc::match_class_in_range ... bench: 14 ns/iter (+/- 0) = 5785 MB/s +test misc::matches_set ... 
bench: 401 ns/iter (+/- 17) = 62 MB/s +test misc::medium_1K ... bench: 53 ns/iter (+/- 0) = 19849 MB/s +test misc::medium_1MB ... bench: 58 ns/iter (+/- 0) = 18079379 MB/s +test misc::medium_32 ... bench: 53 ns/iter (+/- 0) = 1132 MB/s +test misc::medium_32K ... bench: 53 ns/iter (+/- 2) = 618792 MB/s +test misc::no_exponential ... bench: 421 ns/iter (+/- 8) = 237 MB/s +test misc::not_literal ... bench: 90 ns/iter (+/- 0) = 566 MB/s +test misc::one_pass_long_prefix ... bench: 53 ns/iter (+/- 1) = 490 MB/s +test misc::one_pass_long_prefix_not ... bench: 53 ns/iter (+/- 0) = 490 MB/s +test misc::one_pass_short ... bench: 38 ns/iter (+/- 0) = 447 MB/s +test misc::one_pass_short_not ... bench: 42 ns/iter (+/- 3) = 404 MB/s +test misc::reallyhard2_1K ... bench: 77 ns/iter (+/- 1) = 13506 MB/s +test misc::reallyhard_1K ... bench: 1,592 ns/iter (+/- 1) = 660 MB/s +test misc::reallyhard_1MB ... bench: 1,575,759 ns/iter (+/- 49,997) = 665 MB/s +test misc::reallyhard_32 ... bench: 102 ns/iter (+/- 2) = 578 MB/s +test misc::reallyhard_32K ... bench: 49,326 ns/iter (+/- 1,055) = 664 MB/s +test misc::reverse_suffix_no_quadratic ... bench: 4,161 ns/iter (+/- 94) = 1922 MB/s +test regexdna::find_new_lines ... bench: 12,344,799 ns/iter (+/- 188,054) = 411 MB/s +test regexdna::subst1 ... bench: 780,449 ns/iter (+/- 14,474) = 6513 MB/s +test regexdna::subst10 ... bench: 795,203 ns/iter (+/- 40,742) = 6392 MB/s +test regexdna::subst11 ... bench: 816,444 ns/iter (+/- 23,334) = 6226 MB/s +test regexdna::subst2 ... bench: 777,546 ns/iter (+/- 19,625) = 6537 MB/s +test regexdna::subst3 ... bench: 783,295 ns/iter (+/- 8,266) = 6489 MB/s +test regexdna::subst4 ... bench: 775,154 ns/iter (+/- 21,350) = 6557 MB/s +test regexdna::subst5 ... bench: 781,414 ns/iter (+/- 21,057) = 6505 MB/s +test regexdna::subst6 ... bench: 783,595 ns/iter (+/- 23,835) = 6487 MB/s +test regexdna::subst7 ... bench: 821,620 ns/iter (+/- 46,131) = 6187 MB/s +test regexdna::subst8 ... 
bench: 818,402 ns/iter (+/- 21,350) = 6211 MB/s +test regexdna::subst9 ... bench: 779,115 ns/iter (+/- 21,335) = 6524 MB/s +test regexdna::variant1 ... bench: 2,189,308 ns/iter (+/- 32,528) = 2321 MB/s +test regexdna::variant2 ... bench: 3,217,478 ns/iter (+/- 36,011) = 1579 MB/s +test regexdna::variant3 ... bench: 3,771,330 ns/iter (+/- 74,944) = 1347 MB/s +test regexdna::variant4 ... bench: 3,787,593 ns/iter (+/- 37,825) = 1342 MB/s +test regexdna::variant5 ... bench: 2,669,799 ns/iter (+/- 69,777) = 1904 MB/s +test regexdna::variant6 ... bench: 2,651,559 ns/iter (+/- 33,895) = 1917 MB/s +test regexdna::variant7 ... bench: 3,222,991 ns/iter (+/- 41,014) = 1577 MB/s +test regexdna::variant8 ... bench: 3,298,048 ns/iter (+/- 41,331) = 1541 MB/s +test regexdna::variant9 ... bench: 3,218,486 ns/iter (+/- 50,318) = 1579 MB/s +test rust_compile::compile_huge ... bench: 100,031 ns/iter (+/- 3,464) +test rust_compile::compile_huge_bytes ... bench: 5,885,102 ns/iter (+/- 130,016) +test rust_compile::compile_huge_full ... bench: 11,641,251 ns/iter (+/- 147,700) +test rust_compile::compile_simple ... bench: 4,263 ns/iter (+/- 116) +test rust_compile::compile_simple_bytes ... bench: 4,236 ns/iter (+/- 91) +test rust_compile::compile_simple_full ... bench: 22,349 ns/iter (+/- 2,085) +test rust_compile::compile_small ... bench: 9,537 ns/iter (+/- 298) +test rust_compile::compile_small_bytes ... bench: 178,561 ns/iter (+/- 3,796) +test rust_compile::compile_small_full ... bench: 363,343 ns/iter (+/- 9,481) +test sherlock::before_after_holmes ... bench: 907,022 ns/iter (+/- 19,133) = 655 MB/s +test sherlock::before_holmes ... bench: 63,729 ns/iter (+/- 1,830) = 9335 MB/s +test sherlock::everything_greedy ... bench: 2,181,593 ns/iter (+/- 46,002) = 272 MB/s +test sherlock::everything_greedy_nl ... bench: 884,811 ns/iter (+/- 26,211) = 672 MB/s +test sherlock::holmes_cochar_watson ... bench: 105,610 ns/iter (+/- 3,120) = 5633 MB/s +test sherlock::holmes_coword_watson ... 
bench: 480,986 ns/iter (+/- 13,228) = 1236 MB/s +test sherlock::ing_suffix ... bench: 322,921 ns/iter (+/- 3,555) = 1842 MB/s +test sherlock::ing_suffix_limited_space ... bench: 1,065,372 ns/iter (+/- 21,242) = 558 MB/s +test sherlock::letters ... bench: 22,109,015 ns/iter (+/- 146,243) = 26 MB/s +test sherlock::letters_lower ... bench: 21,686,153 ns/iter (+/- 206,041) = 27 MB/s +test sherlock::letters_upper ... bench: 1,778,225 ns/iter (+/- 25,935) = 334 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 897,355 ns/iter (+/- 26,781) = 662 MB/s +test sherlock::name_alt1 ... bench: 31,927 ns/iter (+/- 633) = 18634 MB/s +test sherlock::name_alt2 ... bench: 87,040 ns/iter (+/- 1,859) = 6835 MB/s +test sherlock::name_alt3 ... bench: 97,715 ns/iter (+/- 2,109) = 6088 MB/s +test sherlock::name_alt3_nocase ... bench: 944,955 ns/iter (+/- 26,503) = 629 MB/s +test sherlock::name_alt4 ... bench: 120,935 ns/iter (+/- 2,399) = 4919 MB/s +test sherlock::name_alt4_nocase ... bench: 228,597 ns/iter (+/- 7,137) = 2602 MB/s +test sherlock::name_alt5 ... bench: 91,174 ns/iter (+/- 1,096) = 6525 MB/s +test sherlock::name_alt5_nocase ... bench: 937,189 ns/iter (+/- 23,839) = 634 MB/s +test sherlock::name_holmes ... bench: 34,020 ns/iter (+/- 752) = 17487 MB/s +test sherlock::name_holmes_nocase ... bench: 117,194 ns/iter (+/- 3,444) = 5076 MB/s +test sherlock::name_sherlock ... bench: 22,557 ns/iter (+/- 388) = 26374 MB/s +test sherlock::name_sherlock_holmes ... bench: 22,428 ns/iter (+/- 683) = 26526 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 99,637 ns/iter (+/- 636) = 5971 MB/s +test sherlock::name_sherlock_nocase ... bench: 97,895 ns/iter (+/- 1,875) = 6077 MB/s +test sherlock::name_whitespace ... bench: 30,772 ns/iter (+/- 1,591) = 19333 MB/s +test sherlock::no_match_common ... bench: 19,665 ns/iter (+/- 296) = 30253 MB/s +test sherlock::no_match_really_common ... bench: 27,403 ns/iter (+/- 2,507) = 21710 MB/s +test sherlock::no_match_uncommon ... 
bench: 19,601 ns/iter (+/- 293) = 30352 MB/s +test sherlock::quotes ... bench: 370,323 ns/iter (+/- 1,345) = 1606 MB/s +test sherlock::repeated_class_negation ... bench: 68,414,794 ns/iter (+/- 342,428) = 8 MB/s +test sherlock::the_lower ... bench: 327,767 ns/iter (+/- 5,493) = 1815 MB/s +test sherlock::the_nocase ... bench: 507,818 ns/iter (+/- 1,796) = 1171 MB/s +test sherlock::the_upper ... bench: 45,045 ns/iter (+/- 1,400) = 13207 MB/s +test sherlock::the_whitespace ... bench: 822,080 ns/iter (+/- 16,581) = 723 MB/s +test sherlock::word_ending_n ... bench: 1,690,084 ns/iter (+/- 40,361) = 352 MB/s +test sherlock::words ... bench: 8,573,617 ns/iter (+/- 143,313) = 69 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 107 measured; 0 filtered out; finished in 110.03s + diff --git a/bench/log/10-last-frontier/rust-bytes-before-literal.log b/bench/log/10-last-frontier/rust-bytes-before-literal.log new file mode 100644 index 0000000000..7016e3c565 --- /dev/null +++ b/bench/log/10-last-frontier/rust-bytes-before-literal.log @@ -0,0 +1,112 @@ + +running 107 tests +test misc::anchored_literal_long_match ... bench: 18 ns/iter (+/- 0) = 21666 MB/s +test misc::anchored_literal_long_non_match ... bench: 19 ns/iter (+/- 1) = 20526 MB/s +test misc::anchored_literal_short_match ... bench: 18 ns/iter (+/- 0) = 1444 MB/s +test misc::anchored_literal_short_non_match ... bench: 20 ns/iter (+/- 0) = 1300 MB/s +test misc::easy0_1K ... bench: 14 ns/iter (+/- 0) = 75071 MB/s +test misc::easy0_1MB ... bench: 21 ns/iter (+/- 0) = 49933476 MB/s +test misc::easy0_32 ... bench: 14 ns/iter (+/- 0) = 4214 MB/s +test misc::easy0_32K ... bench: 14 ns/iter (+/- 0) = 2342500 MB/s +test misc::easy1_1K ... bench: 41 ns/iter (+/- 0) = 25463 MB/s +test misc::easy1_1MB ... bench: 48 ns/iter (+/- 0) = 21845750 MB/s +test misc::easy1_32 ... bench: 41 ns/iter (+/- 0) = 1268 MB/s +test misc::easy1_32K ... bench: 41 ns/iter (+/- 1) = 799707 MB/s +test misc::hard_1K ... 
bench: 51 ns/iter (+/- 1) = 20607 MB/s +test misc::hard_1MB ... bench: 56 ns/iter (+/- 2) = 18725053 MB/s +test misc::hard_32 ... bench: 51 ns/iter (+/- 6) = 1156 MB/s +test misc::hard_32K ... bench: 51 ns/iter (+/- 1) = 643039 MB/s +test misc::is_match_set ... bench: 62 ns/iter (+/- 2) = 403 MB/s +test misc::literal ... bench: 13 ns/iter (+/- 0) = 3923 MB/s +test misc::long_needle1 ... bench: 2,825 ns/iter (+/- 57) = 35398 MB/s +test misc::long_needle2 ... bench: 350,755 ns/iter (+/- 11,905) = 285 MB/s +test misc::match_class ... bench: 64 ns/iter (+/- 1) = 1265 MB/s +test misc::match_class_in_range ... bench: 13 ns/iter (+/- 0) = 6230 MB/s +test misc::matches_set ... bench: 422 ns/iter (+/- 12) = 59 MB/s +test misc::medium_1K ... bench: 15 ns/iter (+/- 0) = 70133 MB/s +test misc::medium_1MB ... bench: 21 ns/iter (+/- 0) = 49933523 MB/s +test misc::medium_32 ... bench: 15 ns/iter (+/- 0) = 4000 MB/s +test misc::medium_32K ... bench: 14 ns/iter (+/- 0) = 2342571 MB/s +test misc::no_exponential ... bench: 443 ns/iter (+/- 12) = 225 MB/s +test misc::not_literal ... bench: 89 ns/iter (+/- 1) = 573 MB/s +test misc::one_pass_long_prefix ... bench: 52 ns/iter (+/- 1) = 500 MB/s +test misc::one_pass_long_prefix_not ... bench: 52 ns/iter (+/- 1) = 500 MB/s +test misc::one_pass_short ... bench: 40 ns/iter (+/- 1) = 425 MB/s +test misc::one_pass_short_not ... bench: 42 ns/iter (+/- 0) = 404 MB/s +test misc::reallyhard2_1K ... bench: 80 ns/iter (+/- 0) = 13000 MB/s +test misc::reallyhard_1K ... bench: 1,592 ns/iter (+/- 1) = 660 MB/s +test misc::reallyhard_1MB ... bench: 1,575,789 ns/iter (+/- 34,236) = 665 MB/s +test misc::reallyhard_32 ... bench: 101 ns/iter (+/- 2) = 584 MB/s +test misc::reallyhard_32K ... bench: 49,321 ns/iter (+/- 2,718) = 664 MB/s +test misc::reverse_suffix_no_quadratic ... bench: 4,158 ns/iter (+/- 93) = 1924 MB/s +test regexdna::find_new_lines ... bench: 12,391,732 ns/iter (+/- 180,913) = 410 MB/s +test regexdna::subst1 ... 
bench: 781,690 ns/iter (+/- 29,637) = 6503 MB/s +test regexdna::subst10 ... bench: 778,306 ns/iter (+/- 22,706) = 6531 MB/s +test regexdna::subst11 ... bench: 777,716 ns/iter (+/- 24,635) = 6536 MB/s +test regexdna::subst2 ... bench: 791,786 ns/iter (+/- 15,778) = 6420 MB/s +test regexdna::subst3 ... bench: 783,470 ns/iter (+/- 25,543) = 6488 MB/s +test regexdna::subst4 ... bench: 814,902 ns/iter (+/- 14,146) = 6238 MB/s +test regexdna::subst5 ... bench: 781,464 ns/iter (+/- 19,532) = 6504 MB/s +test regexdna::subst6 ... bench: 780,116 ns/iter (+/- 16,558) = 6516 MB/s +test regexdna::subst7 ... bench: 795,982 ns/iter (+/- 11,254) = 6386 MB/s +test regexdna::subst8 ... bench: 781,746 ns/iter (+/- 24,996) = 6502 MB/s +test regexdna::subst9 ... bench: 783,793 ns/iter (+/- 14,943) = 6485 MB/s +test regexdna::variant1 ... bench: 2,188,940 ns/iter (+/- 42,308) = 2322 MB/s +test regexdna::variant2 ... bench: 3,218,011 ns/iter (+/- 50,700) = 1579 MB/s +test regexdna::variant3 ... bench: 3,778,907 ns/iter (+/- 90,543) = 1345 MB/s +test regexdna::variant4 ... bench: 3,803,852 ns/iter (+/- 68,319) = 1336 MB/s +test regexdna::variant5 ... bench: 2,660,949 ns/iter (+/- 55,488) = 1910 MB/s +test regexdna::variant6 ... bench: 2,647,131 ns/iter (+/- 26,846) = 1920 MB/s +test regexdna::variant7 ... bench: 3,235,032 ns/iter (+/- 37,599) = 1571 MB/s +test regexdna::variant8 ... bench: 3,305,124 ns/iter (+/- 67,109) = 1538 MB/s +test regexdna::variant9 ... bench: 3,231,033 ns/iter (+/- 55,626) = 1573 MB/s +test rust_compile::compile_huge ... bench: 99,387 ns/iter (+/- 2,366) +test rust_compile::compile_huge_bytes ... bench: 5,865,693 ns/iter (+/- 62,255) +test rust_compile::compile_huge_full ... bench: 11,752,845 ns/iter (+/- 195,440) +test rust_compile::compile_simple ... bench: 4,117 ns/iter (+/- 141) +test rust_compile::compile_simple_bytes ... bench: 4,162 ns/iter (+/- 67) +test rust_compile::compile_simple_full ... 
bench: 19,955 ns/iter (+/- 622) +test rust_compile::compile_small ... bench: 9,140 ns/iter (+/- 112) +test rust_compile::compile_small_bytes ... bench: 165,990 ns/iter (+/- 5,876) +test rust_compile::compile_small_full ... bench: 342,897 ns/iter (+/- 13,730) +test sherlock::before_after_holmes ... bench: 906,789 ns/iter (+/- 13,931) = 656 MB/s +test sherlock::before_holmes ... bench: 62,319 ns/iter (+/- 790) = 9546 MB/s +test sherlock::everything_greedy ... bench: 2,175,424 ns/iter (+/- 47,720) = 273 MB/s +test sherlock::everything_greedy_nl ... bench: 884,406 ns/iter (+/- 22,679) = 672 MB/s +test sherlock::holmes_cochar_watson ... bench: 105,261 ns/iter (+/- 3,536) = 5651 MB/s +test sherlock::holmes_coword_watson ... bench: 479,524 ns/iter (+/- 7,749) = 1240 MB/s +test sherlock::ing_suffix ... bench: 321,401 ns/iter (+/- 9,123) = 1851 MB/s +test sherlock::ing_suffix_limited_space ... bench: 1,069,722 ns/iter (+/- 16,366) = 556 MB/s +test sherlock::letters ... bench: 21,959,896 ns/iter (+/- 204,695) = 27 MB/s +test sherlock::letters_lower ... bench: 21,462,457 ns/iter (+/- 207,449) = 27 MB/s +test sherlock::letters_upper ... bench: 1,768,026 ns/iter (+/- 41,459) = 336 MB/s +test sherlock::line_boundary_sherlock_holmes ... bench: 897,197 ns/iter (+/- 14,349) = 663 MB/s +test sherlock::name_alt1 ... bench: 34,037 ns/iter (+/- 719) = 17479 MB/s +test sherlock::name_alt2 ... bench: 86,788 ns/iter (+/- 1,203) = 6855 MB/s +test sherlock::name_alt3 ... bench: 98,225 ns/iter (+/- 1,589) = 6056 MB/s +test sherlock::name_alt3_nocase ... bench: 377,597 ns/iter (+/- 14,840) = 1575 MB/s +test sherlock::name_alt4 ... bench: 122,440 ns/iter (+/- 8,123) = 4858 MB/s +test sherlock::name_alt4_nocase ... bench: 187,282 ns/iter (+/- 5,176) = 3176 MB/s +test sherlock::name_alt5 ... bench: 91,429 ns/iter (+/- 1,944) = 6507 MB/s +test sherlock::name_alt5_nocase ... bench: 348,111 ns/iter (+/- 12,721) = 1709 MB/s +test sherlock::name_holmes ... 
bench: 33,547 ns/iter (+/- 1,119) = 17734 MB/s +test sherlock::name_holmes_nocase ... bench: 132,342 ns/iter (+/- 3,974) = 4495 MB/s +test sherlock::name_sherlock ... bench: 22,562 ns/iter (+/- 364) = 26368 MB/s +test sherlock::name_sherlock_holmes ... bench: 22,313 ns/iter (+/- 579) = 26663 MB/s +test sherlock::name_sherlock_holmes_nocase ... bench: 97,556 ns/iter (+/- 2,092) = 6098 MB/s +test sherlock::name_sherlock_nocase ... bench: 95,917 ns/iter (+/- 4,054) = 6202 MB/s +test sherlock::name_whitespace ... bench: 30,997 ns/iter (+/- 1,039) = 19193 MB/s +test sherlock::no_match_common ... bench: 19,690 ns/iter (+/- 378) = 30214 MB/s +test sherlock::no_match_really_common ... bench: 27,629 ns/iter (+/- 465) = 21532 MB/s +test sherlock::no_match_uncommon ... bench: 19,681 ns/iter (+/- 291) = 30228 MB/s +test sherlock::quotes ... bench: 368,290 ns/iter (+/- 1,508) = 1615 MB/s +test sherlock::repeated_class_negation ... bench: 73,004,024 ns/iter (+/- 1,040,743) = 8 MB/s +test sherlock::the_lower ... bench: 320,929 ns/iter (+/- 12,287) = 1853 MB/s +test sherlock::the_nocase ... bench: 514,946 ns/iter (+/- 11,241) = 1155 MB/s +test sherlock::the_upper ... bench: 43,816 ns/iter (+/- 1,719) = 13577 MB/s +test sherlock::the_whitespace ... bench: 825,245 ns/iter (+/- 20,797) = 720 MB/s +test sherlock::word_ending_n ... bench: 1,676,908 ns/iter (+/- 40,650) = 354 MB/s +test sherlock::words ... bench: 8,449,099 ns/iter (+/- 123,842) = 70 MB/s + +test result: ok. 0 passed; 0 failed; 0 ignored; 107 measured; 0 filtered out; finished in 128.47s + diff --git a/regex-capi/include/rure.h b/regex-capi/include/rure.h index a87be61a89..01173b4518 100644 --- a/regex-capi/include/rure.h +++ b/regex-capi/include/rure.h @@ -408,7 +408,7 @@ size_t rure_captures_len(rure_captures *captures); * safe to call rure_compile from multiple threads simultaneously using the * same options pointer. 
*/ -rure_options *rure_options_new(); +rure_options *rure_options_new(void); /* * rure_options_free frees the given options. @@ -536,7 +536,7 @@ size_t rure_set_len(rure_set *re); * It is not safe to use errors from multiple threads simultaneously. An error * value may be reused on subsequent calls to rure_compile. */ -rure_error *rure_error_new(); +rure_error *rure_error_new(void); /* * rure_error_free frees the error given. diff --git a/regex-debug/Cargo.toml b/regex-debug/Cargo.toml deleted file mode 100644 index 1db4036b98..0000000000 --- a/regex-debug/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -publish = false -name = "regex-debug" -version = "0.1.0" -authors = ["The Rust Project Developers"] -license = "MIT OR Apache-2.0" -repository = "https://github.com/rust-lang/regex" -documentation = "https://docs.rs/regex" -homepage = "https://github.com/rust-lang/regex" -description = "A tool useful for debugging regular expressions." -workspace = ".." -edition = "2018" - -[dependencies] -docopt = "1" -regex = { version = "1.1", path = ".." } -regex-syntax = { version = "0.6", path = "../regex-syntax" } -serde = { version = "1", features = ["derive"] } diff --git a/regex-debug/src/main.rs b/regex-debug/src/main.rs deleted file mode 100644 index a7dd453e1f..0000000000 --- a/regex-debug/src/main.rs +++ /dev/null @@ -1,376 +0,0 @@ -use std::error; -use std::io::{self, Write}; -use std::process; -use std::result; - -use docopt::Docopt; -use regex::internal::{Compiler, LiteralSearcher}; -use regex_syntax::hir::literal::Literals; -use regex_syntax::hir::Hir; - -const USAGE: &'static str = " -Usage: - regex-debug [options] ast - regex-debug [options] hir - regex-debug [options] prefixes ... - regex-debug [options] suffixes ... - regex-debug [options] anchors - regex-debug [options] captures - regex-debug [options] compile ... - regex-debug [options] utf8-ranges - regex-debug [options] utf8-ranges-rev - regex-debug --help - -Options: - --help Show this usage message. 
- --size-limit ARG An approximate size limit on the total size (in bytes) - of a compiled regular expression program. - [default: 10485760] - --bytes Show the instruction codes for byte oriented programs. - (As opposed to Unicode oriented programs.) - --dfa Show the instruction codes for a DFA. - --dfa-reverse Show the instruction codes for a reverse DFA. - This implies --dfa. - -a, --all-literals Shows all literals extracted. - By default, only unambiguous literals are shown. - --literal-limit ARG An approximate limit on the total size (in bytes) - of all literals extracted. [default: 250] - --class-limit ARG A limit on the size of character classes used to - extract literals. [default: 10] - --literal-bytes Show raw literal bytes instead of Unicode chars. - --lcp Show the longest common prefix of all the literals - extracted. - --lcs Show the longest common suffix of all the literals - extracted. - --searcher Show the debug output for the literal searcher - constructed by the literals found. - --quiet Show less output. 
-"; - -#[derive(serde::Deserialize)] -struct Args { - cmd_ast: bool, - cmd_hir: bool, - cmd_prefixes: bool, - cmd_suffixes: bool, - cmd_anchors: bool, - cmd_captures: bool, - cmd_compile: bool, - cmd_utf8_ranges: bool, - cmd_utf8_ranges_rev: bool, - - arg_pattern: String, - arg_patterns: Vec, - arg_class: String, - - flag_size_limit: usize, - flag_bytes: bool, - flag_dfa: bool, - flag_dfa_reverse: bool, - flag_all_literals: bool, - flag_literal_limit: usize, - flag_class_limit: usize, - flag_literal_bytes: bool, - flag_lcp: bool, - flag_lcs: bool, - flag_searcher: bool, - flag_quiet: bool, -} - -type Result = result::Result>; - -fn main() { - let mut args: Args = Docopt::new(USAGE) - .and_then(|d| d.deserialize()) - .unwrap_or_else(|e| e.exit()); - if args.flag_dfa_reverse { - args.flag_dfa = true; - } - match run(&args) { - Ok(_) => process::exit(0), - Err(err) => { - let _ = writeln!(&mut io::stderr(), "{}", err); - process::exit(1) - } - } -} - -fn run(args: &Args) -> Result<()> { - if args.cmd_ast { - cmd_ast(args) - } else if args.cmd_hir { - cmd_hir(args) - } else if args.cmd_prefixes { - cmd_literals(args) - } else if args.cmd_suffixes { - cmd_literals(args) - } else if args.cmd_anchors { - cmd_anchors(args) - } else if args.cmd_captures { - cmd_captures(args) - } else if args.cmd_compile { - cmd_compile(args) - } else if args.cmd_utf8_ranges { - cmd_utf8_ranges(args) - } else if args.cmd_utf8_ranges_rev { - cmd_utf8_ranges_rev(args) - } else { - unreachable!() - } -} - -fn cmd_ast(args: &Args) -> Result<()> { - use regex_syntax::ast::parse::Parser; - - let mut parser = Parser::new(); - let ast = parser.parse(&args.arg_pattern)?; - println!("{:#?}", ast); - Ok(()) -} - -fn cmd_hir(args: &Args) -> Result<()> { - use regex_syntax::ParserBuilder; - - let mut parser = ParserBuilder::new().allow_invalid_utf8(false).build(); - let hir = parser.parse(&args.arg_pattern)?; - println!("{:#?}", hir); - Ok(()) -} - -fn cmd_literals(args: &Args) -> Result<()> { - let 
exprs = args.parse_many()?; - let mut lits = if args.cmd_prefixes { - args.literals(&exprs, |lits, e| lits.union_prefixes(e)) - } else { - args.literals(&exprs, |lits, e| lits.union_suffixes(e)) - }; - if !args.flag_all_literals { - if args.cmd_prefixes { - lits = lits.unambiguous_prefixes(); - } else { - lits = lits.unambiguous_suffixes(); - } - } - if args.flag_searcher { - if args.cmd_prefixes { - println!("{:?}", LiteralSearcher::prefixes(lits)) - } else { - println!("{:?}", LiteralSearcher::suffixes(lits)) - } - } else if args.flag_lcp { - println!("{}", escape_unicode(lits.longest_common_prefix())); - } else if args.flag_lcs { - println!("{}", escape_unicode(lits.longest_common_suffix())); - } else { - for lit in lits.literals() { - if args.flag_literal_bytes { - if lit.is_cut() { - println!("Cut({})", escape_bytes(lit)); - } else { - println!("Complete({})", escape_bytes(lit)); - } - } else { - println!("{:?}", lit); - } - } - } - Ok(()) -} - -fn cmd_anchors(args: &Args) -> Result<()> { - let expr = args.parse_one()?; - if expr.is_anchored_start() { - println!("start"); - } - if expr.is_anchored_end() { - println!("end"); - } - Ok(()) -} - -fn cmd_captures(args: &Args) -> Result<()> { - let expr = args.parse_one()?; - let prog = args.compiler().only_utf8(false).compile(&[expr])?; - for (i, name) in prog.captures.iter().enumerate() { - match *name { - None => println!("{}", i), - Some(ref name) => println!("{}:{}", i, name), - } - } - Ok(()) -} - -fn cmd_compile(args: &Args) -> Result<()> { - let exprs = args.parse_many()?; - let compiler = args - .compiler() - .bytes(args.flag_bytes) - .only_utf8(!args.flag_bytes) - .dfa(args.flag_dfa) - .reverse(args.flag_dfa_reverse); - let prog = compiler.compile(&exprs)?; - if !args.flag_quiet { - print!("{:?}", prog); - } else { - println!("instruction count: {}", prog.insts.len()); - } - Ok(()) -} - -fn cmd_utf8_ranges(args: &Args) -> Result<()> { - use regex_syntax::hir::{self, HirKind}; - use 
regex_syntax::utf8::Utf8Sequences; - use regex_syntax::ParserBuilder; - - let hir = ParserBuilder::new() - .build() - .parse(&format!("[{}]", args.arg_class))?; - let cls = match hir.into_kind() { - HirKind::Class(hir::Class::Unicode(cls)) => cls, - _ => { - return Err( - format!("unexpected HIR, expected Unicode class").into() - ) - } - }; - let mut char_count = 0; - for (i, range) in cls.iter().enumerate() { - if i > 0 { - println!("----------------------------"); - } - char_count += (range.end() as u32) - (range.start() as u32) + 1; - for seq in Utf8Sequences::new(range.start(), range.end()) { - for utf8_range in seq.into_iter() { - print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end); - } - println!(); - } - } - println!("codepoint count: {}", char_count); - Ok(()) -} - -fn cmd_utf8_ranges_rev(args: &Args) -> Result<()> { - use regex_syntax::hir::{self, HirKind}; - use regex_syntax::utf8::Utf8Sequences; - use regex_syntax::ParserBuilder; - - let hir = ParserBuilder::new() - .build() - .parse(&format!("[{}]", args.arg_class))?; - let cls = match hir.into_kind() { - HirKind::Class(hir::Class::Unicode(cls)) => cls, - _ => { - return Err( - format!("unexpected HIR, expected Unicode class").into() - ) - } - }; - let mut char_count = 0; - let mut seqs = vec![]; - for (_, range) in cls.iter().enumerate() { - char_count += (range.end() as u32) - (range.start() as u32) + 1; - for seq in Utf8Sequences::new(range.start(), range.end()) { - let mut seq = seq.as_slice().to_vec(); - seq.reverse(); - seqs.push(seq); - } - } - seqs.sort(); - for seq in seqs { - for utf8_range in seq.into_iter() { - print!("[{:02X}-{:02X}]", utf8_range.start, utf8_range.end); - } - println!(); - } - println!("codepoint count: {}", char_count); - Ok(()) -} - -impl Args { - fn parse_one(&self) -> Result { - parse(&self.arg_pattern) - } - - fn parse_many(&self) -> Result> { - self.arg_patterns.iter().map(|s| parse(s)).collect() - } - - fn literals bool>( - &self, - exprs: &[Hir], - 
get_literals: F, - ) -> Literals { - let mut lits = Some(self.empty_literals()); - for e in exprs { - lits = lits.and_then(|mut lits| { - if !get_literals(&mut lits, e) { - None - } else { - Some(lits) - } - }); - } - lits.unwrap_or(self.empty_literals()) - } - - fn empty_literals(&self) -> Literals { - let mut lits = Literals::empty(); - lits.set_limit_size(self.flag_literal_limit); - lits.set_limit_class(self.flag_class_limit); - lits - } - - fn compiler(&self) -> Compiler { - Compiler::new().size_limit(self.flag_size_limit) - } -} - -fn parse(re: &str) -> Result { - use regex_syntax::ParserBuilder; - ParserBuilder::new() - .allow_invalid_utf8(true) - .build() - .parse(re) - .map_err(From::from) -} - -fn escape_unicode(bytes: &[u8]) -> String { - let show = match ::std::str::from_utf8(bytes) { - Ok(v) => v.to_string(), - Err(_) => escape_bytes(bytes), - }; - let mut space_escaped = String::new(); - for c in show.chars() { - if c.is_whitespace() { - let escaped = if c as u32 <= 0x7F { - escape_byte(c as u8) - } else { - if c as u32 <= 0xFFFF { - format!(r"\u{{{:04x}}}", c as u32) - } else { - format!(r"\U{{{:08x}}}", c as u32) - } - }; - space_escaped.push_str(&escaped); - } else { - space_escaped.push(c); - } - } - space_escaped -} - -fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s -} - -fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; - - let escaped: Vec = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() -} diff --git a/regex-syntax/Cargo.toml b/regex-syntax/Cargo.toml index be9aeb5689..6ae9b84852 100644 --- a/regex-syntax/Cargo.toml +++ b/regex-syntax/Cargo.toml @@ -8,12 +8,14 @@ documentation = "https://docs.rs/regex-syntax" homepage = "https://github.com/rust-lang/regex" description = "A regular expression parser." workspace = ".." 
-edition = "2018" +edition = "2021" +rust-version = "1.60.0" # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features [features] -default = ["unicode"] +default = ["std", "unicode"] +std = [] unicode = [ "unicode-age", @@ -31,3 +33,16 @@ unicode-gencat = [] unicode-perl = [] unicode-script = [] unicode-segment = [] + +[package.metadata.docs.rs] +# We want to document all features. +all-features = true +# Since this crate's feature setup is pretty complicated, it is worth opting +# into a nightly unstable option to show the features that need to be enabled +# for public API items. To do that, we set 'docsrs', and when that's enabled, +# we enable the 'doc_auto_cfg' feature. +# +# To test this locally, run: +# +# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features +rustdoc-args = ["--cfg", "docsrs"] diff --git a/regex-syntax/README.md b/regex-syntax/README.md index 592f842686..ff4fe094c3 100644 --- a/regex-syntax/README.md +++ b/regex-syntax/README.md @@ -30,13 +30,12 @@ concrete syntax that produced the `Hir`. This example shows how to parse a pattern string into its HIR: ```rust -use regex_syntax::Parser; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::{hir::Hir, parse}; -let hir = Parser::new().parse("a|b").unwrap(); +let hir = parse("a|b").unwrap(); assert_eq!(hir, Hir::alternation(vec![ - Hir::literal(hir::Literal::Unicode('a')), - Hir::literal(hir::Literal::Unicode('b')), + Hir::literal("a".as_bytes()), + Hir::literal("b".as_bytes()), ])); ``` diff --git a/regex-syntax/src/ast/mod.rs b/regex-syntax/src/ast/mod.rs index 9db9afaf17..faabca2c1c 100644 --- a/regex-syntax/src/ast/mod.rs +++ b/regex-syntax/src/ast/mod.rs @@ -2,9 +2,9 @@ Defines an abstract syntax for regular expressions. 
*/ -use std::cmp::Ordering; -use std::error; -use std::fmt; +use core::cmp::Ordering; + +use alloc::{boxed::Box, string::String, vec, vec::Vec}; pub use crate::ast::visitor::{visit, Visitor}; @@ -65,6 +65,10 @@ impl Error { } /// The type of an error that occurred while building an AST. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// The capturing group limit was exceeded. @@ -169,71 +173,26 @@ pub enum ErrorKind { /// `(? &str { - use self::ErrorKind::*; - match self.kind { - CaptureLimitExceeded => "capture group limit exceeded", - ClassEscapeInvalid => "invalid escape sequence in character class", - ClassRangeInvalid => "invalid character class range", - ClassRangeLiteral => "invalid range boundary, must be a literal", - ClassUnclosed => "unclosed character class", - DecimalEmpty => "empty decimal literal", - DecimalInvalid => "invalid decimal literal", - EscapeHexEmpty => "empty hexadecimal literal", - EscapeHexInvalid => "invalid hexadecimal literal", - EscapeHexInvalidDigit => "invalid hexadecimal digit", - EscapeUnexpectedEof => "unexpected eof (escape sequence)", - EscapeUnrecognized => "unrecognized escape sequence", - FlagDanglingNegation => "dangling flag negation operator", - FlagDuplicate { .. } => "duplicate flag", - FlagRepeatedNegation { .. } => "repeated negation", - FlagUnexpectedEof => "unexpected eof (flag)", - FlagUnrecognized => "unrecognized flag", - GroupNameDuplicate { .. 
} => "duplicate capture group name", - GroupNameEmpty => "empty capture group name", - GroupNameInvalid => "invalid capture group name", - GroupNameUnexpectedEof => "unclosed capture group name", - GroupUnclosed => "unclosed group", - GroupUnopened => "unopened group", - NestLimitExceeded(_) => "nest limit exceeded", - RepetitionCountInvalid => "invalid repetition count range", - RepetitionCountUnclosed => "unclosed counted repetition", - RepetitionMissing => "repetition operator missing expression", - UnicodeClassInvalid => "invalid Unicode character class", - UnsupportedBackreference => "backreferences are not supported", - UnsupportedLookAround => "look-around is not supported", - _ => unreachable!(), - } - } -} +#[cfg(feature = "std")] +impl std::error::Error for Error {} -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { crate::error::Formatter::from(self).fmt(f) } } -impl fmt::Display for ErrorKind { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for ErrorKind { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::ErrorKind::*; match *self { CaptureLimitExceeded => write!( f, "exceeded the maximum number of \ capturing groups ({})", - ::std::u32::MAX + u32::MAX ), ClassEscapeInvalid => { write!(f, "invalid escape sequence found in character class") @@ -310,7 +269,6 @@ impl fmt::Display for ErrorKind { "look-around, including look-ahead and look-behind, \ is not supported" ), - _ => unreachable!(), } } } @@ -327,8 +285,8 @@ pub struct Span { pub end: Position, } -impl fmt::Debug for Span { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for Span { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "Span({:?}, {:?})", self.start, self.end) } } @@ -360,8 +318,8 @@ pub struct 
Position { pub column: usize, } -impl fmt::Debug for Position { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for Position { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Position(o: {:?}, l: {:?}, c: {:?})", @@ -541,8 +499,8 @@ impl Ast { /// /// This implementation uses constant stack space and heap space proportional /// to the size of the `Ast`. -impl fmt::Display for Ast { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for Ast { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use crate::ast::print::Printer; Printer::new().print(self, f) } @@ -615,11 +573,12 @@ impl Literal { /// If this literal was written as a `\x` hex escape, then this returns /// the corresponding byte value. Otherwise, this returns `None`. pub fn byte(&self) -> Option { - let short_hex = LiteralKind::HexFixed(HexLiteralKind::X); - if self.c as u32 <= 255 && self.kind == short_hex { - Some(self.c as u8) - } else { - None + match self.kind { + LiteralKind::HexFixed(HexLiteralKind::X) => { + // MSRV(1.59): Use 'u8::try_from(self.c)' instead. + u8::try_from(u32::from(self.c)).ok() + } + _ => None, } } } @@ -629,9 +588,12 @@ impl Literal { pub enum LiteralKind { /// The literal is written verbatim, e.g., `a` or `☃`. Verbatim, - /// The literal is written as an escape because it is punctuation, e.g., - /// `\*` or `\[`. - Punctuation, + /// The literal is written as an escape because it is otherwise a special + /// regex meta character, e.g., `\*` or `\[`. + Meta, + /// The literal is written as an escape despite the fact that the escape is + /// unnecessary, e.g., `\%` or `\/`. + Superfluous, /// The literal is written as an octal escape, e.g., `\141`. Octal, /// The literal is written as a hex code with a fixed number of digits @@ -1203,7 +1165,7 @@ impl Group { /// Returns true if and only if this group is capturing. 
pub fn is_capturing(&self) -> bool { match self.kind { - GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true, + GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true, GroupKind::NonCapturing(_) => false, } } @@ -1214,7 +1176,7 @@ impl Group { pub fn capture_index(&self) -> Option { match self.kind { GroupKind::CaptureIndex(i) => Some(i), - GroupKind::CaptureName(ref x) => Some(x.index), + GroupKind::CaptureName { ref name, .. } => Some(name.index), GroupKind::NonCapturing(_) => None, } } @@ -1225,8 +1187,13 @@ impl Group { pub enum GroupKind { /// `(a)` CaptureIndex(u32), - /// `(?Pa)` - CaptureName(CaptureName), + /// `(?a)` or `(?Pa)` + CaptureName { + /// True if the `?P<` syntax is used and false if the `?<` syntax is used. + starts_with_p: bool, + /// The capture name. + name: CaptureName, + }, /// `(?:a)` and `(?i:a)` NonCapturing(Flags), } @@ -1350,6 +1317,8 @@ pub enum Flag { SwapGreed, /// `u` Unicode, + /// `R` + CRLF, /// `x` IgnoreWhitespace, } @@ -1358,7 +1327,7 @@ pub enum Flag { /// space but heap space proportional to the depth of the `Ast`. impl Drop for Ast { fn drop(&mut self) { - use std::mem; + use core::mem; match *self { Ast::Empty(_) @@ -1408,7 +1377,7 @@ impl Drop for Ast { /// stack space but heap space proportional to the depth of the `ClassSet`. impl Drop for ClassSet { fn drop(&mut self) { - use std::mem; + use core::mem; match *self { ClassSet::Item(ref item) => match *item { diff --git a/regex-syntax/src/ast/parse.rs b/regex-syntax/src/ast/parse.rs index 6e9c9aca06..9cf64e9ec7 100644 --- a/regex-syntax/src/ast/parse.rs +++ b/regex-syntax/src/ast/parse.rs @@ -2,17 +2,26 @@ This module provides a regular expression parser. 
*/ -use std::borrow::Borrow; -use std::cell::{Cell, RefCell}; -use std::mem; -use std::result; - -use crate::ast::{self, Ast, Position, Span}; -use crate::either::Either; - -use crate::is_meta_character; - -type Result = result::Result; +use core::{ + borrow::Borrow, + cell::{Cell, RefCell}, + mem, +}; + +use alloc::{ + boxed::Box, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::{self, Ast, Position, Span}, + either::Either, + is_escapeable_character, is_meta_character, +}; + +type Result = core::result::Result; /// A primitive is an expression with no sub-expressions. This includes /// literals, assertions and non-set character classes. This representation @@ -100,11 +109,11 @@ fn is_hex(c: char) -> bool { /// If `first` is true, then `c` is treated as the first character in the /// group name (which must be alphabetic or underscore). fn is_capture_char(c: char, first: bool) -> bool { - c == '_' - || (!first - && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']')) - || ('A' <= c && c <= 'Z') - || ('a' <= c && c <= 'z') + if first { + c == '_' || c.is_alphabetic() + } else { + c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() + } } /// A builder for a regular expression parser. @@ -162,7 +171,7 @@ impl ParserBuilder { /// constant stack space and moving the call stack to the heap), other /// crates may. /// - /// This limit is not checked until the entire Ast is parsed. Therefore, + /// This limit is not checked until the entire AST is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since this parser @@ -220,8 +229,7 @@ impl ParserBuilder { /// abstract syntax tree. The size of the tree is proportional to the length /// of the regular expression pattern. 
/// -/// A `Parser` can be configured in more detail via a -/// [`ParserBuilder`](struct.ParserBuilder.html). +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. #[derive(Clone, Debug)] pub struct Parser { /// The current position of the parser. @@ -327,8 +335,7 @@ impl Parser { /// The parser can be run with either the `parse` or `parse_with_comments` /// methods. The parse methods return an abstract syntax tree. /// - /// To set configuration options on the parser, use - /// [`ParserBuilder`](struct.ParserBuilder.html). + /// To set configuration options on the parser, use [`ParserBuilder`]. pub fn new() -> Parser { ParserBuilder::new().build() } @@ -1195,12 +1202,16 @@ impl<'s, P: Borrow> ParserI<'s, P> { )); } let inner_span = self.span(); - if self.bump_if("?P<") { + let mut starts_with_p = true; + if self.bump_if("?P<") || { + starts_with_p = false; + self.bump_if("?<") + } { let capture_index = self.next_capture_index(open_span)?; - let cap = self.parse_capture_name(capture_index)?; + let name = self.parse_capture_name(capture_index)?; Ok(Either::Right(ast::Group { span: open_span, - kind: ast::GroupKind::CaptureName(cap), + kind: ast::GroupKind::CaptureName { starts_with_p, name }, ast: Box::new(Ast::Empty(self.span())), })) } else if self.bump_if("?") { @@ -1370,6 +1381,7 @@ impl<'s, P: Borrow> ParserI<'s, P> { 's' => Ok(ast::Flag::DotMatchesNewLine), 'U' => Ok(ast::Flag::SwapGreed), 'u' => Ok(ast::Flag::Unicode), + 'R' => Ok(ast::Flag::CRLF), 'x' => Ok(ast::Flag::IgnoreWhitespace), _ => { Err(self @@ -1483,7 +1495,14 @@ impl<'s, P: Borrow> ParserI<'s, P> { if is_meta_character(c) { return Ok(Primitive::Literal(ast::Literal { span, - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, + c, + })); + } + if is_escapeable_character(c) { + return Ok(Primitive::Literal(ast::Literal { + span, + kind: ast::LiteralKind::Superfluous, c, })); } @@ -1501,9 +1520,6 @@ impl<'s, P: Borrow> ParserI<'s, P> { 'n' => 
special(ast::SpecialLiteralKind::LineFeed, '\n'), 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), - ' ' if self.ignore_whitespace() => { - special(ast::SpecialLiteralKind::Space, ' ') - } 'A' => Ok(Primitive::Assertion(ast::Assertion { span, kind: ast::AssertionKind::StartText, @@ -1533,9 +1549,6 @@ impl<'s, P: Borrow> ParserI<'s, P> { /// Assuming the preconditions are met, this routine can never fail. #[inline(never)] fn parse_octal(&self) -> ast::Literal { - use std::char; - use std::u32; - assert!(self.parser().octal); assert!('0' <= self.char() && self.char() <= '7'); let start = self.pos(); @@ -1600,9 +1613,6 @@ impl<'s, P: Borrow> ParserI<'s, P> { &self, kind: ast::HexLiteralKind, ) -> Result { - use std::char; - use std::u32; - let mut scratch = self.parser().scratch.borrow_mut(); scratch.clear(); @@ -1646,9 +1656,6 @@ impl<'s, P: Borrow> ParserI<'s, P> { &self, kind: ast::HexLiteralKind, ) -> Result { - use std::char; - use std::u32; - let mut scratch = self.parser().scratch.borrow_mut(); scratch.clear(); @@ -2146,7 +2153,7 @@ impl<'p, 's, P: Borrow> NestLimiter<'p, 's, P> { let new = self.depth.checked_add(1).ok_or_else(|| { self.p.error( span.clone(), - ast::ErrorKind::NestLimitExceeded(::std::u32::MAX), + ast::ErrorKind::NestLimitExceeded(u32::MAX), ) })?; let limit = self.p.parser().nest_limit; @@ -2297,11 +2304,14 @@ fn specialize_err( #[cfg(test)] mod tests { - use std::ops::Range; + use core::ops::Range; + + use alloc::format; - use super::{Parser, ParserBuilder, ParserI, Primitive}; use crate::ast::{self, Ast, Position, Span}; + use super::*; + // Our own assert_eq, which has slightly better formatting (but honestly // still kind of crappy). macro_rules! assert_eq { @@ -2414,13 +2424,9 @@ mod tests { lit_with(c, span(start..start + c.len_utf8())) } - /// Create a punctuation literal starting at the given position. 
- fn punct_lit(c: char, span: Span) -> Ast { - Ast::Literal(ast::Literal { - span, - kind: ast::LiteralKind::Punctuation, - c, - }) + /// Create a meta literal starting at the given position. + fn meta_lit(c: char, span: Span) -> Ast { + Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. @@ -2704,24 +2710,24 @@ bar Ok(concat( 0..36, vec![ - punct_lit('\\', span(0..2)), - punct_lit('.', span(2..4)), - punct_lit('+', span(4..6)), - punct_lit('*', span(6..8)), - punct_lit('?', span(8..10)), - punct_lit('(', span(10..12)), - punct_lit(')', span(12..14)), - punct_lit('|', span(14..16)), - punct_lit('[', span(16..18)), - punct_lit(']', span(18..20)), - punct_lit('{', span(20..22)), - punct_lit('}', span(22..24)), - punct_lit('^', span(24..26)), - punct_lit('$', span(26..28)), - punct_lit('#', span(28..30)), - punct_lit('&', span(30..32)), - punct_lit('-', span(32..34)), - punct_lit('~', span(34..36)), + meta_lit('\\', span(0..2)), + meta_lit('.', span(2..4)), + meta_lit('+', span(4..6)), + meta_lit('*', span(6..8)), + meta_lit('?', span(8..10)), + meta_lit('(', span(10..12)), + meta_lit(')', span(12..14)), + meta_lit('|', span(14..16)), + meta_lit('[', span(16..18)), + meta_lit(']', span(18..20)), + meta_lit('{', span(20..22)), + meta_lit('}', span(22..24)), + meta_lit('^', span(24..26)), + meta_lit('$', span(26..28)), + meta_lit('#', span(28..30)), + meta_lit('&', span(30..32)), + meta_lit('-', span(32..34)), + meta_lit('~', span(34..36)), ] )) ); @@ -2799,11 +2805,14 @@ bar flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::Group(ast::Group { span: span_range(pat, 4..pat.len()), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span_range(pat, 9..12), - name: s("foo"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span_range(pat, 9..12), + name: s("foo"), + index: 1, + } + }, ast: Box::new(lit_with('a', 
span_range(pat, 14..15))), }), ] @@ -2870,23 +2879,12 @@ bar flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::Literal(ast::Literal { span: span_range(pat, 4..6), - kind: ast::LiteralKind::Special( - ast::SpecialLiteralKind::Space - ), + kind: ast::LiteralKind::Superfluous, c: ' ', }), ] )) ); - // ... but only when `x` mode is enabled. - let pat = r"\ "; - assert_eq!( - parser(pat).parse().unwrap_err(), - TestError { - span: span_range(pat, 0..2), - kind: ast::ErrorKind::EscapeUnrecognized, - } - ); } #[test] @@ -3818,15 +3816,33 @@ bar #[test] fn parse_capture_name() { + assert_eq!( + parser("(?z)").parse(), + Ok(Ast::Group(ast::Group { + span: span(0..7), + kind: ast::GroupKind::CaptureName { + starts_with_p: false, + name: ast::CaptureName { + span: span(3..4), + name: s("a"), + index: 1, + } + }, + ast: Box::new(lit('z', 5)), + })) + ); assert_eq!( parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..8), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..5), - name: s("a"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..5), + name: s("a"), + index: 1, + } + }, ast: Box::new(lit('z', 6)), })) ); @@ -3834,11 +3850,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("abc"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("abc"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3847,11 +3866,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("a_1"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a_1"), + index: 1, + } + }, ast: 
Box::new(lit('z', 8)), })) ); @@ -3860,11 +3882,14 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..10), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..7), - name: s("a.1"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..7), + name: s("a.1"), + index: 1, + } + }, ast: Box::new(lit('z', 8)), })) ); @@ -3873,15 +3898,67 @@ bar parser("(?Pz)").parse(), Ok(Ast::Group(ast::Group { span: span(0..11), - kind: ast::GroupKind::CaptureName(ast::CaptureName { - span: span(4..8), - name: s("a[1]"), - index: 1, - }), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: span(4..8), + name: s("a[1]"), + index: 1, + } + }, ast: Box::new(lit('z', 9)), })) ); + assert_eq!( + parser("(?P)").parse(), + Ok(Ast::Group(ast::Group { + span: Span::new( + Position::new(0, 1, 1), + Position::new(9, 1, 9), + ), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: Span::new( + Position::new(4, 1, 5), + Position::new(7, 1, 7), + ), + name: s("a¾"), + index: 1, + } + }, + ast: Box::new(Ast::Empty(Span::new( + Position::new(8, 1, 8), + Position::new(8, 1, 8), + ))), + })) + ); + assert_eq!( + parser("(?P<名字>)").parse(), + Ok(Ast::Group(ast::Group { + span: Span::new( + Position::new(0, 1, 1), + Position::new(12, 1, 9), + ), + kind: ast::GroupKind::CaptureName { + starts_with_p: true, + name: ast::CaptureName { + span: Span::new( + Position::new(4, 1, 5), + Position::new(10, 1, 7), + ), + name: s("名字"), + index: 1, + } + }, + ast: Box::new(Ast::Empty(Span::new( + Position::new(11, 1, 8), + Position::new(11, 1, 8), + ))), + })) + ); + assert_eq!( parser("(?P<").parse().unwrap_err(), TestError { @@ -3940,6 +4017,60 @@ bar }, } ); + assert_eq!( + parser("(?P<5>)").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + 
parser("(?P<5a>)").parse().unwrap_err(), + TestError { + span: span(4..5), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<¾>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(6, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<¾a>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(6, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P<☃>)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(4, 1, 5), + Position::new(7, 1, 6), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); + assert_eq!( + parser("(?P)").parse().unwrap_err(), + TestError { + span: Span::new( + Position::new(5, 1, 6), + Position::new(8, 1, 7), + ), + kind: ast::ErrorKind::GroupNameInvalid, + } + ); } #[test] @@ -4046,6 +4177,34 @@ bar ], }) ); + assert_eq!( + parser("i-sR:").parse_flags(), + Ok(ast::Flags { + span: span(0..4), + items: vec![ + ast::FlagsItem { + span: span(0..1), + kind: ast::FlagsItemKind::Flag( + ast::Flag::CaseInsensitive + ), + }, + ast::FlagsItem { + span: span(1..2), + kind: ast::FlagsItemKind::Negation, + }, + ast::FlagsItem { + span: span(2..3), + kind: ast::FlagsItemKind::Flag( + ast::Flag::DotMatchesNewLine + ), + }, + ast::FlagsItem { + span: span(3..4), + kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF), + }, + ], + }) + ); assert_eq!( parser("isU").parse_flags().unwrap_err(), @@ -4107,6 +4266,7 @@ bar assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); + assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF)); assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); assert_eq!( @@ -4178,7 +4338,7 @@ bar parser(r"\|").parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: 
span(0..2), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '|', })) ); @@ -4229,11 +4389,26 @@ bar })) ); + // We also support superfluous escapes in most cases now too. + for c in ['!', '@', '%', '"', '\'', '/', ' '] { + let pat = format!(r"\{}", c); + assert_eq!( + parser(&pat).parse_primitive(), + Ok(Primitive::Literal(ast::Literal { + span: span(0..2), + kind: ast::LiteralKind::Superfluous, + c, + })) + ); + } + + // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This + // gives flexibility for future evolution. assert_eq!( - parser(r"\").parse_escape().unwrap_err(), + parser(r"\e").parse_escape().unwrap_err(), TestError { - span: span(0..1), - kind: ast::ErrorKind::EscapeUnexpectedEof, + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, } ); assert_eq!( @@ -4243,6 +4418,31 @@ bar kind: ast::ErrorKind::EscapeUnrecognized, } ); + // But also, < and > are banned, so that we may evolve them into + // start/end word boundary assertions. (Not sure if we will...) + assert_eq!( + parser(r"\<").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + assert_eq!( + parser(r"\>").parse_escape().unwrap_err(), + TestError { + span: span(0..2), + kind: ast::ErrorKind::EscapeUnrecognized, + } + ); + + // An unfinished escape is illegal. 
+ assert_eq!( + parser(r"\").parse_escape().unwrap_err(), + TestError { + span: span(0..1), + kind: ast::ErrorKind::EscapeUnexpectedEof, + } + ); } #[test] @@ -4272,7 +4472,7 @@ bar Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::Octal, - c: ::std::char::from_u32(i).unwrap(), + c: char::from_u32(i).unwrap(), })) ); } @@ -4347,7 +4547,7 @@ bar Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), - c: ::std::char::from_u32(i).unwrap(), + c: char::from_u32(i).unwrap(), })) ); } @@ -4378,7 +4578,7 @@ bar #[test] fn parse_hex_four() { for i in 0..65536 { - let c = match ::std::char::from_u32(i) { + let c = match char::from_u32(i) { None => continue, Some(c) => c, }; @@ -4442,7 +4642,7 @@ bar #[test] fn parse_hex_eight() { for i in 0..65536 { - let c = match ::std::char::from_u32(i) { + let c = match char::from_u32(i) { None => continue, Some(c) => c, }; @@ -4839,7 +5039,7 @@ bar lit(span(1..2), 'a'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: ']', }), ] @@ -4857,7 +5057,7 @@ bar lit(span(1..2), 'a'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '-', }), lit(span(4..5), 'z'), @@ -5049,7 +5249,7 @@ bar span(1..6), itemset(ast::ClassSetItem::Literal(ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '^', })), itemset(lit(span(5..6), '^')), @@ -5065,7 +5265,7 @@ bar span(1..6), itemset(ast::ClassSetItem::Literal(ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '&', })), itemset(lit(span(5..6), '&')), @@ -5130,7 +5330,7 @@ bar lit(span(1..2), ']'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: 
'[', }), ] @@ -5148,7 +5348,7 @@ bar kind: itemset(ast::ClassSetItem::Literal( ast::Literal { span: span(1..3), - kind: ast::LiteralKind::Punctuation, + kind: ast::LiteralKind::Meta, c: '[', } )), diff --git a/regex-syntax/src/ast/print.rs b/regex-syntax/src/ast/print.rs index 045de2eaf2..86a87e1439 100644 --- a/regex-syntax/src/ast/print.rs +++ b/regex-syntax/src/ast/print.rs @@ -2,10 +2,13 @@ This module provides a regular expression printer for `Ast`. */ -use std::fmt; +use core::fmt; -use crate::ast::visitor::{self, Visitor}; -use crate::ast::{self, Ast}; +use crate::ast::{ + self, + visitor::{self, Visitor}, + Ast, +}; /// A builder for constructing a printer. /// @@ -157,9 +160,10 @@ impl Writer { use crate::ast::GroupKind::*; match ast.kind { CaptureIndex(_) => self.wtr.write_str("("), - CaptureName(ref x) => { - self.wtr.write_str("(?P<")?; - self.wtr.write_str(&x.name)?; + CaptureName { ref name, starts_with_p } => { + let start = if starts_with_p { "(?P<" } else { "(?<" }; + self.wtr.write_str(start)?; + self.wtr.write_str(&name.name)?; self.wtr.write_str(">")?; Ok(()) } @@ -212,25 +216,25 @@ impl Writer { match ast.kind { Verbatim => self.wtr.write_char(ast.c), - Punctuation => write!(self.wtr, r"\{}", ast.c), - Octal => write!(self.wtr, r"\{:o}", ast.c as u32), + Meta | Superfluous => write!(self.wtr, r"\{}", ast.c), + Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)), HexFixed(ast::HexLiteralKind::X) => { - write!(self.wtr, r"\x{:02X}", ast.c as u32) + write!(self.wtr, r"\x{:02X}", u32::from(ast.c)) } HexFixed(ast::HexLiteralKind::UnicodeShort) => { - write!(self.wtr, r"\u{:04X}", ast.c as u32) + write!(self.wtr, r"\u{:04X}", u32::from(ast.c)) } HexFixed(ast::HexLiteralKind::UnicodeLong) => { - write!(self.wtr, r"\U{:08X}", ast.c as u32) + write!(self.wtr, r"\U{:08X}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::X) => { - write!(self.wtr, r"\x{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c)) } 
HexBrace(ast::HexLiteralKind::UnicodeShort) => { - write!(self.wtr, r"\u{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::UnicodeLong) => { - write!(self.wtr, r"\U{{{:X}}}", ast.c as u32) + write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c)) } Special(ast::SpecialLiteralKind::Bell) => { self.wtr.write_str(r"\a") @@ -285,6 +289,7 @@ impl Writer { Flag::DotMatchesNewLine => self.wtr.write_str("s"), Flag::SwapGreed => self.wtr.write_str("U"), Flag::Unicode => self.wtr.write_str("u"), + Flag::CRLF => self.wtr.write_str("R"), Flag::IgnoreWhitespace => self.wtr.write_str("x"), }, }?; @@ -395,9 +400,12 @@ impl Writer { #[cfg(test)] mod tests { - use super::Printer; + use alloc::string::String; + use crate::ast::parse::ParserBuilder; + use super::*; + fn roundtrip(given: &str) { roundtrip_with(|b| b, given); } @@ -499,6 +507,7 @@ mod tests { fn print_group() { roundtrip("(?i:a)"); roundtrip("(?Pa)"); + roundtrip("(?a)"); roundtrip("(a)"); } diff --git a/regex-syntax/src/ast/visitor.rs b/regex-syntax/src/ast/visitor.rs index 78ee487cff..ab136739e6 100644 --- a/regex-syntax/src/ast/visitor.rs +++ b/regex-syntax/src/ast/visitor.rs @@ -1,4 +1,4 @@ -use std::fmt; +use alloc::{vec, vec::Vec}; use crate::ast::{self, Ast}; @@ -11,15 +11,12 @@ use crate::ast::{self, Ast}; /// may be proportional to end user input. /// /// Typical usage of this trait involves providing an implementation and then -/// running it using the [`visit`](fn.visit.html) function. +/// running it using the [`visit`] function. /// /// Note that the abstract syntax tree for a regular expression is quite -/// complex. Unless you specifically need it, you might be able to use the -/// much simpler -/// [high-level intermediate representation](../hir/struct.Hir.html) -/// and its -/// [corresponding `Visitor` trait](../hir/trait.Visitor.html) -/// instead. +/// complex. 
Unless you specifically need it, you might be able to use the much +/// simpler [high-level intermediate representation](crate::hir::Hir) and its +/// [corresponding `Visitor` trait](crate::hir::Visitor) instead. pub trait Visitor { /// The result of visiting an AST. type Output; @@ -46,13 +43,12 @@ pub trait Visitor { } /// This method is called between child nodes of an - /// [`Alternation`](struct.Alternation.html). + /// [`Alternation`](ast::Alternation). fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { Ok(()) } - /// This method is called on every - /// [`ClassSetItem`](enum.ClassSetItem.html) + /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) /// before descending into child nodes. fn visit_class_set_item_pre( &mut self, @@ -61,8 +57,7 @@ pub trait Visitor { Ok(()) } - /// This method is called on every - /// [`ClassSetItem`](enum.ClassSetItem.html) + /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) /// after descending into child nodes. fn visit_class_set_item_post( &mut self, @@ -72,8 +67,8 @@ pub trait Visitor { } /// This method is called on every - /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) - /// before descending into child nodes. + /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) before descending into + /// child nodes. fn visit_class_set_binary_op_pre( &mut self, _ast: &ast::ClassSetBinaryOp, @@ -82,8 +77,8 @@ pub trait Visitor { } /// This method is called on every - /// [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html) - /// after descending into child nodes. + /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) after descending into child + /// nodes. fn visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, @@ -92,7 +87,7 @@ pub trait Visitor { } /// This method is called between the left hand and right hand child nodes - /// of a [`ClassSetBinaryOp`](struct.ClassSetBinaryOp.html). + /// of a [`ClassSetBinaryOp`](ast::ClassSetBinaryOp). 
fn visit_class_set_binary_op_in( &mut self, _ast: &ast::ClassSetBinaryOp, @@ -104,8 +99,7 @@ pub trait Visitor { /// Executes an implementation of `Visitor` in constant stack space. /// /// This function will visit every node in the given `Ast` while calling the -/// appropriate methods provided by the -/// [`Visitor`](trait.Visitor.html) trait. +/// appropriate methods provided by the [`Visitor`] trait. /// /// The primary use case for this method is when one wants to perform case /// analysis over an `Ast` without using a stack size proportional to the depth @@ -475,8 +469,8 @@ impl<'a> ClassInduct<'a> { } } -impl<'a> fmt::Debug for ClassFrame<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl<'a> core::fmt::Debug for ClassFrame<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let x = match *self { ClassFrame::Union { .. } => "Union", ClassFrame::Binary { .. } => "Binary", @@ -487,8 +481,8 @@ impl<'a> fmt::Debug for ClassFrame<'a> { } } -impl<'a> fmt::Debug for ClassInduct<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl<'a> core::fmt::Debug for ClassInduct<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let x = match *self { ClassInduct::Item(it) => match *it { ast::ClassSetItem::Empty(_) => "Item(Empty)", diff --git a/regex-syntax/src/debug.rs b/regex-syntax/src/debug.rs new file mode 100644 index 0000000000..a0b051b441 --- /dev/null +++ b/regex-syntax/src/debug.rs @@ -0,0 +1,107 @@ +/// A type that wraps a single byte with a convenient fmt::Debug impl that +/// escapes the byte. +pub(crate) struct Byte(pub(crate) u8); + +impl core::fmt::Debug for Byte { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + // Special case ASCII space. It's too hard to read otherwise, so + // put quotes around it. I sometimes wonder whether just '\x20' would + // be better... 
+ if self.0 == b' ' { + return write!(f, "' '"); + } + // 10 bytes is enough to cover any output from ascii::escape_default. + let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} + +/// A type that provides a human readable debug impl for arbitrary bytes. +/// +/// This generally works best when the bytes are presumed to be mostly UTF-8, +/// but will work for anything. +/// +/// N.B. This is copied nearly verbatim from regex-automata. Sigh. +pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]); + +impl<'a> core::fmt::Debug for Bytes<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + // This is a sad re-implementation of a similar impl found in bstr. + let mut bytes = self.0; + while let Some(result) = utf8_decode(bytes) { + let ch = match result { + Ok(ch) => ch, + Err(byte) => { + write!(f, r"\x{:02x}", byte)?; + bytes = &bytes[1..]; + continue; + } + }; + bytes = &bytes[ch.len_utf8()..]; + match ch { + '\0' => write!(f, "\\0")?, + // ASCII control characters except \0, \n, \r, \t + '\x01'..='\x08' + | '\x0b' + | '\x0c' + | '\x0e'..='\x19' + | '\x7f' => { + write!(f, "\\x{:02x}", u32::from(ch))?; + } + '\n' | '\r' | '\t' | _ => { + write!(f, "{}", ch.escape_debug())?; + } + } + } + write!(f, "\"")?; + Ok(()) + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the given +/// byte slice, then the first byte is returned instead. +/// +/// This returns `None` if and only if `bytes` is empty. 
+pub(crate) fn utf8_decode(bytes: &[u8]) -> Option> { + fn len(byte: u8) -> Option { + if byte <= 0x7F { + return Some(1); + } else if byte & 0b1100_0000 == 0b1000_0000 { + return None; + } else if byte <= 0b1101_1111 { + Some(2) + } else if byte <= 0b1110_1111 { + Some(3) + } else if byte <= 0b1111_0111 { + Some(4) + } else { + None + } + } + + if bytes.is_empty() { + return None; + } + let len = match len(bytes[0]) { + None => return Some(Err(bytes[0])), + Some(len) if len > bytes.len() => return Some(Err(bytes[0])), + Some(1) => return Some(Ok(char::from(bytes[0]))), + Some(len) => len, + }; + match core::str::from_utf8(&bytes[..len]) { + Ok(s) => Some(Ok(s.chars().next().unwrap())), + Err(_) => Some(Err(bytes[0])), + } +} diff --git a/regex-syntax/src/error.rs b/regex-syntax/src/error.rs index 1230d2fc5d..98869c4f79 100644 --- a/regex-syntax/src/error.rs +++ b/regex-syntax/src/error.rs @@ -1,15 +1,17 @@ -use std::cmp; -use std::error; -use std::fmt; -use std::result; +use alloc::{ + format, + string::{String, ToString}, + vec, + vec::Vec, +}; -use crate::ast; -use crate::hir; - -/// A type alias for dealing with errors returned by this crate. -pub type Result = result::Result; +use crate::{ast, hir}; /// This error type encompasses any error that can be returned by this crate. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum Error { /// An error that occurred while translating concrete syntax into abstract @@ -18,13 +20,6 @@ pub enum Error { /// An error that occurred while translating abstract syntax into a high /// level intermediate representation (HIR). Translate(hir::Error), - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. 
(Otherwise, adding a new variant - /// could break existing code.) - #[doc(hidden)] - __Nonexhaustive, } impl From for Error { @@ -39,24 +34,14 @@ impl From for Error { } } -impl error::Error for Error { - // TODO: Remove this method entirely on the next breaking semver release. - #[allow(deprecated)] - fn description(&self) -> &str { - match *self { - Error::Parse(ref x) => x.description(), - Error::Translate(ref x) => x.description(), - _ => unreachable!(), - } - } -} +#[cfg(feature = "std")] +impl std::error::Error for Error {} -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match *self { Error::Parse(ref x) => x.fmt(f), Error::Translate(ref x) => x.fmt(f), - _ => unreachable!(), } } } @@ -101,8 +86,8 @@ impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> { } } -impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl<'e, E: core::fmt::Display> core::fmt::Display for Formatter<'e, E> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let spans = Spans::from_formatter(self); if self.pattern.contains('\n') { let divider = repeat_char('~', 79); @@ -168,7 +153,7 @@ struct Spans<'p> { impl<'p> Spans<'p> { /// Build a sequence of spans from a formatter. 
- fn from_formatter<'e, E: fmt::Display>( + fn from_formatter<'e, E: core::fmt::Display>( fmter: &'p Formatter<'e, E>, ) -> Spans<'p> { let mut line_count = fmter.pattern.lines().count(); @@ -248,7 +233,7 @@ impl<'p> Spans<'p> { pos += 1; } let note_len = span.end.column.saturating_sub(span.start.column); - for _ in 0..cmp::max(1, note_len) { + for _ in 0..core::cmp::max(1, note_len) { notes.push('^'); pos += 1; } @@ -281,11 +266,13 @@ impl<'p> Spans<'p> { } fn repeat_char(c: char, count: usize) -> String { - ::std::iter::repeat(c).take(count).collect() + core::iter::repeat(c).take(count).collect() } #[cfg(test)] mod tests { + use alloc::string::ToString; + use crate::ast::parse::Parser; fn assert_panic_message(pattern: &str, expected_msg: &str) { diff --git a/regex-syntax/src/hir/interval.rs b/regex-syntax/src/hir/interval.rs index 56698c53af..e063390a8f 100644 --- a/regex-syntax/src/hir/interval.rs +++ b/regex-syntax/src/hir/interval.rs @@ -1,8 +1,6 @@ -use std::char; -use std::cmp; -use std::fmt::Debug; -use std::slice; -use std::u8; +use core::{char, cmp, fmt::Debug, slice}; + +use alloc::vec::Vec; use crate::unicode; @@ -32,9 +30,38 @@ use crate::unicode; // // Tests on this are relegated to the public API of HIR in src/hir.rs. -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Debug)] pub struct IntervalSet { + /// A sorted set of non-overlapping ranges. ranges: Vec, + /// While not required at all for correctness, we keep track of whether an + /// interval set has been case folded or not. This helps us avoid doing + /// redundant work if, for example, a set has already been cased folded. + /// And note that whether a set is folded or not is preserved through + /// all of the pairwise set operations. That is, if both interval sets + /// have been case folded, then any of difference, union, intersection or + /// symmetric difference all produce a case folded set. 
+ /// + /// Note that when this is true, it *must* be the case that the set is case + /// folded. But when it's false, the set *may* be case folded. In other + /// words, we only set this to true when we know it to be the case, but we're + /// okay with it being false if it would otherwise be costly to determine + /// whether it should be true. This means code cannot assume that a false + /// value necessarily indicates that the set is not case folded. + /// + /// Bottom line: this is a performance optimization. + folded: bool, +} + +impl Eq for IntervalSet {} + +// We implement PartialEq manually so that we don't consider the set's internal +// 'folded' property to be part of its identity. The 'folded' property is +// strictly an optimization. +impl PartialEq for IntervalSet { + fn eq(&self, other: &IntervalSet) -> bool { + self.ranges.eq(&other.ranges) + } } impl IntervalSet { @@ -44,7 +71,10 @@ impl IntervalSet { /// The given ranges do not need to be in any specific order, and ranges /// may overlap. pub fn new>(intervals: T) -> IntervalSet { - let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; + let ranges: Vec = intervals.into_iter().collect(); + // An empty set is case folded. + let folded = ranges.is_empty(); + let mut set = IntervalSet { ranges, folded }; set.canonicalize(); set } @@ -55,6 +85,10 @@ impl IntervalSet { // it preserves canonicalization. self.ranges.push(interval); self.canonicalize(); + // We don't know whether the new interval added here is considered + // case folded, so we conservatively assume that the entire set is + // no longer case folded if it was previously. + self.folded = false; } /// Return an iterator over all intervals in this set. @@ -79,6 +113,9 @@ /// This returns an error if the necessary case mapping data is not /// available. 
pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { + if self.folded { + return Ok(()); + } let len = self.ranges.len(); for i in 0..len { let range = self.ranges[i]; @@ -88,14 +125,19 @@ } } self.canonicalize(); + self.folded = true; Ok(()) } /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet) { + if other.ranges.is_empty() || self.ranges == other.ranges { + return; + } // This could almost certainly be done more efficiently. self.ranges.extend(&other.ranges); self.canonicalize(); + self.folded = self.folded && other.folded; } /// Intersect this set with the given set, in place. @@ -105,6 +147,8 @@ } if other.ranges.is_empty() { self.ranges.clear(); + // An empty set is case folded. + self.folded = true; return; } @@ -134,6 +178,7 @@ } } self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; } /// Subtract the given set from this set, in place. @@ -226,6 +271,7 @@ a += 1; } self.ranges.drain(..drain_end); + self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. @@ -251,6 +297,8 @@ if self.ranges.is_empty() { let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); self.ranges.push(I::create(min, max)); + // The set containing everything must be case folded. + self.folded = true; return; } @@ -276,6 +324,19 @@ self.ranges.push(I::create(lower, I::Bound::max_value())); } self.ranges.drain(..drain_end); + // We don't need to update whether this set is folded or not, because + // it is conservatively preserved through negation. Namely, if a set + // is not folded, then it is possible that its negation is folded, for + // example, [^☃]. But we're fine with assuming that the set is not + // folded in that case. (`folded` permits false negatives but not false + // positives.) 
+ // + // But what about when a set is folded, is its negation also + // necessarily folded? Yes. Because if a set is folded, then for every + // character in the set, it necessarily included its equivalence class + // of case folded characters. Negating it in turn means that all + // equivalence classes in the set are negated, and any equivalence + // class that was previously not in the set is now entirely in the set. } /// Converts this set into a canonical ordering. @@ -481,7 +542,7 @@ impl Bound for u8 { u8::MAX } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { self.checked_add(1).unwrap() @@ -499,20 +560,20 @@ impl Bound for char { '\u{10FFFF}' } fn as_u32(self) -> u32 { - self as u32 + u32::from(self) } fn increment(self) -> Self { match self { '\u{D7FF}' => '\u{E000}', - c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(), } } fn decrement(self) -> Self { match self { '\u{E000}' => '\u{D7FF}', - c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), + c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(), } } } diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs new file mode 100644 index 0000000000..bd3a2d143b --- /dev/null +++ b/regex-syntax/src/hir/literal.rs @@ -0,0 +1,3165 @@ +/*! +Provides literal extraction from `Hir` expressions. + +An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a +[`Seq`] of [`Literal`]s. + +The purpose of literal extraction is generally to provide avenues for +optimizing regex searches. The main idea is that substring searches can be an +order of magnitude faster than a regex search. Therefore, if one can execute +a substring search to find candidate match locations and only run the regex +search at those locations, then it is possible for huge improvements in +performance to be realized. 
+
+With that said, literal optimizations are generally a black art because even
+though substring search is generally faster, if the number of candidates
+produced is high, then it can create a lot of overhead by ping-ponging between
+the substring search and the regex search.
+
+Here are some heuristics that might be used to help increase the chances of
+effective literal optimizations:
+
+* Stick to small [`Seq`]s. If you search for too many literals, it's likely
+to lead to substring search that is only a little faster than a regex search,
+and thus the overhead of using literal optimizations in the first place might
+make things slower overall.
+* The literals in your [`Seq`] shouldn't be too short. In general, longer is
+better. A sequence corresponding to single bytes that occur frequently in the
+haystack, for example, is probably a bad literal optimization because it's
+likely to produce many false positive candidates. Longer literals are less
+likely to match, and thus probably produce fewer false positives.
+* If it's possible to estimate the approximate frequency of each byte according
+to some pre-computed background distribution, it is possible to compute a score
+of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider
+skipping the literal optimization and just use the regex engine.
+
+(It should be noted that there are always pathological cases that can make
+any kind of literal optimization be a net slower result. This is why it
+might be a good idea to be conservative, or to even provide a means for
+literal optimizations to be dynamically disabled if they are determined to be
+ineffective according to some measure.)
+
+You're encouraged to explore the methods on [`Seq`], which permit shrinking
+the size of sequences in a preference-order preserving fashion.
+
+Finally, note that it isn't strictly necessary to use an [`Extractor`]. 
Namely,
+an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types,
+so it is possible to implement your own extractor. For example, for n-grams
+or "inner" literals (i.e., not prefix or suffix literals). The `Extractor`
+is mostly responsible for the case analysis over `Hir` expressions. Much of
+the "trickier" parts are how to combine literal sequences, and that is all
+implemented on [`Seq`].
+*/
+
+use core::{cmp, mem};
+
+use alloc::{vec, vec::Vec};
+
+use crate::hir::{self, Hir};
+
+/// Extracts prefix or suffix literal sequences from [`Hir`] expressions.
+///
+/// Literal extraction is based on the following observations:
+///
+/// * Many regexes start with one or a small number of literals.
+/// * Substring search for literals is often much faster (sometimes by an order
+/// of magnitude) than a regex search.
+///
+/// Thus, in many cases, one can search for literals to find candidate starting
+/// locations of a match, and then only run the full regex engine at each such
+/// location instead of over the full haystack.
+///
+/// The main downside of literal extraction is that it can wind up causing a
+/// search to be slower overall. For example, if there are many matches or if
+/// there are many candidates that don't ultimately lead to a match, then a
+/// lot of overhead will be spent in shuffling back-and-forth between substring
+/// search and the regex engine. This is the fundamental reason why literal
+/// optimizations for regex patterns are sometimes considered a "black art."
+///
+/// # Look-around assertions
+///
+/// Literal extraction treats all look-around assertions as-if they match every
+/// empty string. So for example, the regex `\bquux\b` will yield a sequence
+/// containing a single exact literal `quux`. However, not all occurrences
+/// of `quux` correspond to a match of the regex. For example, `\bquux\b`
+/// does not match `ZquuxZ` anywhere because `quux` does not fall on a word
+/// boundary. 
+/// +/// In effect, if your regex contains look-around assertions, then a match of +/// an exact literal does not necessarily mean the regex overall matches. So +/// you may still need to run the regex engine in such cases to confirm the +/// match. +/// +/// The precise guarantee you get from a literal sequence is: if every literal +/// in the sequence is exact and the original regex contains zero look-around +/// assertions, then a preference-order multi-substring search of those +/// literals will precisely match a preference-order search of the original +/// regex. +/// +/// # Example +/// +/// This shows how to extract prefixes: +/// +/// ``` +/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; +/// +/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?; +/// let got = Extractor::new().extract(&hir); +/// // All literals returned are "inexact" because none of them reach the +/// // match state. +/// let expected = Seq::from_iter([ +/// Literal::inexact("ax"), +/// Literal::inexact("ay"), +/// Literal::inexact("az"), +/// Literal::inexact("bx"), +/// Literal::inexact("by"), +/// Literal::inexact("bz"), +/// Literal::inexact("cx"), +/// Literal::inexact("cy"), +/// Literal::inexact("cz"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box>(()) +/// ``` +/// +/// This shows how to extract suffixes: +/// +/// ``` +/// use regex_syntax::{ +/// hir::literal::{Extractor, ExtractKind, Literal, Seq}, +/// parse, +/// }; +/// +/// let hir = parse(r"foo|[A-Z]+bar")?; +/// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir); +/// // Since 'foo' gets to a match state, it is considered exact. But 'bar' +/// // does not because of the '[A-Z]+', and thus is marked inexact. 
+/// let expected = Seq::from_iter([ +/// Literal::exact("foo"), +/// Literal::inexact("bar"), +/// ]); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Extractor { + kind: ExtractKind, + limit_class: usize, + limit_repeat: usize, + limit_literal_len: usize, + limit_total: usize, +} + +impl Extractor { + /// Create a new extractor with a default configuration. + /// + /// The extractor can be optionally configured before calling + /// [`Extractor::extract`] to get a literal sequence. + pub fn new() -> Extractor { + Extractor { + kind: ExtractKind::Prefix, + limit_class: 10, + limit_repeat: 10, + limit_literal_len: 100, + limit_total: 250, + } + } + + /// Execute the extractor and return a sequence of literals. + pub fn extract(&self, hir: &Hir) -> Seq { + use crate::hir::HirKind::*; + + match *hir.kind() { + Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])), + Literal(hir::Literal(ref bytes)) => { + let mut seq = + Seq::singleton(self::Literal::exact(bytes.to_vec())); + self.enforce_literal_len(&mut seq); + seq + } + Class(hir::Class::Unicode(ref cls)) => { + self.extract_class_unicode(cls) + } + Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), + Repetition(ref rep) => self.extract_repetition(rep), + Capture(hir::Capture { ref sub, .. }) => self.extract(sub), + Concat(ref hirs) => match self.kind { + ExtractKind::Prefix => self.extract_concat(hirs.iter()), + ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()), + }, + Alternation(ref hirs) => { + // Unlike concat, we always union starting from the beginning, + // since the beginning corresponds to the highest preference, + // which doesn't change based on forwards vs reverse. + self.extract_alternation(hirs.iter()) + } + } + } + + /// Set the kind of literal sequence to extract from an [`Hir`] expression. + /// + /// The default is to extract prefixes, but suffixes can be selected + /// instead. 
The contract for prefixes is that every match of the + /// corresponding `Hir` must start with one of the literals in the sequence + /// returned. Moreover, the _order_ of the sequence returned corresponds to + /// the preference order. + /// + /// Suffixes satisfy a similar contract in that every match of the + /// corresponding `Hir` must end with one of the literals in the sequence + /// returned. However, there is no guarantee that the literals are in + /// preference order. + /// + /// Remember that a sequence can be infinite. For example, unless the + /// limits are configured to be impractically large, attempting to extract + /// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite + /// sequence. Generally speaking, if the sequence returned is infinite, + /// then it is presumed to be unwise to do prefix (or suffix) optimizations + /// for the pattern. + pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor { + self.kind = kind; + self + } + + /// Configure a limit on the length of the sequence that is permitted for + /// a character class. If a character class exceeds this limit, then the + /// sequence returned for it is infinite. + /// + /// This prevents classes like `[A-Z]` or `\pL` from getting turned into + /// huge and likely unproductive sequences of literals. + /// + /// # Example + /// + /// This example shows how this limit can be lowered to decrease the tolerance + /// for character classes being turned into literal sequences. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Seq}, parse}; + /// + /// let hir = parse(r"[0-9]")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new([ + /// "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", + /// ]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. 
+ /// let got = Extractor::new().limit_class(4).extract(&hir); + /// let expected = Seq::infinite(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_class(&mut self, limit: usize) -> &mut Extractor { + self.limit_class = limit; + self + } + + /// Configure a limit on the total number of repetitions that is permitted + /// before literal extraction is stopped. + /// + /// This is useful for limiting things like `(abcde){50}`, or more + /// insidiously, `(?:){1000000000}`. This limit prevents any one single + /// repetition from adding too much to a literal sequence. + /// + /// With this limit set, repetitions that exceed it will be stopped and any + /// literals extracted up to that point will be made inexact. + /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"(abc){8}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_repeat(4).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabc"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor { + self.limit_repeat = limit; + self + } + + /// Configure a limit on the maximum length of any literal in a sequence. + /// + /// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While + /// each repetition or literal in that regex is small, when all the + /// repetitions are applied, one ends up with a literal of length `5^4 = + /// 625`. + /// + /// With this limit set, literals that exceed it will be made inexact and + /// thus prevented from growing. 
+ /// + /// # Example + /// + /// This shows how to decrease the limit and compares it with the default. + /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"(abc){2}{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]); + /// assert_eq!(expected, got); + /// + /// // Now let's shrink the limit and see how that changes things. + /// let got = Extractor::new().limit_literal_len(14).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("abcabcabcabcab"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor { + self.limit_literal_len = limit; + self + } + + /// Configure a limit on the total number of literals that will be + /// returned. + /// + /// This is useful as a practical measure for avoiding the creation of + /// large sequences of literals. While the extractor will automatically + /// handle local creations of large sequences (for example, `[A-Z]` yields + /// an infinite sequence by default), large sequences can be created + /// through non-local means as well. + /// + /// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9` + /// despite each of the repetitions being small on their own. This limit + /// thus represents a "catch all" for avoiding locally small sequences from + /// combining into large sequences. + /// + /// # Example + /// + /// This example shows how reducing the limit will change the literal + /// sequence returned. 
+ /// + /// ``` + /// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse}; + /// + /// let hir = parse(r"[ab]{2}{2}")?; + /// + /// let got = Extractor::new().extract(&hir); + /// let expected = Seq::new([ + /// "aaaa", "aaab", "aaba", "aabb", + /// "abaa", "abab", "abba", "abbb", + /// "baaa", "baab", "baba", "babb", + /// "bbaa", "bbab", "bbba", "bbbb", + /// ]); + /// assert_eq!(expected, got); + /// + /// // The default limit is not too big, but big enough to extract all + /// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16, + /// // then we'll get a truncated set. Notice that it returns a sequence of + /// // length 4 even though our limit was 10. This is because the sequence + /// // is difficult to increase without blowing the limit. Notice also + /// // that every literal in the sequence is now inexact because they were + /// // stripped of some suffix. + /// let got = Extractor::new().limit_total(10).extract(&hir); + /// let expected = Seq::from_iter([ + /// Literal::inexact("aa"), + /// Literal::inexact("ab"), + /// Literal::inexact("ba"), + /// Literal::inexact("bb"), + /// ]); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn limit_total(&mut self, limit: usize) -> &mut Extractor { + self.limit_total = limit; + self + } + + /// Extract a sequence from the given concatenation. Sequences from each of + /// the child HIR expressions are combined via cross product. + /// + /// This short circuits once the cross product turns into a sequence + /// containing only inexact literals. + fn extract_concat<'a, I: Iterator>(&self, it: I) -> Seq { + let mut seq = Seq::singleton(self::Literal::exact(vec![])); + for hir in it { + // If every element in the sequence is inexact, then a cross + // product will always be a no-op. Thus, there is nothing else we + // can add to it and can quit early. Note that this also includes + // infinite sequences. 
+ if seq.is_inexact() { + break; + } + // Note that 'cross' also dispatches based on whether we're + // extracting prefixes or suffixes. + seq = self.cross(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence from the given alternation. + /// + /// This short circuits once the union turns into an infinite sequence. + fn extract_alternation<'a, I: Iterator>( + &self, + it: I, + ) -> Seq { + let mut seq = Seq::empty(); + for hir in it { + // Once our 'seq' is infinite, every subsequent union + // operation on it will itself always result in an + // infinite sequence. Thus, it can never change and we can + // short-circuit. + if !seq.is_finite() { + break; + } + seq = self.union(seq, &mut self.extract(hir)); + } + seq + } + + /// Extract a sequence of literals from the given repetition. We do our + /// best, Some examples: + /// + /// 'a*' => [inexact(a), exact("")] + /// 'a*?' => [exact(""), inexact(a)] + /// 'a+' => [inexact(a)] + /// 'a{3}' => [exact(aaa)] + /// 'a{3,5} => [inexact(aaa)] + /// + /// The key here really is making sure we get the 'inexact' vs 'exact' + /// attributes correct on each of the literals we add. For example, the + /// fact that 'a*' gives us an inexact 'a' and an exact empty string means + /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)] + /// literals being extracted, which might actually be a better prefilter + /// than just 'a'. + fn extract_repetition(&self, rep: &hir::Repetition) -> Seq { + let mut subseq = self.extract(&rep.sub); + match *rep { + hir::Repetition { min: 0, max, greedy, .. } => { + // When 'max=1', we can retain exactness, since 'a?' is + // equivalent to 'a|'. Similarly below, 'a??' is equivalent to + // '|a'. + if max != Some(1) { + subseq.make_inexact(); + } + let mut empty = Seq::singleton(Literal::exact(vec![])); + if !greedy { + mem::swap(&mut subseq, &mut empty); + } + self.union(subseq, &mut empty) + } + hir::Repetition { min, max: Some(max), .. 
} if min == max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + if usize::try_from(min).is_err() || min > limit { + seq.make_inexact(); + } + seq + } + hir::Repetition { min, max: Some(max), .. } if min < max => { + assert!(min > 0); // handled above + let limit = + u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); + let mut seq = Seq::singleton(Literal::exact(vec![])); + for _ in 0..cmp::min(min, limit) { + if seq.is_inexact() { + break; + } + seq = self.cross(seq, &mut subseq.clone()); + } + seq.make_inexact(); + seq + } + hir::Repetition { .. } => { + subseq.make_inexact(); + subseq + } + } + } + + /// Convert the given Unicode class into a sequence of literals if the + /// class is small enough. If the class is too big, return an infinite + /// sequence. + fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq { + if self.class_over_limit_unicode(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for ch in r.start()..=r.end() { + seq.push(Literal::from(ch)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Convert the given byte class into a sequence of literals if the class + /// is small enough. If the class is too big, return an infinite sequence. + fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq { + if self.class_over_limit_bytes(cls) { + return Seq::infinite(); + } + let mut seq = Seq::empty(); + for r in cls.iter() { + for b in r.start()..=r.end() { + seq.push(Literal::from(b)); + } + } + self.enforce_literal_len(&mut seq); + seq + } + + /// Returns true if the given Unicode class exceeds the configured limits + /// on this extractor. 
+    fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool {
+        let mut count = 0;
+        for r in cls.iter() {
+            if count > self.limit_class {
+                return true;
+            }
+            count += r.len();
+        }
+        count > self.limit_class
+    }
+
+    /// Returns true if the given byte class exceeds the configured limits on
+    /// this extractor.
+    fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool {
+        let mut count = 0;
+        for r in cls.iter() {
+            if count > self.limit_class {
+                return true;
+            }
+            count += r.len();
+        }
+        count > self.limit_class
+    }
+
+    /// Compute the cross product of the two sequences if the result would be
+    /// within configured limits. Otherwise, make `seq2` infinite and cross the
+    /// infinite sequence with `seq1`.
+    fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
+        if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total)
+        {
+            seq2.make_infinite();
+        }
+        if let ExtractKind::Suffix = self.kind {
+            seq1.cross_reverse(seq2);
+        } else {
+            seq1.cross_forward(seq2);
+        }
+        assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
+        self.enforce_literal_len(&mut seq1);
+        seq1
+    }
+
+    /// Union the two sequences if the result would be within configured
+    /// limits. Otherwise, make `seq2` infinite and union the infinite sequence
+    /// with `seq1`.
+    fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
+        if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total)
+        {
+            // We try to trim our literal sequences to see if we can make
+            // room for more literals. The idea is that we'd rather trim down
+            // literals already in our sequence if it means we can add a few
+            // more and retain a finite sequence. Otherwise, we'll union with
+            // an infinite sequence and that infects everything and effectively
+            // stops literal extraction in its tracks.
+            //
+            // Why do we keep 4 bytes here? Well, it's a bit of an abstraction
+            // leakage. 
Downstream, the literals may wind up getting fed to + // the Teddy algorithm, which supports searching literals up to + // length 4. So that's why we pick that number here. Arguably this + // should be a tuneable parameter, but it seems a little tricky to + // describe. And I'm still unsure if this is the right way to go + // about culling literal sequences. + match self.kind { + ExtractKind::Prefix => { + seq1.keep_first_bytes(4); + seq2.keep_first_bytes(4); + } + ExtractKind::Suffix => { + seq1.keep_last_bytes(4); + seq2.keep_last_bytes(4); + } + } + seq1.dedup(); + seq2.dedup(); + if seq1 + .max_union_len(seq2) + .map_or(false, |len| len > self.limit_total) + { + seq2.make_infinite(); + } + } + seq1.union(seq2); + assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); + seq1 + } + + /// Applies the literal length limit to the given sequence. If none of the + /// literals in the sequence exceed the limit, then this is a no-op. + fn enforce_literal_len(&self, seq: &mut Seq) { + let len = self.limit_literal_len; + match self.kind { + ExtractKind::Prefix => seq.keep_first_bytes(len), + ExtractKind::Suffix => seq.keep_last_bytes(len), + } + } +} + +impl Default for Extractor { + fn default() -> Extractor { + Extractor::new() + } +} + +/// The kind of literals to extract from an [`Hir`] expression. +/// +/// The default extraction kind is `Prefix`. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum ExtractKind { + /// Extracts only prefix literals from a regex. + Prefix, + /// Extracts only suffix literals from a regex. + /// + /// Note that the sequence returned by suffix literals currently may + /// not correctly represent leftmost-first or "preference" order match + /// semantics. + Suffix, +} + +impl ExtractKind { + /// Returns true if this kind is the `Prefix` variant. + pub fn is_prefix(&self) -> bool { + matches!(*self, ExtractKind::Prefix) + } + + /// Returns true if this kind is the `Suffix` variant. 
+    pub fn is_suffix(&self) -> bool {
+        matches!(*self, ExtractKind::Suffix)
+    }
+}
+
+impl Default for ExtractKind {
+    fn default() -> ExtractKind {
+        ExtractKind::Prefix
+    }
+}
+
+/// A sequence of literals.
+///
+/// A `Seq` is very much like a set in that it represents a union of its
+/// members. That is, it corresponds to a set of literals where at least one
+/// must match in order for a particular [`Hir`] expression to match. (Whether
+/// this corresponds to the entire `Hir` expression, a prefix of it or a suffix
+/// of it depends on how the `Seq` was extracted from the `Hir`.)
+///
+/// It is also unlike a set in that multiple identical literals may appear,
+/// and that the order of the literals in the `Seq` matters. For example, if
+/// the sequence is `[sam, samwise]` and leftmost-first matching is used, then
+/// `samwise` can never match and the sequence is equivalent to `[sam]`.
+///
+/// # States of a sequence
+///
+/// A `Seq` has a few different logical states to consider:
+///
+/// * The sequence can represent "any" literal. When this happens, the set does
+/// not have a finite size. The purpose of this state is to inhibit callers
+/// from making assumptions about what literals are required in order to match
+/// a particular [`Hir`] expression. Generally speaking, when a set is in this
+/// state, literal optimizations are inhibited. A good example of a regex that
+/// will cause this sort of set to appear is `[A-Za-z]`. The character class
+/// is just too big (and also too narrow) to be usefully expanded into 52
+/// different literals. (Note that the decision for when a seq should become
+/// infinite is determined by the caller. A seq itself has no hard-coded
+/// limits.)
+/// * The sequence can be empty, in which case, it is an affirmative statement
+/// that there are no literals that can match the corresponding `Hir`.
+/// Consequently, the `Hir` never matches any input. For example, `[a&&b]`. 
+/// * The sequence can be non-empty, in which case, at least one of the +/// literals must match in order for the corresponding `Hir` to match. +/// +/// # Example +/// +/// This example shows how literal sequences can be simplified by stripping +/// suffixes and minimizing while maintaining preference order. +/// +/// ``` +/// use regex_syntax::hir::literal::{Literal, Seq}; +/// +/// let mut seq = Seq::new(&[ +/// "farm", +/// "appliance", +/// "faraway", +/// "apple", +/// "fare", +/// "gap", +/// "applicant", +/// "applaud", +/// ]); +/// seq.keep_first_bytes(3); +/// seq.minimize_by_preference(); +/// // Notice that 'far' comes before 'app', which matches the order in the +/// // original sequence. This guarantees that leftmost-first semantics are +/// // not altered by simplifying the set. +/// let expected = Seq::from_iter([ +/// Literal::inexact("far"), +/// Literal::inexact("app"), +/// Literal::exact("gap"), +/// ]); +/// assert_eq!(expected, seq); +/// ``` +#[derive(Clone, Eq, PartialEq)] +pub struct Seq { + /// The members of this seq. + /// + /// When `None`, the seq represents all possible literals. That is, it + /// prevents one from making assumptions about specific literals in the + /// seq, and forces one to treat it as if any literal might be in the seq. + /// + /// Note that `Some(vec![])` is valid and corresponds to the empty seq of + /// literals, i.e., a regex that can never match. For example, `[a&&b]`. + /// It is distinct from `Some(vec![""])`, which corresponds to the seq + /// containing an empty string, which matches at every position. + literals: Option>, +} + +impl Seq { + /// Returns an empty sequence. + /// + /// An empty sequence matches zero literals, and thus corresponds to a + /// regex that itself can never match. + #[inline] + pub fn empty() -> Seq { + Seq { literals: Some(vec![]) } + } + + /// Returns a sequence of literals without a finite size and may contain + /// any literal. 
+    ///
+    /// A sequence without finite size does not reveal anything about the
+    /// characteristics of the literals in its set. There are no fixed prefixes
+    /// or suffixes, nor are lower or upper bounds on the length of the literals
+    /// in the set known.
+    ///
+    /// This is useful to represent constructs in a regex that are "too big"
+    /// to usefully represent as a sequence of literals. For example,
+    /// `[A-Za-z]`.
+    /// When sequences get too big, they lose their discriminating nature and
+    /// are more likely to produce false positives, which in turn makes them
+    /// less likely to speed up searches.
+    ///
+    /// More pragmatically, for many regexes, enumerating all possible literals
+    /// is itself not possible or might otherwise use too many resources. So
+    /// constraining the size of sets during extraction is a practical trade
+    /// off to make.
+    #[inline]
+    pub fn infinite() -> Seq {
+        Seq { literals: None }
+    }
+
+    /// Returns a sequence containing a single literal.
+    #[inline]
+    pub fn singleton(lit: Literal) -> Seq {
+        Seq { literals: Some(vec![lit]) }
+    }
+
+    /// Returns a sequence of exact literals from the given byte strings.
+    #[inline]
+    pub fn new<I, B>(it: I) -> Seq
+    where
+        I: IntoIterator<Item = B>,
+        B: AsRef<[u8]>,
+    {
+        it.into_iter().map(|b| Literal::exact(b.as_ref())).collect()
+    }
+
+    /// If this is a finite sequence, return its members as a slice of
+    /// literals.
+    ///
+    /// The slice returned may be empty, in which case, there are no literals
+    /// that can match this sequence.
+    #[inline]
+    pub fn literals(&self) -> Option<&[Literal]> {
+        self.literals.as_deref()
+    }
+
+    /// Push a literal to the end of this sequence.
+    ///
+    /// If this sequence is not finite, then this is a no-op.
+    ///
+    /// Similarly, if the most recently added item of this sequence is
+    /// equivalent to the literal given, then it is not added. This reflects
+    /// a `Seq`'s "set like" behavior, and represents a practical trade off. 
+ /// Namely, there is never any need to have two adjacent and equivalent + /// literals in the same sequence, _and_ it is easy to detect in some + /// cases. + #[inline] + pub fn push(&mut self, lit: Literal) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + if lits.last().map_or(false, |m| m == &lit) { + return; + } + lits.push(lit); + } + + /// Make all of the literals in this sequence inexact. + /// + /// This is a no-op if this sequence is not finite. + #[inline] + pub fn make_inexact(&mut self) { + let lits = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + for lit in lits.iter_mut() { + lit.make_inexact(); + } + } + + /// Converts this sequence to an infinite sequence. + /// + /// This is a no-op if the sequence is already infinite. + #[inline] + pub fn make_infinite(&mut self) { + self.literals = None; + } + + /// Modify this sequence to contain the cross product between it and the + /// sequence given. + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("fooquux"), + /// Literal::exact("foobaz"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite prefixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior of this sequence is infinite. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_forward(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_forward(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + let newcap = lits1.len().saturating_mul(lits2.len()); + for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) { + if !selflit.is_exact() { + lits1.push(selflit); + continue; + } + for otherlit in lits2.iter() { + let mut newlit = Literal::exact(Vec::with_capacity( + selflit.len() + otherlit.len(), + )); + newlit.extend(&selflit); + newlit.extend(&otherlit); + if !otherlit.is_exact() { + newlit.make_inexact(); + } + lits1.push(newlit); + } + } + lits2.drain(..); + self.dedup(); + } + + /// Modify this sequence to contain the cross product between it and + /// the sequence given, where the sequences are treated as suffixes + /// instead of prefixes. Namely, the sequence `other` is *prepended* + /// to `self` (as opposed to `other` being *appended* to `self` in + /// [`Seq::cross_forward`]). + /// + /// The cross product only considers literals in this sequence that are + /// exact. That is, inexact literals are not extended. + /// + /// The literals are always drained from `other`, even if none are used. + /// This permits callers to reuse the sequence allocation elsewhere. + /// + /// If this sequence is infinite, then this is a no-op, regardless of what + /// `other` contains (and in this case, the literals are still drained from + /// `other`). 
If `other` is infinite and this sequence is finite, then this + /// is a no-op, unless this sequence contains a zero-length literal. In + /// which case, the infiniteness of `other` infects this sequence, and this + /// sequence is itself made infinite. + /// + /// Like [`Seq::union`], this may attempt to deduplicate literals. See + /// [`Seq::dedup`] for how deduplication deals with exact and inexact + /// literals. + /// + /// # Example + /// + /// This example shows basic usage and how exact and inexact literals + /// interact. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::from_iter([ + /// Literal::inexact("quux"), + /// Literal::exact("baz"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// let expected = Seq::from_iter([ + /// Literal::inexact("quuxfoo"), + /// Literal::inexact("bar"), + /// Literal::exact("bazfoo"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example shows the behavior of when `other` is an infinite + /// sequence. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // When seq2 is infinite, cross product doesn't add anything, but + /// // ensures all members of seq1 are inexact. + /// let expected = Seq::from_iter([ + /// Literal::inexact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// assert_eq!(expected, seq1); + /// ``` + /// + /// This example is like the one above, but shows what happens when this + /// sequence contains an empty string. 
In this case, an infinite `other` + /// sequence infects this sequence (because the empty string means that + /// there are no finite suffixes): + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact(""), // inexact provokes same behavior + /// Literal::inexact("bar"), + /// ]); + /// let mut seq2 = Seq::infinite(); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 is now infinite! + /// assert!(!seq1.is_finite()); + /// ``` + /// + /// This example shows the behavior when this sequence is infinite. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq1 = Seq::infinite(); + /// let mut seq2 = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("bar"), + /// ]); + /// seq1.cross_reverse(&mut seq2); + /// + /// // seq1 remains unchanged. + /// assert!(!seq1.is_finite()); + /// // Even though the literals in seq2 weren't used, it was still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn cross_reverse(&mut self, other: &mut Seq) { + let (lits1, lits2) = match self.cross_preamble(other) { + None => return, + Some((lits1, lits2)) => (lits1, lits2), + }; + // We basically proceed as we do in 'cross_forward' at this point, + // except that the outer loop is now 'other' and the inner loop is now + // 'self'. That's because 'self' corresponds to suffixes and 'other' + // corresponds to the sequence we want to *prepend* to the suffixes. + let newcap = lits1.len().saturating_mul(lits2.len()); + let selflits = mem::replace(lits1, Vec::with_capacity(newcap)); + for (i, otherlit) in lits2.drain(..).enumerate() { + for selflit in selflits.iter() { + if !selflit.is_exact() { + // If the suffix isn't exact, then we can't prepend + // anything to it. However, we still want to keep it. But + // we only want to keep one of them, to avoid duplication. 
+ // (The duplication is okay from a correctness perspective,
+ // but wasteful.)
+ if i == 0 {
+ lits1.push(selflit.clone());
+ }
+ continue;
+ }
+ let mut newlit = Literal::exact(Vec::with_capacity(
+ otherlit.len() + selflit.len(),
+ ));
+ newlit.extend(&otherlit);
+ newlit.extend(&selflit);
+ if !otherlit.is_exact() {
+ newlit.make_inexact();
+ }
+ lits1.push(newlit);
+ }
+ }
+ self.dedup();
+ }
+
+ /// A helper function that corresponds to the subtle preamble for both
+ /// `cross_forward` and `cross_reverse`. In effect, it handles the cases
+ /// of infinite sequences for both `self` and `other`, as well as ensuring
+ /// that literals from `other` are drained even if they aren't used.
+ fn cross_preamble<'a>(
+ &'a mut self,
+ other: &'a mut Seq,
+ ) -> Option<(&'a mut Vec<Literal>, &'a mut Vec<Literal>)> {
+ let lits2 = match other.literals {
+ None => {
+ // If our current seq contains the empty string and the seq
+ // we're adding matches any literal, then it follows that the
+ // current seq must now also match any literal.
+ //
+ // Otherwise, we just have to make sure everything in this
+ // sequence is inexact.
+ if self.min_literal_len() == Some(0) {
+ *self = Seq::infinite();
+ } else {
+ self.make_inexact();
+ }
+ return None;
+ }
+ Some(ref mut lits) => lits,
+ };
+ let lits1 = match self.literals {
+ None => {
+ // If we aren't going to make it to the end of this routine
+ // where lits2 is drained, then we need to do it now.
+ lits2.drain(..);
+ return None;
+ }
+ Some(ref mut lits) => lits,
+ };
+ Some((lits1, lits2))
+ }
+
+ /// Unions the `other` sequence into this one.
+ ///
+ /// The literals are always drained out of the given `other` sequence,
+ /// even if they are being unioned into an infinite sequence. This permits
+ /// the caller to reuse the `other` sequence in another context.
+ ///
+ /// Some literal deduping may be performed. If any deduping happens,
+ /// any leftmost-first or "preference" order match semantics will be
+ /// preserved.
+ /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// + /// // Adjacent literals are deduped, but non-adjacent literals may not be. + /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::infinite(); + /// // Infinite sequences have no finite length. + /// assert_eq!(None, seq1.len()); + /// + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union(&mut seq2); + /// + /// // seq1 is still infinite and seq2 has been drained. + /// assert_eq!(None, seq1.len()); + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union(&mut self, other: &mut Seq) { + let lits2 = match other.literals { + None => { + // Unioning with an infinite sequence always results in an + // infinite sequence. + self.make_infinite(); + return; + } + Some(ref mut lits) => lits.drain(..), + }; + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + lits1.extend(lits2); + self.dedup(); + } + + /// Unions the `other` sequence into this one by splice the `other` + /// sequence at the position of the first zero-length literal. + /// + /// This is useful for preserving preference order semantics when combining + /// two literal sequences. For example, in the regex `(a||f)+foo`, the + /// correct preference order prefix sequence is `[a, foo, f]`. + /// + /// The literals are always drained out of the given `other` sequence, + /// even if they are being unioned into an infinite sequence. 
This permits + /// the caller to reuse the `other` sequence in another context. Note that + /// the literals are drained even if no union is performed as well, i.e., + /// when this sequence does not contain a zero-length literal. + /// + /// Some literal deduping may be performed. If any deduping happens, + /// any leftmost-first or "preference" order match semantics will be + /// preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["a", "", "f", ""]); + /// let mut seq2 = Seq::new(&["foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // The literals are pulled out of seq2. + /// assert_eq!(Some(0), seq2.len()); + /// // 'foo' gets spliced into seq1 where the first empty string occurs. + /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1); + /// ``` + /// + /// This example shows that literals are drained from `other` even when + /// they aren't necessarily used. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq1 = Seq::new(&["foo", "bar"]); + /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]); + /// seq1.union_into_empty(&mut seq2); + /// + /// // seq1 has no zero length literals, so no splicing happens. + /// assert_eq!(Seq::new(&["foo", "bar"]), seq1); + /// // Even though no splicing happens, seq2 is still drained. + /// assert_eq!(Some(0), seq2.len()); + /// ``` + #[inline] + pub fn union_into_empty(&mut self, other: &mut Seq) { + let lits2 = other.literals.as_mut().map(|lits| lits.drain(..)); + let lits1 = match self.literals { + None => return, + Some(ref mut lits) => lits, + }; + let first_empty = match lits1.iter().position(|m| m.is_empty()) { + None => return, + Some(i) => i, + }; + let lits2 = match lits2 { + None => { + // Note that we are only here if we've found an empty literal, + // which implies that an infinite sequence infects this seq and + // also turns it into an infinite sequence. 
+ self.literals = None; + return; + } + Some(lits) => lits, + }; + // Clearing out the empties needs to come before the splice because + // the splice might add more empties that we don't want to get rid + // of. Since we're splicing into the position of the first empty, the + // 'first_empty' position computed above is still correct. + lits1.retain(|m| !m.is_empty()); + lits1.splice(first_empty..first_empty, lits2); + self.dedup(); + } + + /// Deduplicate adjacent equivalent literals in this sequence. + /// + /// If adjacent literals are equivalent strings but one is exact and the + /// other inexact, the inexact literal is kept and the exact one is + /// removed. + /// + /// Deduping an infinite sequence is a no-op. + /// + /// # Example + /// + /// This example shows how literals that are duplicate byte strings but + /// are not equivalent with respect to exactness are resolved. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::inexact("foo"), + /// ]); + /// seq.dedup(); + /// + /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq); + /// ``` + #[inline] + pub fn dedup(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.dedup_by(|lit1, lit2| { + if lit1.as_bytes() != lit2.as_bytes() { + return false; + } + if lit1.is_exact() != lit2.is_exact() { + lit1.make_inexact(); + lit2.make_inexact(); + } + true + }); + } + } + + /// Sorts this sequence of literals lexicographically. + /// + /// Note that if, before sorting, if a literal that is a prefix of another + /// literal appears after it, then after sorting, the sequence will not + /// represent the same preference order match semantics. For example, + /// sorting the sequence `[samwise, sam]` yields the sequence `[sam, + /// samwise]`. Under preference order semantics, the latter sequence will + /// never match `samwise` where as the first sequence can. 
+ /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::new(&["foo", "quux", "bar"]); + /// seq.sort(); + /// + /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq); + /// ``` + #[inline] + pub fn sort(&mut self) { + if let Some(ref mut lits) = self.literals { + lits.sort(); + } + } + + /// Reverses all of the literals in this sequence. + /// + /// The order of the sequence itself is preserved. + /// + /// # Example + /// + /// This example shows basic usage. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let mut seq = Seq::new(&["oof", "rab"]); + /// seq.reverse_literals(); + /// assert_eq!(Seq::new(&["foo", "bar"]), seq); + /// ``` + #[inline] + pub fn reverse_literals(&mut self) { + if let Some(ref mut lits) = self.literals { + for lit in lits.iter_mut() { + lit.reverse(); + } + } + } + + /// Shrinks this seq to its minimal size while respecting the preference + /// order of its literals. + /// + /// While this routine will remove duplicate literals from this seq, it + /// will also remove literals that can never match in a leftmost-first or + /// "preference order" search. Similar to [`Seq::dedup`], if a literal is + /// deduped, then the one that remains is made inexact. + /// + /// This is a no-op on seqs that are empty or not finite. + /// + /// # Example + /// + /// This example shows the difference between `{sam, samwise}` and + /// `{samwise, sam}`. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // If 'sam' comes before 'samwise' and a preference order search is + /// // executed, then 'samwise' can never match. + /// let mut seq = Seq::new(&["sam", "samwise"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq); + /// + /// // But if they are reversed, then it's possible for 'samwise' to match + /// // since it is given higher preference. 
+ /// let mut seq = Seq::new(&["samwise", "sam"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::new(&["samwise", "sam"]), seq); + /// ``` + /// + /// This example shows that if an empty string is in this seq, then + /// anything that comes after it can never match. + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// // An empty string is a prefix of all strings, so it automatically + /// // inhibits any subsequent strings from matching. + /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// let expected = Seq::from_iter([ + /// Literal::exact("foo"), + /// Literal::exact("bar"), + /// Literal::inexact(""), + /// ]); + /// assert_eq!(expected, seq); + /// + /// // And of course, if it's at the beginning, then it makes it impossible + /// // for anything else to match. + /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]); + /// seq.minimize_by_preference(); + /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq); + /// ``` + #[inline] + pub fn minimize_by_preference(&mut self) { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, false); + } + } + + /// Trims all literals in this seq such that only the first `len` bytes + /// remain. If a literal has less than or equal to `len` bytes, then it + /// remains unchanged. Otherwise, it is trimmed and made inexact. 
+ /// + /// # Example + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::new(&["a", "foo", "quux"]); + /// seq.keep_first_bytes(2); + /// + /// let expected = Seq::from_iter([ + /// Literal::exact("a"), + /// Literal::inexact("fo"), + /// Literal::inexact("qu"), + /// ]); + /// assert_eq!(expected, seq); + /// ``` + #[inline] + pub fn keep_first_bytes(&mut self, len: usize) { + if let Some(ref mut lits) = self.literals { + for m in lits.iter_mut() { + m.keep_first_bytes(len); + } + } + } + + /// Trims all literals in this seq such that only the last `len` bytes + /// remain. If a literal has less than or equal to `len` bytes, then it + /// remains unchanged. Otherwise, it is trimmed and made inexact. + /// + /// # Example + /// + /// ``` + /// use regex_syntax::hir::literal::{Literal, Seq}; + /// + /// let mut seq = Seq::new(&["a", "foo", "quux"]); + /// seq.keep_last_bytes(2); + /// + /// let expected = Seq::from_iter([ + /// Literal::exact("a"), + /// Literal::inexact("oo"), + /// Literal::inexact("ux"), + /// ]); + /// assert_eq!(expected, seq); + /// ``` + #[inline] + pub fn keep_last_bytes(&mut self, len: usize) { + if let Some(ref mut lits) = self.literals { + for m in lits.iter_mut() { + m.keep_last_bytes(len); + } + } + } + + /// Returns true if this sequence is finite. + /// + /// When false, this sequence is infinite and must be treated as if it + /// contains every possible literal. + #[inline] + pub fn is_finite(&self) -> bool { + self.literals.is_some() + } + + /// Returns true if and only if this sequence is finite and empty. + /// + /// An empty sequence never matches anything. It can only be produced by + /// literal extraction when the corresponding regex itself cannot match. + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == Some(0) + } + + /// Returns the number of literals in this sequence if the sequence is + /// finite. If the sequence is infinite, then `None` is returned. 
+ #[inline]
+ pub fn len(&self) -> Option<usize> {
+ self.literals.as_ref().map(|lits| lits.len())
+ }
+
+ /// Returns true if and only if all literals in this sequence are exact.
+ ///
+ /// This returns false if the sequence is infinite.
+ #[inline]
+ pub fn is_exact(&self) -> bool {
+ self.literals().map_or(false, |lits| lits.iter().all(|x| x.is_exact()))
+ }
+
+ /// Returns true if and only if all literals in this sequence are inexact.
+ ///
+ /// This returns true if the sequence is infinite.
+ #[inline]
+ pub fn is_inexact(&self) -> bool {
+ self.literals().map_or(true, |lits| lits.iter().all(|x| !x.is_exact()))
+ }
+
+ /// Return the maximum length of the sequence that would result from
+ /// unioning `self` with `other`. If either set is infinite, then this
+ /// returns `None`.
+ #[inline]
+ fn max_union_len(&self, other: &Seq) -> Option<usize> {
+ let len1 = self.len()?;
+ let len2 = other.len()?;
+ Some(len1.saturating_add(len2))
+ }
+
+ /// Return the maximum length of the sequence that would result from the
+ /// cross product of `self` with `other`. If either set is infinite, then
+ /// this returns `None`.
+ #[inline]
+ fn max_cross_len(&self, other: &Seq) -> Option<usize> {
+ let len1 = self.len()?;
+ let len2 = other.len()?;
+ Some(len1.saturating_mul(len2))
+ }
+
+ /// Returns the length of the shortest literal in this sequence.
+ ///
+ /// If the sequence is infinite or empty, then this returns `None`.
+ #[inline]
+ pub fn min_literal_len(&self) -> Option<usize> {
+ self.literals.as_ref()?.iter().map(|x| x.len()).min()
+ }
+
+ /// Returns the length of the longest literal in this sequence.
+ ///
+ /// If the sequence is infinite or empty, then this returns `None`.
+ #[inline]
+ pub fn max_literal_len(&self) -> Option<usize> {
+ self.literals.as_ref()?.iter().map(|x| x.len()).max()
+ }
+
+ /// Returns the longest common prefix from this seq.
+ ///
+ /// If the seq matches any literal or otherwise contains no literals, then
+ /// there is no meaningful prefix and this returns `None`.
+ /// + /// # Example + /// + /// This shows some example seqs and their longest common prefix. + /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let seq = Seq::new(&["foo", "foobar", "fo"]); + /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&["foo", "foo"]); + /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&["foo", "bar"]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); + /// let seq = Seq::new(&[""]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); + /// + /// let seq = Seq::infinite(); + /// assert_eq!(None, seq.longest_common_prefix()); + /// let seq = Seq::empty(); + /// assert_eq!(None, seq.longest_common_prefix()); + /// ``` + #[inline] + pub fn longest_common_prefix(&self) -> Option<&[u8]> { + // If we match everything or match nothing, then there's no meaningful + // longest common prefix. + let lits = match self.literals { + None => return None, + Some(ref lits) => lits, + }; + if lits.len() == 0 { + return None; + } + let base = lits[0].as_bytes(); + let mut len = base.len(); + for m in lits.iter().skip(1) { + len = m + .as_bytes() + .iter() + .zip(base[..len].iter()) + .take_while(|&(a, b)| a == b) + .count(); + if len == 0 { + return Some(&[]); + } + } + Some(&base[..len]) + } + + /// Returns the longest common suffix from this seq. + /// + /// If the seq matches any literal or other contains no literals, then + /// there is no meaningful suffix and this returns `None`. + /// + /// # Example + /// + /// This shows some example seqs and their longest common suffix. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::Seq; + /// + /// let seq = Seq::new(&["oof", "raboof", "of"]); + /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&["foo", "foo"]); + /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&["foo", "bar"]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); + /// let seq = Seq::new(&[""]); + /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); + /// + /// let seq = Seq::infinite(); + /// assert_eq!(None, seq.longest_common_suffix()); + /// let seq = Seq::empty(); + /// assert_eq!(None, seq.longest_common_suffix()); + /// ``` + #[inline] + pub fn longest_common_suffix(&self) -> Option<&[u8]> { + // If we match everything or match nothing, then there's no meaningful + // longest common suffix. + let lits = match self.literals { + None => return None, + Some(ref lits) => lits, + }; + if lits.len() == 0 { + return None; + } + let base = lits[0].as_bytes(); + let mut len = base.len(); + for m in lits.iter().skip(1) { + len = m + .as_bytes() + .iter() + .rev() + .zip(base[base.len() - len..].iter().rev()) + .take_while(|&(a, b)| a == b) + .count(); + if len == 0 { + return Some(&[]); + } + } + Some(&base[base.len() - len..]) + } + + /// Optimizes this seq while treating its literals as prefixes and + /// respecting the preference order of its literals. + /// + /// The specific way "optimization" works is meant to be an implementation + /// detail, as it essentially represents a set of heuristics. The goal + /// that optimization tries to accomplish is to make the literals in this + /// set reflect inputs that will result in a more effective prefilter. + /// Principally by reducing the false positive rate of candidates found by + /// the literals in this sequence. That is, when a match of a literal is + /// found, we would like it to be a strong predictor of the overall match + /// of the regex. 
If it isn't, then much time will be spent starting and + /// stopping the prefilter search and attempting to confirm the match only + /// to have it fail. + /// + /// Some of those heuristics might be: + /// + /// * Identifying a common prefix from a larger sequence of literals, and + /// shrinking the sequence down to that single common prefix. + /// * Rejecting the sequence entirely if it is believed to result in very + /// high false positive rate. When this happens, the sequence is made + /// infinite. + /// * Shrinking the sequence to a smaller number of literals representing + /// prefixes, but not shrinking it so much as to make literals too short. + /// (A sequence with very short literals, of 1 or 2 bytes, will typically + /// result in a higher false positive rate.) + /// + /// Optimization should only be run once extraction is complete. Namely, + /// optimization may make assumptions that do not compose with other + /// operations in the middle of extraction. For example, optimization will + /// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation + /// is only valid if no other extraction will occur. If other extraction + /// may occur, then the correct transformation would be to `[I(sam)]`. + /// + /// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but + /// for suffixes. + /// + /// # Example + /// + /// This shows how optimization might transform a sequence. Note that + /// the specific behavior is not a documented guarantee. The heuristics + /// used are an implementation detail and may change over time in semver + /// compatible releases. 
+ /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert_eq!(Seq::from_iter([ + /// Literal::exact("samantha"), + /// // Kept exact even though 'samwise' got pruned + /// // because optimization assumes literal extraction + /// // has finished. + /// Literal::exact("sam"), + /// Literal::exact("frodo"), + /// ]), seq); + /// ``` + /// + /// # Example: optimization may make the sequence infinite + /// + /// If the heuristics deem that the sequence could cause a very high false + /// positive rate, then it may make the sequence infinite, effectively + /// disabling its use as a prefilter. + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// // An empty string matches at every position, + /// // thus rendering the prefilter completely + /// // ineffective. + /// "", + /// "sam", + /// "samwise", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(!seq.is_finite()); + /// ``` + /// + /// Do note that just because there is a `" "` in the sequence, that + /// doesn't mean the sequence will always be made infinite after it is + /// optimized. Namely, if the sequence is considered exact (any match + /// corresponds to an overall match of the original regex), then any match + /// is an overall match, and so the false positive rate is always `0`. + /// + /// To demonstrate this, we remove `samwise` from our sequence. This + /// results in no optimization happening and all literals remain exact. 
+ /// Thus the entire sequence is exact, and it is kept as-is, even though + /// one is an ASCII space: + /// + /// ``` + /// use regex_syntax::hir::literal::{Seq, Literal}; + /// + /// let mut seq = Seq::new(&[ + /// "samantha", + /// " ", + /// "sam", + /// "frodo", + /// ]); + /// seq.optimize_for_prefix_by_preference(); + /// assert!(seq.is_finite()); + /// ``` + #[inline] + pub fn optimize_for_prefix_by_preference(&mut self) { + self.optimize_by_preference(true); + } + + /// Optimizes this seq while treating its literals as suffixes and + /// respecting the preference order of its literals. + /// + /// Optimization should only be run once extraction is complete. + /// + /// The [`Seq::optimize_for_prefix_by_preference`] does the same thing, but + /// for prefixes. See its documentation for more explanation. + #[inline] + pub fn optimize_for_suffix_by_preference(&mut self) { + self.optimize_by_preference(false); + } + + fn optimize_by_preference(&mut self, prefix: bool) { + let origlen = match self.len() { + None => return, + Some(len) => len, + }; + // Make sure we start with the smallest sequence possible. We use a + // special version of preference minimization that retains exactness. + // This is legal because optimization is only expected to occur once + // extraction is complete. + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } + + // Look for a common prefix (or suffix). If we found one of those and + // it's long enough, then it's a good bet that it will be our fastest + // possible prefilter since single-substring search is so fast. + let fix = if prefix { + self.longest_common_prefix() + } else { + self.longest_common_suffix() + }; + if let Some(fix) = fix { + // As a special case, if we have a common prefix and the leading + // byte of that prefix is one that we think probably occurs rarely, + // then strip everything down to just that single byte. This should + // promote the use of memchr. 
+        //
+        // ... we only do this though if our sequence has more than one
+        // literal. Otherwise, we'd rather just stick with a single literal
+        // scan. That is, using memchr is probably better than looking
+        // for 2 or more literals, but probably not as good as a straight
+        // memmem search.
+        //
+        // ... and also only do this when the prefix is short and probably
+        // not too discriminatory anyway. If it's longer, then it's
+        // probably quite discriminatory and thus is likely to have a low
+        // false positive rate.
+        if prefix
+            && origlen > 1
+            && fix.len() >= 1
+            && fix.len() <= 3
+            && rank(fix[0]) < 200
+        {
+            self.keep_first_bytes(1);
+            self.dedup();
+            return;
+        }
+        // We only strip down to the common prefix/suffix if we think
+        // the existing set of literals isn't great, or if the common
+        // prefix/suffix is expected to be particularly discriminatory.
+        let isfast =
+            self.is_exact() && self.len().map_or(false, |len| len <= 16);
+        let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast);
+        if usefix {
+            // If we keep exactly the number of bytes equal to the length
+            // of the prefix (or suffix), then by the definition of a
+            // prefix, every literal in the sequence will be equivalent.
+            // Thus, 'dedup' will leave us with one literal.
+            //
+            // We do it this way to avoid an alloc, but also to make sure
+            // the exactness of literals is kept (or not).
+            if prefix {
+                self.keep_first_bytes(fix.len());
+            } else {
+                self.keep_last_bytes(fix.len());
+            }
+            self.dedup();
+            assert_eq!(Some(1), self.len());
+            // We still fall through here. In particular, we want our
+            // longest common prefix to be subject to the poison check.
+        }
+        // Everything below this check is more-or-less about trying to
+        // heuristically reduce the false positive rate of a prefilter. But
+        // if our sequence is completely exact, then it's possible the regex
+        // engine can be skipped entirely. In this case, the false positive
+        // rate is zero because every literal match corresponds to a regex
+        // match.
+        //
+        // This is OK even if the sequence contains a poison literal. Remember,
+        // a literal is only poisonous because of what we assume about its
+        // impact on the false positive rate. However, we do still check for
+        // an empty string. Empty strings are weird and it's best to let the
+        // regex engine handle those.
+        //
+        // We currently do this check after the longest common prefix (or
+        // suffix) check, under the theory that single-substring search is so
+        // fast that we want that even if we'd end up turning an exact sequence
+        // into an inexact one. But this might be wrong...
+        if self.is_exact()
+            && self.min_literal_len().map_or(false, |len| len > 0)
+        {
+            return;
+        }
+        // Now we attempt to shorten the sequence. The idea here is that we
+        // don't want to look for too many literals, but we want to shorten
+        // our sequence enough to improve our odds of using better algorithms
+        // downstream (such as Teddy).
+        const ATTEMPTS: [(usize, usize); 5] =
+            [(5, 64), (4, 64), (3, 64), (2, 64), (1, 10)];
+        for (keep, limit) in ATTEMPTS {
+            let len = match self.len() {
+                None => break,
+                Some(len) => len,
+            };
+            if len <= limit {
+                break;
+            }
+            if prefix {
+                self.keep_first_bytes(keep);
+            } else {
+                self.keep_last_bytes(keep);
+            }
+            self.minimize_by_preference();
+        }
+        // Check for a poison literal. A poison literal is one that is short
+        // and is believed to have a very high match count. These poisons
+        // generally lead to a prefilter with a very high false positive rate,
+        // and thus overall worse performance.
+        //
+        // We do this last because we could have gone from a non-poisonous
+        // sequence to a poisonous one. Perhaps we should add some code to
+        // prevent such transitions in the first place, but then again, we
+        // likely only made the transition in the first place if the sequence
+        // was itself huge. And huge sequences are themselves poisonous. So...
+        if let Some(lits) = self.literals() {
+            if lits.iter().any(|lit| lit.is_poisonous()) {
+                self.make_infinite();
+            }
+        }
+    }
+}
+
+impl core::fmt::Debug for Seq {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "Seq")?;
+        if let Some(lits) = self.literals() {
+            f.debug_list().entries(lits.iter()).finish()
+        } else {
+            write!(f, "[∅]")
+        }
+    }
+}
+
+impl FromIterator<Literal> for Seq {
+    fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> Seq {
+        let mut seq = Seq::empty();
+        for literal in it {
+            seq.push(literal);
+        }
+        seq
+    }
+}
+
+/// A single literal extracted from an [`Hir`] expression.
+///
+/// A literal is composed of two things:
+///
+/// * A sequence of bytes. No guarantees with respect to UTF-8 are provided.
+/// In particular, even if the regex a literal is extracted from is UTF-8, the
+/// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`]
+/// limit resulted in trimming a literal in a way that splits a codepoint.)
+/// * Whether the literal is "exact" or not. An "exact" literal means that it
+/// has not been trimmed, and may continue to be extended. If a literal is
+/// "exact" after visiting the entire `Hir` expression, then this implies that
+/// the literal leads to a match state. (Although it doesn't necessarily imply
+/// all occurrences of the literal correspond to a match of the regex, since
+/// literal extraction ignores look-around assertions.)
+#[derive(Clone, Eq, PartialEq, PartialOrd, Ord)]
+pub struct Literal {
+    bytes: Vec<u8>,
+    exact: bool,
+}
+
+impl Literal {
+    /// Returns a new exact literal containing the bytes given.
+    #[inline]
+    pub fn exact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
+        Literal { bytes: bytes.into(), exact: true }
+    }
+
+    /// Returns a new inexact literal containing the bytes given.
+    #[inline]
+    pub fn inexact<B: Into<Vec<u8>>>(bytes: B) -> Literal {
+        Literal { bytes: bytes.into(), exact: false }
+    }
+
+    /// Returns the bytes in this literal.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.bytes
+    }
+
+    /// Yields ownership of the bytes inside this literal.
+    ///
+    /// Note that this throws away whether the literal is "exact" or not.
+    #[inline]
+    pub fn into_bytes(self) -> Vec<u8> {
+        self.bytes
+    }
+
+    /// Returns the length of this literal in bytes.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.as_bytes().len()
+    }
+
+    /// Returns true if and only if this literal has zero bytes.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns true if and only if this literal is exact.
+    #[inline]
+    pub fn is_exact(&self) -> bool {
+        self.exact
+    }
+
+    /// Marks this literal as inexact.
+    ///
+    /// Inexact literals can never be extended. For example,
+    /// [`Seq::cross_forward`] will not extend inexact literals.
+    #[inline]
+    pub fn make_inexact(&mut self) {
+        self.exact = false;
+    }
+
+    /// Reverse the bytes in this literal.
+    #[inline]
+    pub fn reverse(&mut self) {
+        self.bytes.reverse();
+    }
+
+    /// Extend this literal with the literal given.
+    ///
+    /// If this literal is inexact, then this is a no-op.
+    #[inline]
+    pub fn extend(&mut self, lit: &Literal) {
+        if !self.is_exact() {
+            return;
+        }
+        self.bytes.extend_from_slice(&lit.bytes);
+    }
+
+    /// Trims this literal such that only the first `len` bytes remain. If
+    /// this literal has fewer than `len` bytes, then it remains unchanged.
+    /// Otherwise, the literal is marked as inexact.
+    #[inline]
+    pub fn keep_first_bytes(&mut self, len: usize) {
+        if len >= self.len() {
+            return;
+        }
+        self.make_inexact();
+        self.bytes.truncate(len);
+    }
+
+    /// Trims this literal such that only the last `len` bytes remain. If this
+    /// literal has fewer than `len` bytes, then it remains unchanged.
+    /// Otherwise, the literal is marked as inexact.
+    #[inline]
+    pub fn keep_last_bytes(&mut self, len: usize) {
+        if len >= self.len() {
+            return;
+        }
+        self.make_inexact();
+        self.bytes.drain(..self.len() - len);
+    }
+
+    /// Returns true if it is believed that this literal is likely to match
+    /// very frequently, and is thus not a good candidate for a prefilter.
+    fn is_poisonous(&self) -> bool {
+        self.is_empty() || (self.len() == 1 && rank(self.as_bytes()[0]) >= 250)
+    }
+}
+
+impl From<u8> for Literal {
+    fn from(byte: u8) -> Literal {
+        Literal::exact(vec![byte])
+    }
+}
+
+impl From<char> for Literal {
+    fn from(ch: char) -> Literal {
+        use alloc::string::ToString;
+        Literal::exact(ch.encode_utf8(&mut [0; 4]).to_string())
+    }
+}
+
+impl AsRef<[u8]> for Literal {
+    fn as_ref(&self) -> &[u8] {
+        self.as_bytes()
+    }
+}
+
+impl core::fmt::Debug for Literal {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let tag = if self.exact { "E" } else { "I" };
+        f.debug_tuple(tag)
+            .field(&crate::debug::Bytes(self.as_bytes()))
+            .finish()
+    }
+}
+
+/// A "preference" trie that rejects literals that will never match when
+/// executing a leftmost first or "preference" search.
+///
+/// For example, if 'sam' is inserted, then trying to insert 'samwise' will be
+/// rejected because 'samwise' can never match since 'sam' will always take
+/// priority. However, if 'samwise' is inserted first, then inserting 'sam'
+/// after it is accepted. In this case, either 'samwise' or 'sam' can match in
+/// a "preference" search.
+///
+/// Note that we only use this trie as a "set." That is, given a sequence of
+/// literals, we insert each one in order. An `insert` will reject a literal
+/// if a prefix of that literal already exists in the trie. Thus, to rebuild
+/// the "minimal" sequence, we simply only keep literals that were successfully
+/// inserted. (Since we don't need traversal, one wonders whether we can make
+/// some simplifications here, but I haven't given it a ton of thought and I've
+/// never seen this show up on a profile. Because of the heuristic limits
+/// imposed on literal extractions, the size of the inputs here is usually
+/// very small.)
+#[derive(Debug, Default)]
+struct PreferenceTrie {
+    /// The states in this trie. The index of a state in this vector is its ID.
+    states: Vec<State>,
+    /// The index to allocate to the next literal added to this trie. Starts at
+    /// 0 and increments by 1 for every literal successfully added to the trie.
+    next_literal_index: usize,
+}
+
+/// A single state in a trie. Uses a sparse representation for its transitions.
+#[derive(Debug, Default)]
+struct State {
+    /// Sparse representation of the transitions out of this state. Transitions
+    /// are sorted by byte. There is at most one such transition for any
+    /// particular byte.
+    trans: Vec<(u8, usize)>,
+    /// Whether this is a matching state or not. If it is, then it contains the
+    /// index to the matching literal.
+    literal_index: Option<usize>,
+}
+
+impl PreferenceTrie {
+    /// Minimizes the given sequence of literals while preserving preference
+    /// order semantics.
+    ///
+    /// When `keep_exact` is true, the exactness of every literal retained is
+    /// kept. This is useful when dealing with a fully extracted `Seq` that
+    /// only contains exact literals. In that case, we can keep all retained
+    /// literals as exact because we know we'll never need to match anything
+    /// after them and because any removed literals are guaranteed to never
+    /// match.
+    fn minimize(literals: &mut Vec<Literal>, keep_exact: bool) {
+        use core::cell::RefCell;
+
+        // MSRV(1.61): Use retain_mut here to avoid interior mutability.
+        let trie = RefCell::new(PreferenceTrie::default());
+        let mut make_inexact = vec![];
+        literals.retain(|lit| {
+            match trie.borrow_mut().insert(lit.as_bytes()) {
+                Ok(_) => true,
+                Err(i) => {
+                    if !keep_exact {
+                        make_inexact.push(i);
+                    }
+                    false
+                }
+            }
+        });
+        for i in make_inexact {
+            literals[i].make_inexact();
+        }
+    }
+
+    /// Returns `Ok` if the given byte string is accepted into this trie and
+    /// `Err` otherwise. The index for the success case corresponds to the
+    /// index of the literal added. The index for the error case corresponds to
+    /// the index of the literal already in the trie that prevented the given
+    /// byte string from being added. (Which implies it is a prefix of the one
+    /// given.)
+    ///
+    /// In short, the byte string given is accepted into the trie if and only
+    /// if it is possible for it to match when executing a preference order
+    /// search.
+    fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> {
+        let mut prev = self.root();
+        if let Some(idx) = self.states[prev].literal_index {
+            return Err(idx);
+        }
+        for &b in bytes.iter() {
+            match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) {
+                Ok(i) => {
+                    prev = self.states[prev].trans[i].1;
+                    if let Some(idx) = self.states[prev].literal_index {
+                        return Err(idx);
+                    }
+                }
+                Err(i) => {
+                    let next = self.create_state();
+                    self.states[prev].trans.insert(i, (b, next));
+                    prev = next;
+                }
+            }
+        }
+        let idx = self.next_literal_index;
+        self.next_literal_index += 1;
+        self.states[prev].literal_index = Some(idx);
+        Ok(idx)
+    }
+
+    /// Returns the root state ID, and if it doesn't exist, creates it.
+    fn root(&mut self) -> usize {
+        if !self.states.is_empty() {
+            0
+        } else {
+            self.create_state()
+        }
+    }
+
+    /// Creates a new empty state and returns its ID.
+    fn create_state(&mut self) -> usize {
+        let id = self.states.len();
+        self.states.push(State::default());
+        id
+    }
+}
+
+/// Returns the "rank" of the given byte.
+/// +/// The minimum rank value is `0` and the maximum rank value is `255`. +/// +/// The rank of a byte is derived from a heuristic background distribution of +/// relative frequencies of bytes. The heuristic says that lower the rank of a +/// byte, the less likely that byte is to appear in any arbitrary haystack. +pub fn rank(byte: u8) -> u8 { + crate::rank::BYTE_FREQUENCIES[usize::from(byte)] +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse(pattern: &str) -> Hir { + crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap() + } + + fn prefixes(pattern: &str) -> Seq { + Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + #[allow(non_snake_case)] + fn E(x: &str) -> Literal { + Literal::exact(x.as_bytes()) + } + + #[allow(non_snake_case)] + fn I(x: &str) -> Literal { + Literal::inexact(x.as_bytes()) + } + + fn seq>(it: I) -> Seq { + Seq::from_iter(it) + } + + fn infinite() -> (Seq, Seq) { + (Seq::infinite(), Seq::infinite()) + } + + fn inexact(it1: I1, it2: I2) -> (Seq, Seq) + where + I1: IntoIterator, + I2: IntoIterator, + { + (Seq::from_iter(it1), Seq::from_iter(it2)) + } + + fn exact, I: IntoIterator>(it: I) -> (Seq, Seq) { + let s1 = Seq::new(it); + let s2 = s1.clone(); + (s1, s2) + } + + fn opt, I: IntoIterator>(it: I) -> (Seq, Seq) { + let (mut p, mut s) = exact(it); + p.optimize_for_prefix_by_preference(); + s.optimize_for_suffix_by_preference(); + (p, s) + } + + #[test] + fn literal() { + assert_eq!(exact(["a"]), e("a")); + assert_eq!(exact(["aaaaa"]), e("aaaaa")); + assert_eq!(exact(["A", "a"]), e("(?i-u)a")); + assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab")); + assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c")); + + assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)")); + + #[cfg(feature = "unicode-case")] + 
{ + assert_eq!(exact(["☃"]), e("☃")); + assert_eq!(exact(["☃"]), e("(?i)☃")); + assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃")); + + assert_eq!(exact(["Δ"]), e("Δ")); + assert_eq!(exact(["δ"]), e("δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ")); + assert_eq!(exact(["Δ", "δ"]), e("(?i)δ")); + + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s")); + assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ")); + } + + let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ"; + assert_eq!(exact([letters]), e(letters)); + } + + #[test] + fn class() { + assert_eq!(exact(["a", "b", "c"]), e("[abc]")); + assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b")); + assert_eq!(exact(["δ", "ε"]), e("[εδ]")); + #[cfg(feature = "unicode-case")] + { + assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]")); + } + } + + #[test] + fn look() { + assert_eq!(exact(["ab"]), e(r"a\Ab")); + assert_eq!(exact(["ab"]), e(r"a\zb")); + assert_eq!(exact(["ab"]), e(r"a(?m:^)b")); + assert_eq!(exact(["ab"]), e(r"a(?m:$)b")); + assert_eq!(exact(["ab"]), e(r"a\bb")); + assert_eq!(exact(["ab"]), e(r"a\Bb")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b")); + assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b")); + + assert_eq!(exact(["ab"]), e(r"^ab")); + assert_eq!(exact(["ab"]), e(r"$ab")); + assert_eq!(exact(["ab"]), e(r"(?m:^)ab")); + assert_eq!(exact(["ab"]), e(r"(?m:$)ab")); + assert_eq!(exact(["ab"]), e(r"\bab")); + assert_eq!(exact(["ab"]), e(r"\Bab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab")); + assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab")); + + assert_eq!(exact(["ab"]), e(r"ab^")); + assert_eq!(exact(["ab"]), e(r"ab$")); + assert_eq!(exact(["ab"]), e(r"ab(?m:^)")); + assert_eq!(exact(["ab"]), e(r"ab(?m:$)")); + assert_eq!(exact(["ab"]), e(r"ab\b")); + assert_eq!(exact(["ab"]), e(r"ab\B")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)")); + assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)")); + + let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")])); + 
assert_eq!(expected, e(r"^aZ*b")); + } + + #[test] + fn repetition() { + assert_eq!(exact(["a", ""]), e(r"a?")); + assert_eq!(exact(["", "a"]), e(r"a??")); + assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*")); + assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"a+")); + assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+")); + + assert_eq!(exact(["ab"]), e(r"aZ{0}b")); + assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b")); + assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b")); + assert_eq!( + inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]), + e(r"aZ*b") + ); + assert_eq!( + inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]), + e(r"aZ*?b") + ); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b")); + assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b")); + + assert_eq!(exact(["aZZb"]), e(r"aZ{2}b")); + assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b")); + + assert_eq!(exact(["abc", ""]), e(r"(abc)?")); + assert_eq!(exact(["", "abc"]), e(r"(abc)??")); + + assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b")); + assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b")); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+")); + + // FIXME: The suffixes for this don't look quite right to me. I think + // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I + // think is that suffixes are computed by iterating over concatenations + // in reverse, and then [bc, ac, c] ordering is indeed correct from + // that perspective. We also test a few more equivalent regexes, and + // we get the same result, so it is consistent at least I suppose. + // + // The reason why this isn't an issue is that it only messes up + // preference order, and currently, suffixes are never used in a + // context where preference order matters. 
For prefixes it matters + // because we sometimes want to use prefilters without confirmation + // when all of the literals are exact (and there's no look-around). But + // we never do that for suffixes. Any time we use suffixes, we always + // include a confirmation step. If that ever changes, then it's likely + // this bug will need to be fixed, but last time I looked, it appears + // hard to do so. + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"a*b*c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+)?(b+)?c") + ); + assert_eq!( + inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]), + e(r"(a+|)(b+|)c") + ); + // A few more similarish but not identical regexes. These may have a + // similar problem as above. + assert_eq!( + inexact( + [I("a"), I("b"), I("c"), E("")], + [I("c"), I("b"), I("a"), E("")] + ), + e(r"a*b*c*") + ); + assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+")); + assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c")); + assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*")); + assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*")); + assert_eq!( + inexact([I("ab"), E("ac")], [I("bc"), E("ac")]), + e(r"ab*c") + ); + assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+")); + assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c")); + + assert_eq!( + inexact([I("z"), E("azb")], [I("zazb"), E("azb")]), + e(r"z*azb") + ); + + let expected = + exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]); + assert_eq!(expected, e(r"[ab]{3}")); + let expected = inexact( + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + [ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb"), + ], + ); + assert_eq!(expected, e(r"[ab]{3,4}")); + } + + #[test] + fn concat() { + let empty: [&str; 0] = []; + + assert_eq!(exact(["abcxyz"]), 
e(r"abc()xyz")); + assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)")); + assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz")); + assert_eq!(exact(empty), e(r"abc[a&&b]xyz")); + assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz")); + } + + #[test] + fn alternation() { + assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz")); + assert_eq!( + inexact( + [E("abc"), I("mZ"), E("mo"), E("xyz")], + [E("abc"), I("Zo"), E("mo"), E("xyz")] + ), + e(r"abc|mZ*o|xyz") + ); + assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz")); + assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz")); + + assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa")); + assert_eq!( + inexact( + [I("aaa"), E(""), I("aaaaa"), E("aa")], + [I("aaa"), E(""), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*") + ); + assert_eq!( + inexact( + [E(""), I("aaa"), E("aa"), I("aaaaa")], + [E(""), I("aaa"), E("aa")] + ), + e(r"(?:|aa)(?:aaa)*?") + ); + + assert_eq!( + inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]), + e(r"a|b*") + ); + assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+")); + + assert_eq!( + inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]), + e(r"a*b|c") + ); + + assert_eq!( + inexact( + [E("a"), E("b"), I("c"), E("")], + [E("a"), E("b"), I("c"), E("")] + ), + e(r"a|(?:b|c*)") + ); + + assert_eq!( + inexact( + [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")], + [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")], + ), + e(r"(a|b)*c|(a|ab)*c") + ); + + assert_eq!( + exact(["abef", "abgh", "cdef", "cdgh"]), + e(r"(ab|cd)(ef|gh)") + ); + assert_eq!( + exact([ + "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl", + "cdghij", "cdghkl", + ]), + e(r"(ab|cd)(ef|gh)(ij|kl)") + ); + } + + #[test] + fn impossible() { + let empty: [&str; 0] = []; + + assert_eq!(exact(empty), e(r"[a&&b]")); + assert_eq!(exact(empty), e(r"a[a&&b]")); + assert_eq!(exact(empty), e(r"[a&&b]b")); + assert_eq!(exact(empty), e(r"a[a&&b]b")); + assert_eq!(exact(["a", "b"]), 
e(r"a|[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b")); + assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b")); + assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b")); + assert_eq!(exact([""]), e(r"[a&&b]*")); + assert_eq!(exact(["MN"]), e(r"M[a&&b]*N")); + } + + // This tests patterns that contain something that defeats literal + // detection, usually because it would blow some limit on the total number + // of literals that can be returned. + // + // The main idea is that when literal extraction sees something that + // it knows will blow a limit, it replaces it with a marker that says + // "any literal will match here." While not necessarily true, the + // over-estimation is just fine for the purposes of literal extraction, + // because the imprecision doesn't matter: too big is too big. + // + // This is one of the trickier parts of literal extraction, since we need + // to make sure all of our literal extraction operations correctly compose + // with the markers. + #[test] + fn anything() { + assert_eq!(infinite(), e(r".")); + assert_eq!(infinite(), e(r"(?s).")); + assert_eq!(infinite(), e(r"[A-Za-z]")); + assert_eq!(infinite(), e(r"[A-Z]")); + assert_eq!(exact([""]), e(r"[A-Z]{0}")); + assert_eq!(infinite(), e(r"[A-Z]?")); + assert_eq!(infinite(), e(r"[A-Z]*")); + assert_eq!(infinite(), e(r"[A-Z]+")); + assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]")); + assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2")); + assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123")); + assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+")); + assert_eq!(infinite(), e(r"1|[A-Z]|3")); + assert_eq!( + (seq([E("1"), I("2"), E("3")]), Seq::infinite()), + e(r"1|2[A-Z]|3"), + ); + assert_eq!( + (Seq::infinite(), seq([E("1"), I("2"), E("3")])), + e(r"1|[A-Z]2|3"), + ); + assert_eq!( + (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])), + e(r"1|2[A-Z]3|4"), + ); + assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2")); + 
assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z")); + } + + // Like the 'anything' test, but it uses smaller limits in order to test + // the logic for effectively aborting literal extraction when the seqs get + // too big. + #[test] + fn anything_small_limits() { + fn prefixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Prefix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn suffixes(pattern: &str) -> Seq { + Extractor::new() + .kind(ExtractKind::Suffix) + .limit_total(10) + .extract(&parse(pattern)) + } + + fn e(pattern: &str) -> (Seq, Seq) { + (prefixes(pattern), suffixes(pattern)) + } + + assert_eq!( + ( + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]), + seq([ + I("aaa"), + I("aab"), + I("aba"), + I("abb"), + I("baa"), + I("bab"), + I("bba"), + I("bbb") + ]) + ), + e(r"[ab]{3}{3}") + ); + + assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz")); + } + + #[test] + fn empty() { + assert_eq!(exact([""]), e(r"")); + assert_eq!(exact([""]), e(r"^")); + assert_eq!(exact([""]), e(r"$")); + assert_eq!(exact([""]), e(r"(?m:^)")); + assert_eq!(exact([""]), e(r"(?m:$)")); + assert_eq!(exact([""]), e(r"\b")); + assert_eq!(exact([""]), e(r"\B")); + assert_eq!(exact([""]), e(r"(?-u:\b)")); + assert_eq!(exact([""]), e(r"(?-u:\B)")); + } + + #[test] + fn odds_and_ends() { + assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a")); + assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a.")); + assert_eq!(infinite(), e(r"a|.")); + assert_eq!(infinite(), e(r".|a")); + + let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]"; + let expected = inexact( + ["Mo'am", "Moam", "Mu'am", "Muam"].map(I), + [ + "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi", + "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy", + "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi", + "zzafy", "zafi", "zafy", + ] + .map(I), + ); + assert_eq!(expected, 
e(pat)); + + assert_eq!( + (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()), + e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"), + ); + assert_eq!( + inexact([I("foo")], [I("quux")]), + e(r"foo[A-Z]+bar[A-Z]+quux") + ); + assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+")); + assert_eq!( + exact(["Sherlock Holmes"]), + e(r"(?m)^Sherlock Holmes|Sherlock Holmes$") + ); + + assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])")); + } + + // This tests a specific regex along with some heuristic steps to reduce + // the sequences extracted. This is meant to roughly correspond to the + // types of heuristics used to shrink literal sets in practice. (Shrinking + // is done because you want to balance "spend too much work looking for + // too many literals" and "spend too much work processing false positive + // matches from short literals.") + #[test] + #[cfg(feature = "unicode-case")] + fn holmes() { + let expected = inexact( + ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I), + [ + "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs", "meS", + "mes", + ] + .map(I), + ); + let (mut prefixes, mut suffixes) = e(r"(?i)Holmes"); + prefixes.keep_first_bytes(3); + suffixes.keep_last_bytes(3); + prefixes.minimize_by_preference(); + suffixes.minimize_by_preference(); + assert_eq!(expected, (prefixes, suffixes)); + } + + // This tests that we get some kind of literals extracted for a beefier + // alternation with case insensitive mode enabled. At one point during + // development, this returned nothing, and motivated some special case + // code in Extractor::union to try and trim down the literal sequences + // if the union would blow the limits set. 
+ #[test] + #[cfg(feature = "unicode-case")] + fn holmes_alt() { + let mut pre = + prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker"); + assert!(pre.len().unwrap() > 0); + pre.optimize_for_prefix_by_preference(); + assert!(pre.len().unwrap() > 0); + } + + // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8 + // See: CVE-2022-24713 + // + // We test this here to ensure literal extraction completes in reasonable + // time and isn't materially impacted by these sorts of pathological + // repeats. + #[test] + fn crazy_repeats() { + assert_eq!(inexact([I("")], [I("")]), e(r"(?:){4294967295}")); + assert_eq!( + inexact([I("")], [I("")]), + e(r"(?:){64}{64}{64}{64}{64}{64}") + ); + assert_eq!(inexact([I("")], [I("")]), e(r"x{0}{4294967295}")); + assert_eq!(inexact([I("")], [I("")]), e(r"(?:|){4294967295}")); + + assert_eq!( + inexact([E("")], [E("")]), + e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + let repa = "a".repeat(100); + assert_eq!( + inexact([I(&repa)], [I(&repa)]), + e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") + ); + } + + #[test] + fn huge() { + let pat = r#"(?-u) + 2(?: + [45]\d{3}| + 7(?: + 1[0-267]| + 2[0-289]| + 3[0-29]| + 4[01]| + 5[1-3]| + 6[013]| + 7[0178]| + 91 + )| + 8(?: + 0[125]| + [139][1-6]| + 2[0157-9]| + 41| + 6[1-35]| + 7[1-5]| + 8[1-8]| + 90 + )| + 9(?: + 0[0-2]| + 1[0-4]| + 2[568]| + 3[3-6]| + 5[5-7]| + 6[0167]| + 7[15]| + 8[0146-9] + ) + )\d{4}| + 3(?: + 12?[5-7]\d{2}| + 0(?: + 2(?: + [025-79]\d| + [348]\d{1,2} + )| + 3(?: + [2-4]\d| + [56]\d? + ) + )| + 2(?: + 1\d{2}| + 2(?: + [12]\d| + [35]\d{1,2}| + 4\d? 
+ ) + )| + 3(?: + 1\d{2}| + 2(?: + [2356]\d| + 4\d{1,2} + ) + )| + 4(?: + 1\d{2}| + 2(?: + 2\d{1,2}| + [47]| + 5\d{2} + ) + )| + 5(?: + 1\d{2}| + 29 + )| + [67]1\d{2}| + 8(?: + 1\d{2}| + 2(?: + 2\d{2}| + 3| + 4\d + ) + ) + )\d{3}| + 4(?: + 0(?: + 2(?: + [09]\d| + 7 + )| + 33\d{2} + )| + 1\d{3}| + 2(?: + 1\d{2}| + 2(?: + [25]\d?| + [348]\d| + [67]\d{1,2} + ) + )| + 3(?: + 1\d{2}(?: + \d{2} + )?| + 2(?: + [045]\d| + [236-9]\d{1,2} + )| + 32\d{2} + )| + 4(?: + [18]\d{2}| + 2(?: + [2-46]\d{2}| + 3 + )| + 5[25]\d{2} + )| + 5(?: + 1\d{2}| + 2(?: + 3\d| + 5 + ) + )| + 6(?: + [18]\d{2}| + 2(?: + 3(?: + \d{2} + )?| + [46]\d{1,2}| + 5\d{2}| + 7\d + )| + 5(?: + 3\d?| + 4\d| + [57]\d{1,2}| + 6\d{2}| + 8 + ) + )| + 71\d{2}| + 8(?: + [18]\d{2}| + 23\d{2}| + 54\d{2} + )| + 9(?: + [18]\d{2}| + 2[2-5]\d{2}| + 53\d{1,2} + ) + )\d{3}| + 5(?: + 02[03489]\d{2}| + 1\d{2}| + 2(?: + 1\d{2}| + 2(?: + 2(?: + \d{2} + )?| + [457]\d{2} + ) + )| + 3(?: + 1\d{2}| + 2(?: + [37](?: + \d{2} + )?| + [569]\d{2} + ) + )| + 4(?: + 1\d{2}| + 2[46]\d{2} + )| + 5(?: + 1\d{2}| + 26\d{1,2} + )| + 6(?: + [18]\d{2}| + 2| + 53\d{2} + )| + 7(?: + 1| + 24 + )\d{2}| + 8(?: + 1| + 26 + )\d{2}| + 91\d{2} + )\d{3}| + 6(?: + 0(?: + 1\d{2}| + 2(?: + 3\d{2}| + 4\d{1,2} + ) + )| + 2(?: + 2[2-5]\d{2}| + 5(?: + [3-5]\d{2}| + 7 + )| + 8\d{2} + )| + 3(?: + 1| + 2[3478] + )\d{2}| + 4(?: + 1| + 2[34] + )\d{2}| + 5(?: + 1| + 2[47] + )\d{2}| + 6(?: + [18]\d{2}| + 6(?: + 2(?: + 2\d| + [34]\d{2} + )| + 5(?: + [24]\d{2}| + 3\d| + 5\d{1,2} + ) + ) + )| + 72[2-5]\d{2}| + 8(?: + 1\d{2}| + 2[2-5]\d{2} + )| + 9(?: + 1\d{2}| + 2[2-6]\d{2} + ) + )\d{3}| + 7(?: + (?: + 02| + [3-589]1| + 6[12]| + 72[24] + )\d{2}| + 21\d{3}| + 32 + )\d{3}| + 8(?: + (?: + 4[12]| + [5-7]2| + 1\d? 
+ )| + (?: + 0| + 3[12]| + [5-7]1| + 217 + )\d + )\d{4}| + 9(?: + [35]1| + (?: + [024]2| + 81 + )\d| + (?: + 1| + [24]1 + )\d{2} + )\d{3} + "#; + // TODO: This is a good candidate of a seq of literals that could be + // shrunk quite a bit and still be very productive with respect to + // literal optimizations. + let (prefixes, suffixes) = e(pat); + assert!(!suffixes.is_finite()); + assert_eq!(Some(243), prefixes.len()); + } + + #[test] + fn optimize() { + // This gets a common prefix that isn't too short. + let (p, s) = + opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]); + assert_eq!(seq([I("foobar")]), p); + assert_eq!(seq([I("foobar")]), s); + + // This also finds a common prefix, but since it's only one byte, it + // prefers the multiple literals. + let (p, s) = opt(["abba", "akka", "abccba"]); + assert_eq!(exact(["abba", "akka", "abccba"]), (p, s)); + + let (p, s) = opt(["sam", "samwise"]); + assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s)); + + // The empty string is poisonous, so our seq becomes infinite, even + // though all literals are exact. + let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]); + assert!(!p.is_finite()); + assert!(!s.is_finite()); + + // A space is also poisonous, so our seq becomes infinite. But this + // only gets triggered when we don't have a completely exact sequence. + // When the sequence is exact, spaces are okay, since we presume that + // any prefilter will match a space more quickly than the regex engine. + // (When the sequence is exact, there's a chance of the prefilter being + // used without needing the regex engine at all.) 
+ let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]); + p.optimize_for_prefix_by_preference(); + assert!(!p.is_finite()); + } +} diff --git a/regex-syntax/src/hir/literal/mod.rs b/regex-syntax/src/hir/literal/mod.rs deleted file mode 100644 index fbc5d3c975..0000000000 --- a/regex-syntax/src/hir/literal/mod.rs +++ /dev/null @@ -1,1686 +0,0 @@ -/*! -Provides routines for extracting literal prefixes and suffixes from an `Hir`. -*/ - -use std::cmp; -use std::fmt; -use std::iter; -use std::mem; -use std::ops; - -use crate::hir::{self, Hir, HirKind}; - -/// A set of literal byte strings extracted from a regular expression. -/// -/// Every member of the set is a `Literal`, which is represented by a -/// `Vec`. (Notably, it may contain invalid UTF-8.) Every member is -/// said to be either *complete* or *cut*. A complete literal means that -/// it extends until the beginning (or end) of the regular expression. In -/// some circumstances, this can be used to indicate a match in the regular -/// expression. -/// -/// A key aspect of literal extraction is knowing when to stop. It is not -/// feasible to blindly extract all literals from a regular expression, even if -/// there are finitely many. For example, the regular expression `[0-9]{10}` -/// has `10^10` distinct literals. For this reason, literal extraction is -/// bounded to some low number by default using heuristics, but the limits can -/// be tweaked. -/// -/// **WARNING**: Literal extraction uses stack space proportional to the size -/// of the `Hir` expression. At some point, this drawback will be eliminated. -/// To protect yourself, set a reasonable -/// [`nest_limit` on your `Parser`](../../struct.ParserBuilder.html#method.nest_limit). -/// This is done for you by default. -#[derive(Clone, Eq, PartialEq)] -pub struct Literals { - lits: Vec, - limit_size: usize, - limit_class: usize, -} - -/// A single member of a set of literals extracted from a regular expression. 
-/// -/// This type has `Deref` and `DerefMut` impls to `Vec` so that all slice -/// and `Vec` operations are available. -#[derive(Clone, Eq, Ord)] -pub struct Literal { - v: Vec, - cut: bool, -} - -impl Literals { - /// Returns a new empty set of literals using default limits. - pub fn empty() -> Literals { - Literals { lits: vec![], limit_size: 250, limit_class: 10 } - } - - /// Returns a set of literal prefixes extracted from the given `Hir`. - pub fn prefixes(expr: &Hir) -> Literals { - let mut lits = Literals::empty(); - lits.union_prefixes(expr); - lits - } - - /// Returns a set of literal suffixes extracted from the given `Hir`. - pub fn suffixes(expr: &Hir) -> Literals { - let mut lits = Literals::empty(); - lits.union_suffixes(expr); - lits - } - - /// Get the approximate size limit (in bytes) of this set. - pub fn limit_size(&self) -> usize { - self.limit_size - } - - /// Set the approximate size limit (in bytes) of this set. - /// - /// If extracting a literal would put the set over this limit, then - /// extraction stops. - /// - /// The new limits will only apply to additions to this set. Existing - /// members remain unchanged, even if the set exceeds the new limit. - pub fn set_limit_size(&mut self, size: usize) -> &mut Literals { - self.limit_size = size; - self - } - - /// Get the character class size limit for this set. - pub fn limit_class(&self) -> usize { - self.limit_class - } - - /// Limits the size of character(or byte) classes considered. - /// - /// A value of `0` prevents all character classes from being considered. - /// - /// This limit also applies to case insensitive literals, since each - /// character in the case insensitive literal is converted to a class, and - /// then case folded. - /// - /// The new limits will only apply to additions to this set. Existing - /// members remain unchanged, even if the set exceeds the new limit. 
- pub fn set_limit_class(&mut self, size: usize) -> &mut Literals { - self.limit_class = size; - self - } - - /// Returns the set of literals as a slice. Its order is unspecified. - pub fn literals(&self) -> &[Literal] { - &self.lits - } - - /// Returns the length of the smallest literal. - /// - /// Returns None is there are no literals in the set. - pub fn min_len(&self) -> Option { - let mut min = None; - for lit in &self.lits { - match min { - None => min = Some(lit.len()), - Some(m) if lit.len() < m => min = Some(lit.len()), - _ => {} - } - } - min - } - - /// Returns true if all members in this set are complete. - pub fn all_complete(&self) -> bool { - !self.lits.is_empty() && self.lits.iter().all(|l| !l.is_cut()) - } - - /// Returns true if any member in this set is complete. - pub fn any_complete(&self) -> bool { - self.lits.iter().any(|lit| !lit.is_cut()) - } - - /// Returns true if this set contains an empty literal. - pub fn contains_empty(&self) -> bool { - self.lits.iter().any(|lit| lit.is_empty()) - } - - /// Returns true if this set is empty or if all of its members is empty. - pub fn is_empty(&self) -> bool { - self.lits.is_empty() || self.lits.iter().all(|lit| lit.is_empty()) - } - - /// Returns a new empty set of literals using this set's limits. - pub fn to_empty(&self) -> Literals { - let mut lits = Literals::empty(); - lits.set_limit_size(self.limit_size).set_limit_class(self.limit_class); - lits - } - - /// Returns the longest common prefix of all members in this set. - pub fn longest_common_prefix(&self) -> &[u8] { - if self.is_empty() { - return &[]; - } - let lit0 = &*self.lits[0]; - let mut len = lit0.len(); - for lit in &self.lits[1..] { - len = cmp::min( - len, - lit.iter().zip(lit0).take_while(|&(a, b)| a == b).count(), - ); - } - &self.lits[0][..len] - } - - /// Returns the longest common suffix of all members in this set. 
- pub fn longest_common_suffix(&self) -> &[u8] { - if self.is_empty() { - return &[]; - } - let lit0 = &*self.lits[0]; - let mut len = lit0.len(); - for lit in &self.lits[1..] { - len = cmp::min( - len, - lit.iter() - .rev() - .zip(lit0.iter().rev()) - .take_while(|&(a, b)| a == b) - .count(), - ); - } - &self.lits[0][self.lits[0].len() - len..] - } - - /// Returns a new set of literals with the given number of bytes trimmed - /// from the suffix of each literal. - /// - /// If any literal would be cut out completely by trimming, then None is - /// returned. - /// - /// Any duplicates that are created as a result of this transformation are - /// removed. - pub fn trim_suffix(&self, num_bytes: usize) -> Option { - if self.min_len().map(|len| len <= num_bytes).unwrap_or(true) { - return None; - } - let mut new = self.to_empty(); - for mut lit in self.lits.iter().cloned() { - let new_len = lit.len() - num_bytes; - lit.truncate(new_len); - lit.cut(); - new.lits.push(lit); - } - new.lits.sort(); - new.lits.dedup(); - Some(new) - } - - /// Returns a new set of prefixes of this set of literals that are - /// guaranteed to be unambiguous. - /// - /// Any substring match with a member of the set is returned is guaranteed - /// to never overlap with a substring match of another member of the set - /// at the same starting position. - /// - /// Given any two members of the returned set, neither is a substring of - /// the other. - pub fn unambiguous_prefixes(&self) -> Literals { - if self.lits.is_empty() { - return self.to_empty(); - } - let mut old = self.lits.to_vec(); - let mut new = self.to_empty(); - 'OUTER: while let Some(mut candidate) = old.pop() { - if candidate.is_empty() { - continue; - } - if new.lits.is_empty() { - new.lits.push(candidate); - continue; - } - for lit2 in &mut new.lits { - if lit2.is_empty() { - continue; - } - if &candidate == lit2 { - // If the literal is already in the set, then we can - // just drop it. 
But make sure that cut literals are - // infectious! - candidate.cut = candidate.cut || lit2.cut; - lit2.cut = candidate.cut; - continue 'OUTER; - } - if candidate.len() < lit2.len() { - if let Some(i) = position(&candidate, &lit2) { - candidate.cut(); - let mut lit3 = lit2.clone(); - lit3.truncate(i); - lit3.cut(); - old.push(lit3); - lit2.clear(); - } - } else if let Some(i) = position(&lit2, &candidate) { - lit2.cut(); - let mut new_candidate = candidate.clone(); - new_candidate.truncate(i); - new_candidate.cut(); - old.push(new_candidate); - candidate.clear(); - } - // Oops, the candidate is already represented in the set. - if candidate.is_empty() { - continue 'OUTER; - } - } - new.lits.push(candidate); - } - new.lits.retain(|lit| !lit.is_empty()); - new.lits.sort(); - new.lits.dedup(); - new - } - - /// Returns a new set of suffixes of this set of literals that are - /// guaranteed to be unambiguous. - /// - /// Any substring match with a member of the set is returned is guaranteed - /// to never overlap with a substring match of another member of the set - /// at the same ending position. - /// - /// Given any two members of the returned set, neither is a substring of - /// the other. - pub fn unambiguous_suffixes(&self) -> Literals { - // This is a touch wasteful... - let mut lits = self.clone(); - lits.reverse(); - let mut unamb = lits.unambiguous_prefixes(); - unamb.reverse(); - unamb - } - - /// Unions the prefixes from the given expression to this set. - /// - /// If prefixes could not be added (for example, this set would exceed its - /// size limits or the set of prefixes from `expr` includes the empty - /// string), then false is returned. - /// - /// Note that prefix literals extracted from `expr` are said to be complete - /// if and only if the literal extends from the beginning of `expr` to the - /// end of `expr`. 
- pub fn union_prefixes(&mut self, expr: &Hir) -> bool { - let mut lits = self.to_empty(); - prefixes(expr, &mut lits); - !lits.is_empty() && !lits.contains_empty() && self.union(lits) - } - - /// Unions the suffixes from the given expression to this set. - /// - /// If suffixes could not be added (for example, this set would exceed its - /// size limits or the set of suffixes from `expr` includes the empty - /// string), then false is returned. - /// - /// Note that prefix literals extracted from `expr` are said to be complete - /// if and only if the literal extends from the end of `expr` to the - /// beginning of `expr`. - pub fn union_suffixes(&mut self, expr: &Hir) -> bool { - let mut lits = self.to_empty(); - suffixes(expr, &mut lits); - lits.reverse(); - !lits.is_empty() && !lits.contains_empty() && self.union(lits) - } - - /// Unions this set with another set. - /// - /// If the union would cause the set to exceed its limits, then the union - /// is skipped and it returns false. Otherwise, if the union succeeds, it - /// returns true. - pub fn union(&mut self, lits: Literals) -> bool { - if self.num_bytes() + lits.num_bytes() > self.limit_size { - return false; - } - if lits.is_empty() { - self.lits.push(Literal::empty()); - } else { - self.lits.extend(lits.lits); - } - true - } - - /// Extends this set with another set. - /// - /// The set of literals is extended via a cross product. - /// - /// If a cross product would cause this set to exceed its limits, then the - /// cross product is skipped and it returns false. Otherwise, if the cross - /// product succeeds, it returns true. - pub fn cross_product(&mut self, lits: &Literals) -> bool { - if lits.is_empty() { - return true; - } - // Check that we make sure we stay in our limits. 
- let mut size_after; - if self.is_empty() || !self.any_complete() { - size_after = self.num_bytes(); - for lits_lit in lits.literals() { - size_after += lits_lit.len(); - } - } else { - size_after = self.lits.iter().fold(0, |accum, lit| { - accum + if lit.is_cut() { lit.len() } else { 0 } - }); - for lits_lit in lits.literals() { - for self_lit in self.literals() { - if !self_lit.is_cut() { - size_after += self_lit.len() + lits_lit.len(); - } - } - } - } - if size_after > self.limit_size { - return false; - } - - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for lits_lit in lits.literals() { - for mut self_lit in base.clone() { - self_lit.extend(&**lits_lit); - self_lit.cut = lits_lit.cut; - self.lits.push(self_lit); - } - } - true - } - - /// Extends each literal in this set with the bytes given. - /// - /// If the set is empty, then the given literal is added to the set. - /// - /// If adding any number of bytes to all members of this set causes a limit - /// to be exceeded, then no bytes are added and false is returned. If a - /// prefix of `bytes` can be fit into this set, then it is used and all - /// resulting literals are cut. - pub fn cross_add(&mut self, bytes: &[u8]) -> bool { - // N.B. This could be implemented by simply calling cross_product with - // a literal set containing just `bytes`, but we can be smarter about - // taking shorter prefixes of `bytes` if they'll fit. 
- if bytes.is_empty() { - return true; - } - if self.lits.is_empty() { - let i = cmp::min(self.limit_size, bytes.len()); - self.lits.push(Literal::new(bytes[..i].to_owned())); - self.lits[0].cut = i < bytes.len(); - return !self.lits[0].is_cut(); - } - let size = self.num_bytes(); - if size + self.lits.len() >= self.limit_size { - return false; - } - let mut i = 1; - while size + (i * self.lits.len()) <= self.limit_size - && i < bytes.len() - { - i += 1; - } - for lit in &mut self.lits { - if !lit.is_cut() { - lit.extend(&bytes[..i]); - if i < bytes.len() { - lit.cut(); - } - } - } - true - } - - /// Adds the given literal to this set. - /// - /// Returns false if adding this literal would cause the class to be too - /// big. - pub fn add(&mut self, lit: Literal) -> bool { - if self.num_bytes() + lit.len() > self.limit_size { - return false; - } - self.lits.push(lit); - true - } - - /// Extends each literal in this set with the character class given. - /// - /// Returns false if the character class was too big to add. - pub fn add_char_class(&mut self, cls: &hir::ClassUnicode) -> bool { - self._add_char_class(cls, false) - } - - /// Extends each literal in this set with the character class given, - /// writing the bytes of each character in reverse. - /// - /// Returns false if the character class was too big to add. 
- fn add_char_class_reverse(&mut self, cls: &hir::ClassUnicode) -> bool { - self._add_char_class(cls, true) - } - - fn _add_char_class( - &mut self, - cls: &hir::ClassUnicode, - reverse: bool, - ) -> bool { - use std::char; - - if self.class_exceeds_limits(cls_char_count(cls)) { - return false; - } - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for r in cls.iter() { - let (s, e) = (r.start as u32, r.end as u32 + 1); - for c in (s..e).filter_map(char::from_u32) { - for mut lit in base.clone() { - let mut bytes = c.to_string().into_bytes(); - if reverse { - bytes.reverse(); - } - lit.extend(&bytes); - self.lits.push(lit); - } - } - } - true - } - - /// Extends each literal in this set with the byte class given. - /// - /// Returns false if the byte class was too big to add. - pub fn add_byte_class(&mut self, cls: &hir::ClassBytes) -> bool { - if self.class_exceeds_limits(cls_byte_count(cls)) { - return false; - } - let mut base = self.remove_complete(); - if base.is_empty() { - base = vec![Literal::empty()]; - } - for r in cls.iter() { - let (s, e) = (r.start as u32, r.end as u32 + 1); - for b in (s..e).map(|b| b as u8) { - for mut lit in base.clone() { - lit.push(b); - self.lits.push(lit); - } - } - } - true - } - - /// Cuts every member of this set. When a member is cut, it can never - /// be extended. - pub fn cut(&mut self) { - for lit in &mut self.lits { - lit.cut(); - } - } - - /// Reverses all members in place. - pub fn reverse(&mut self) { - for lit in &mut self.lits { - lit.reverse(); - } - } - - /// Clears this set of all members. - pub fn clear(&mut self) { - self.lits.clear(); - } - - /// Pops all complete literals out of this set. - fn remove_complete(&mut self) -> Vec { - let mut base = vec![]; - for lit in mem::replace(&mut self.lits, vec![]) { - if lit.is_cut() { - self.lits.push(lit); - } else { - base.push(lit); - } - } - base - } - - /// Returns the total number of bytes in this set. 
- fn num_bytes(&self) -> usize { - self.lits.iter().fold(0, |accum, lit| accum + lit.len()) - } - - /// Returns true if a character class with the given size would cause this - /// set to exceed its limits. - /// - /// The size given should correspond to the number of items in the class. - fn class_exceeds_limits(&self, size: usize) -> bool { - if size > self.limit_class { - return true; - } - // This is an approximation since codepoints in a char class can encode - // to 1-4 bytes. - let new_byte_count = if self.lits.is_empty() { - size - } else { - self.lits.iter().fold(0, |accum, lit| { - accum - + if lit.is_cut() { - // If the literal is cut, then we'll never add - // anything to it, so don't count it. - 0 - } else { - (lit.len() + 1) * size - } - }) - }; - new_byte_count > self.limit_size - } -} - -fn prefixes(expr: &Hir, lits: &mut Literals) { - match *expr.kind() { - HirKind::Literal(hir::Literal::Unicode(c)) => { - let mut buf = [0; 4]; - lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); - } - HirKind::Literal(hir::Literal::Byte(b)) => { - lits.cross_add(&[b]); - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - if !lits.add_char_class(cls) { - lits.cut(); - } - } - HirKind::Class(hir::Class::Bytes(ref cls)) => { - if !lits.add_byte_class(cls) { - lits.cut(); - } - } - HirKind::Group(hir::Group { ref hir, .. 
}) => { - prefixes(&**hir, lits); - } - HirKind::Repetition(ref x) => match x.kind { - hir::RepetitionKind::ZeroOrOne => { - repeat_zero_or_one_literals(&x.hir, lits, prefixes); - } - hir::RepetitionKind::ZeroOrMore => { - repeat_zero_or_more_literals(&x.hir, lits, prefixes); - } - hir::RepetitionKind::OneOrMore => { - repeat_one_or_more_literals(&x.hir, lits, prefixes); - } - hir::RepetitionKind::Range(ref rng) => { - let (min, max) = match *rng { - hir::RepetitionRange::Exactly(m) => (m, Some(m)), - hir::RepetitionRange::AtLeast(m) => (m, None), - hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), - }; - repeat_range_literals( - &x.hir, min, max, x.greedy, lits, prefixes, - ) - } - }, - HirKind::Concat(ref es) if es.is_empty() => {} - HirKind::Concat(ref es) if es.len() == 1 => prefixes(&es[0], lits), - HirKind::Concat(ref es) => { - for e in es { - if let HirKind::Anchor(hir::Anchor::StartText) = *e.kind() { - if !lits.is_empty() { - lits.cut(); - break; - } - lits.add(Literal::empty()); - continue; - } - let mut lits2 = lits.to_empty(); - prefixes(e, &mut lits2); - if !lits.cross_product(&lits2) || !lits2.any_complete() { - // If this expression couldn't yield any literal that - // could be extended, then we need to quit. Since we're - // short-circuiting, we also need to freeze every member. 
- lits.cut(); - break; - } - } - } - HirKind::Alternation(ref es) => { - alternate_literals(es, lits, prefixes); - } - _ => lits.cut(), - } -} - -fn suffixes(expr: &Hir, lits: &mut Literals) { - match *expr.kind() { - HirKind::Literal(hir::Literal::Unicode(c)) => { - let mut buf = [0u8; 4]; - let i = c.encode_utf8(&mut buf).len(); - let buf = &mut buf[..i]; - buf.reverse(); - lits.cross_add(buf); - } - HirKind::Literal(hir::Literal::Byte(b)) => { - lits.cross_add(&[b]); - } - HirKind::Class(hir::Class::Unicode(ref cls)) => { - if !lits.add_char_class_reverse(cls) { - lits.cut(); - } - } - HirKind::Class(hir::Class::Bytes(ref cls)) => { - if !lits.add_byte_class(cls) { - lits.cut(); - } - } - HirKind::Group(hir::Group { ref hir, .. }) => { - suffixes(&**hir, lits); - } - HirKind::Repetition(ref x) => match x.kind { - hir::RepetitionKind::ZeroOrOne => { - repeat_zero_or_one_literals(&x.hir, lits, suffixes); - } - hir::RepetitionKind::ZeroOrMore => { - repeat_zero_or_more_literals(&x.hir, lits, suffixes); - } - hir::RepetitionKind::OneOrMore => { - repeat_one_or_more_literals(&x.hir, lits, suffixes); - } - hir::RepetitionKind::Range(ref rng) => { - let (min, max) = match *rng { - hir::RepetitionRange::Exactly(m) => (m, Some(m)), - hir::RepetitionRange::AtLeast(m) => (m, None), - hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), - }; - repeat_range_literals( - &x.hir, min, max, x.greedy, lits, suffixes, - ) - } - }, - HirKind::Concat(ref es) if es.is_empty() => {} - HirKind::Concat(ref es) if es.len() == 1 => suffixes(&es[0], lits), - HirKind::Concat(ref es) => { - for e in es.iter().rev() { - if let HirKind::Anchor(hir::Anchor::EndText) = *e.kind() { - if !lits.is_empty() { - lits.cut(); - break; - } - lits.add(Literal::empty()); - continue; - } - let mut lits2 = lits.to_empty(); - suffixes(e, &mut lits2); - if !lits.cross_product(&lits2) || !lits2.any_complete() { - // If this expression couldn't yield any literal that - // could be extended, then we need to quit. 
Since we're - // short-circuiting, we also need to freeze every member. - lits.cut(); - break; - } - } - } - HirKind::Alternation(ref es) => { - alternate_literals(es, lits, suffixes); - } - _ => lits.cut(), - } -} - -fn repeat_zero_or_one_literals( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - f( - &Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - // FIXME: Our literal extraction doesn't care about greediness. - // Which is partially why we're treating 'e?' as 'e*'. Namely, - // 'ab??' yields [Complete(ab), Complete(a)], but it should yield - // [Complete(a), Complete(ab)] because of the non-greediness. - greedy: true, - hir: Box::new(e.clone()), - }), - lits, - ); -} - -fn repeat_zero_or_more_literals( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - let (mut lits2, mut lits3) = (lits.clone(), lits.to_empty()); - lits3.set_limit_size(lits.limit_size() / 2); - f(e, &mut lits3); - - if lits3.is_empty() || !lits2.cross_product(&lits3) { - lits.cut(); - return; - } - lits2.cut(); - lits2.add(Literal::empty()); - if !lits.union(lits2) { - lits.cut(); - } -} - -fn repeat_one_or_more_literals( - e: &Hir, - lits: &mut Literals, - mut f: F, -) { - f(e, lits); - lits.cut(); -} - -fn repeat_range_literals( - e: &Hir, - min: u32, - max: Option, - greedy: bool, - lits: &mut Literals, - mut f: F, -) { - if min == 0 { - // This is a bit conservative. If `max` is set, then we could - // treat this as a finite set of alternations. For now, we - // just treat it as `e*`. 
- f( - &Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy, - hir: Box::new(e.clone()), - }), - lits, - ); - } else { - if min > 0 { - let n = cmp::min(lits.limit_size, min as usize); - let es = iter::repeat(e.clone()).take(n).collect(); - f(&Hir::concat(es), lits); - if n < min as usize || lits.contains_empty() { - lits.cut(); - } - } - if max.map_or(true, |max| min < max) { - lits.cut(); - } - } -} - -fn alternate_literals( - es: &[Hir], - lits: &mut Literals, - mut f: F, -) { - let mut lits2 = lits.to_empty(); - for e in es { - let mut lits3 = lits.to_empty(); - lits3.set_limit_size(lits.limit_size() / 5); - f(e, &mut lits3); - if lits3.is_empty() || !lits2.union(lits3) { - // If we couldn't find suffixes for *any* of the - // alternates, then the entire alternation has to be thrown - // away and any existing members must be frozen. Similarly, - // if the union couldn't complete, stop and freeze. - lits.cut(); - return; - } - } - if !lits.cross_product(&lits2) { - lits.cut(); - } -} - -impl fmt::Debug for Literals { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Literals") - .field("lits", &self.lits) - .field("limit_size", &self.limit_size) - .field("limit_class", &self.limit_class) - .finish() - } -} - -impl Literal { - /// Returns a new complete literal with the bytes given. - pub fn new(bytes: Vec) -> Literal { - Literal { v: bytes, cut: false } - } - - /// Returns a new complete empty literal. - pub fn empty() -> Literal { - Literal { v: vec![], cut: false } - } - - /// Returns true if this literal was "cut." - pub fn is_cut(&self) -> bool { - self.cut - } - - /// Cuts this literal. 
- pub fn cut(&mut self) { - self.cut = true; - } -} - -impl PartialEq for Literal { - fn eq(&self, other: &Literal) -> bool { - self.v == other.v - } -} - -impl PartialOrd for Literal { - fn partial_cmp(&self, other: &Literal) -> Option { - self.v.partial_cmp(&other.v) - } -} - -impl fmt::Debug for Literal { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.is_cut() { - write!(f, "Cut({})", escape_unicode(&self.v)) - } else { - write!(f, "Complete({})", escape_unicode(&self.v)) - } - } -} - -impl AsRef<[u8]> for Literal { - fn as_ref(&self) -> &[u8] { - &self.v - } -} - -impl ops::Deref for Literal { - type Target = Vec; - fn deref(&self) -> &Vec { - &self.v - } -} - -impl ops::DerefMut for Literal { - fn deref_mut(&mut self) -> &mut Vec { - &mut self.v - } -} - -fn position(needle: &[u8], mut haystack: &[u8]) -> Option { - let mut i = 0; - while haystack.len() >= needle.len() { - if needle == &haystack[..needle.len()] { - return Some(i); - } - i += 1; - haystack = &haystack[1..]; - } - None -} - -fn escape_unicode(bytes: &[u8]) -> String { - let show = match ::std::str::from_utf8(bytes) { - Ok(v) => v.to_string(), - Err(_) => escape_bytes(bytes), - }; - let mut space_escaped = String::new(); - for c in show.chars() { - if c.is_whitespace() { - let escaped = if c as u32 <= 0x7F { - escape_byte(c as u8) - } else if c as u32 <= 0xFFFF { - format!(r"\u{{{:04x}}}", c as u32) - } else { - format!(r"\U{{{:08x}}}", c as u32) - }; - space_escaped.push_str(&escaped); - } else { - space_escaped.push(c); - } - } - space_escaped -} - -fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s -} - -fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; - - let escaped: Vec = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() -} - -fn cls_char_count(cls: &hir::ClassUnicode) -> usize { - cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as 
u32)).sum::() - as usize -} - -fn cls_byte_count(cls: &hir::ClassBytes) -> usize { - cls.iter().map(|&r| 1 + (r.end as u32) - (r.start as u32)).sum::() - as usize -} - -#[cfg(test)] -mod tests { - use std::fmt; - - use super::{escape_bytes, Literal, Literals}; - use crate::hir::Hir; - use crate::ParserBuilder; - - // To make test failures easier to read. - #[derive(Debug, Eq, PartialEq)] - struct Bytes(Vec); - #[derive(Debug, Eq, PartialEq)] - struct Unicode(Vec); - - fn escape_lits(blits: &[Literal]) -> Vec { - let mut ulits = vec![]; - for blit in blits { - ulits - .push(ULiteral { v: escape_bytes(&blit), cut: blit.is_cut() }); - } - ulits - } - - fn create_lits>(it: I) -> Literals { - Literals { - lits: it.into_iter().collect(), - limit_size: 0, - limit_class: 0, - } - } - - // Needs to be pub for 1.3? - #[derive(Clone, Eq, PartialEq)] - pub struct ULiteral { - v: String, - cut: bool, - } - - impl ULiteral { - fn is_cut(&self) -> bool { - self.cut - } - } - - impl fmt::Debug for ULiteral { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - if self.is_cut() { - write!(f, "Cut({})", self.v) - } else { - write!(f, "Complete({})", self.v) - } - } - } - - impl PartialEq for ULiteral { - fn eq(&self, other: &Literal) -> bool { - self.v.as_bytes() == &*other.v && self.is_cut() == other.is_cut() - } - } - - impl PartialEq for Literal { - fn eq(&self, other: &ULiteral) -> bool { - &*self.v == other.v.as_bytes() && self.is_cut() == other.is_cut() - } - } - - #[allow(non_snake_case)] - fn C(s: &'static str) -> ULiteral { - ULiteral { v: s.to_owned(), cut: true } - } - #[allow(non_snake_case)] - fn M(s: &'static str) -> ULiteral { - ULiteral { v: s.to_owned(), cut: false } - } - - fn prefixes(lits: &mut Literals, expr: &Hir) { - lits.union_prefixes(expr); - } - - fn suffixes(lits: &mut Literals, expr: &Hir) { - lits.union_suffixes(expr); - } - - macro_rules! 
assert_lit_eq { - ($which:ident, $got_lits:expr, $($expected_lit:expr),*) => {{ - let expected: Vec = vec![$($expected_lit),*]; - let lits = $got_lits; - assert_eq!( - $which(expected.clone()), - $which(escape_lits(lits.literals()))); - assert_eq!( - !expected.is_empty() && expected.iter().all(|l| !l.is_cut()), - lits.all_complete()); - assert_eq!( - expected.iter().any(|l| !l.is_cut()), - lits.any_complete()); - }}; - } - - macro_rules! test_lit { - ($name:ident, $which:ident, $re:expr) => { - test_lit!($name, $which, $re,); - }; - ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { - #[test] - fn $name() { - let expr = ParserBuilder::new() - .build() - .parse($re) - .unwrap(); - let lits = Literals::$which(&expr); - assert_lit_eq!(Unicode, lits, $($lit),*); - - let expr = ParserBuilder::new() - .allow_invalid_utf8(true) - .unicode(false) - .build() - .parse($re) - .unwrap(); - let lits = Literals::$which(&expr); - assert_lit_eq!(Bytes, lits, $($lit),*); - } - }; - } - - // ************************************************************************ - // Tests for prefix literal extraction. - // ************************************************************************ - - // Elementary tests. 
- test_lit!(pfx_one_lit1, prefixes, "a", M("a")); - test_lit!(pfx_one_lit2, prefixes, "abc", M("abc")); - test_lit!(pfx_one_lit3, prefixes, "(?u)☃", M("\\xe2\\x98\\x83")); - #[cfg(feature = "unicode-case")] - test_lit!(pfx_one_lit4, prefixes, "(?ui)☃", M("\\xe2\\x98\\x83")); - test_lit!(pfx_class1, prefixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); - test_lit!( - pfx_class2, - prefixes, - "(?u)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x98\\x83") - ); - #[cfg(feature = "unicode-case")] - test_lit!( - pfx_class3, - prefixes, - "(?ui)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x85\\xb0"), - M("\\xe2\\x98\\x83") - ); - test_lit!(pfx_one_lit_casei1, prefixes, "(?i-u)a", M("A"), M("a")); - test_lit!( - pfx_one_lit_casei2, - prefixes, - "(?i-u)abc", - M("ABC"), - M("aBC"), - M("AbC"), - M("abC"), - M("ABc"), - M("aBc"), - M("Abc"), - M("abc") - ); - test_lit!(pfx_group1, prefixes, "(a)", M("a")); - test_lit!(pfx_rep_zero_or_one1, prefixes, "a?"); - test_lit!(pfx_rep_zero_or_one2, prefixes, "(?:abc)?"); - test_lit!(pfx_rep_zero_or_one_cat1, prefixes, "ab?", C("ab"), M("a")); - // FIXME: This should return [M("a"), M("ab")] because of the non-greedy - // repetition. As a work-around, we rewrite ab?? as ab*?, and thus we get - // a cut literal. 
- test_lit!(pfx_rep_zero_or_one_cat2, prefixes, "ab??", C("ab"), M("a")); - test_lit!(pfx_rep_zero_or_more1, prefixes, "a*"); - test_lit!(pfx_rep_zero_or_more2, prefixes, "(?:abc)*"); - test_lit!(pfx_rep_one_or_more1, prefixes, "a+", C("a")); - test_lit!(pfx_rep_one_or_more2, prefixes, "(?:abc)+", C("abc")); - test_lit!(pfx_rep_nested_one_or_more, prefixes, "(?:a+)+", C("a")); - test_lit!(pfx_rep_range1, prefixes, "a{0}"); - test_lit!(pfx_rep_range2, prefixes, "a{0,}"); - test_lit!(pfx_rep_range3, prefixes, "a{0,1}"); - test_lit!(pfx_rep_range4, prefixes, "a{1}", M("a")); - test_lit!(pfx_rep_range5, prefixes, "a{2}", M("aa")); - test_lit!(pfx_rep_range6, prefixes, "a{1,2}", C("a")); - test_lit!(pfx_rep_range7, prefixes, "a{2,3}", C("aa")); - - // Test regexes with concatenations. - test_lit!(pfx_cat1, prefixes, "(?:a)(?:b)", M("ab")); - test_lit!(pfx_cat2, prefixes, "[ab]z", M("az"), M("bz")); - test_lit!( - pfx_cat3, - prefixes, - "(?i-u)[ab]z", - M("AZ"), - M("BZ"), - M("aZ"), - M("bZ"), - M("Az"), - M("Bz"), - M("az"), - M("bz") - ); - test_lit!( - pfx_cat4, - prefixes, - "[ab][yz]", - M("ay"), - M("by"), - M("az"), - M("bz") - ); - test_lit!(pfx_cat5, prefixes, "a*b", C("a"), M("b")); - test_lit!(pfx_cat6, prefixes, "a*b*c", C("a"), C("b"), M("c")); - test_lit!(pfx_cat7, prefixes, "a*b*c+", C("a"), C("b"), C("c")); - test_lit!(pfx_cat8, prefixes, "a*b+c", C("a"), C("b")); - test_lit!(pfx_cat9, prefixes, "a*b+c*", C("a"), C("b")); - test_lit!(pfx_cat10, prefixes, "ab*", C("ab"), M("a")); - test_lit!(pfx_cat11, prefixes, "ab*c", C("ab"), M("ac")); - test_lit!(pfx_cat12, prefixes, "ab+", C("ab")); - test_lit!(pfx_cat13, prefixes, "ab+c", C("ab")); - test_lit!(pfx_cat14, prefixes, "a^", C("a")); - test_lit!(pfx_cat15, prefixes, "$a"); - test_lit!(pfx_cat16, prefixes, r"ab*c", C("ab"), M("ac")); - test_lit!(pfx_cat17, prefixes, r"ab+c", C("ab")); - test_lit!(pfx_cat18, prefixes, r"z*azb", C("z"), M("azb")); - test_lit!(pfx_cat19, prefixes, "a.z", C("a")); - - // 
Test regexes with alternations. - test_lit!(pfx_alt1, prefixes, "a|b", M("a"), M("b")); - test_lit!(pfx_alt2, prefixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); - test_lit!(pfx_alt3, prefixes, "y(?:a|b)z", M("yaz"), M("ybz")); - test_lit!(pfx_alt4, prefixes, "a|b*"); - test_lit!(pfx_alt5, prefixes, "a|b+", M("a"), C("b")); - test_lit!(pfx_alt6, prefixes, "a|(?:b|c*)"); - test_lit!( - pfx_alt7, - prefixes, - "(a|b)*c|(a|ab)*c", - C("a"), - C("b"), - M("c"), - C("a"), - C("ab"), - M("c") - ); - test_lit!(pfx_alt8, prefixes, "a*b|c", C("a"), M("b"), M("c")); - - // Test regexes with empty assertions. - test_lit!(pfx_empty1, prefixes, "^a", M("a")); - test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); - test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); - test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); - - // Make sure some curious regexes have no prefixes. - test_lit!(pfx_nothing1, prefixes, "."); - test_lit!(pfx_nothing2, prefixes, "(?s)."); - test_lit!(pfx_nothing3, prefixes, "^"); - test_lit!(pfx_nothing4, prefixes, "$"); - test_lit!(pfx_nothing6, prefixes, "(?m)$"); - test_lit!(pfx_nothing7, prefixes, r"\b"); - test_lit!(pfx_nothing8, prefixes, r"\B"); - - // Test a few regexes that defeat any prefix literal detection. 
- test_lit!(pfx_defeated1, prefixes, ".a"); - test_lit!(pfx_defeated2, prefixes, "(?s).a"); - test_lit!(pfx_defeated3, prefixes, "a*b*c*"); - test_lit!(pfx_defeated4, prefixes, "a|."); - test_lit!(pfx_defeated5, prefixes, ".|a"); - test_lit!(pfx_defeated6, prefixes, "a|^"); - test_lit!(pfx_defeated7, prefixes, ".(?:a(?:b)(?:c))"); - test_lit!(pfx_defeated8, prefixes, "$a"); - test_lit!(pfx_defeated9, prefixes, "(?m)$a"); - test_lit!(pfx_defeated10, prefixes, r"\ba"); - test_lit!(pfx_defeated11, prefixes, r"\Ba"); - test_lit!(pfx_defeated12, prefixes, "^*a"); - test_lit!(pfx_defeated13, prefixes, "^+a"); - - test_lit!( - pfx_crazy1, - prefixes, - r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]", - C("Mo\\'"), - C("Mu\\'"), - C("Moam"), - C("Muam") - ); - - // ************************************************************************ - // Tests for quiting prefix literal search. - // ************************************************************************ - - macro_rules! test_exhausted { - ($name:ident, $which:ident, $re:expr) => { - test_exhausted!($name, $which, $re,); - }; - ($name:ident, $which:ident, $re:expr, $($lit:expr),*) => { - #[test] - fn $name() { - let expr = ParserBuilder::new() - .build() - .parse($re) - .unwrap(); - let mut lits = Literals::empty(); - lits.set_limit_size(20).set_limit_class(10); - $which(&mut lits, &expr); - assert_lit_eq!(Unicode, lits, $($lit),*); - - let expr = ParserBuilder::new() - .allow_invalid_utf8(true) - .unicode(false) - .build() - .parse($re) - .unwrap(); - let mut lits = Literals::empty(); - lits.set_limit_size(20).set_limit_class(10); - $which(&mut lits, &expr); - assert_lit_eq!(Bytes, lits, $($lit),*); - } - }; - } - - // These test use a much lower limit than the default so that we can - // write test cases of reasonable size. 
- test_exhausted!(pfx_exhausted1, prefixes, "[a-z]"); - test_exhausted!(pfx_exhausted2, prefixes, "[a-z]*A"); - test_exhausted!(pfx_exhausted3, prefixes, "A[a-z]Z", C("A")); - test_exhausted!( - pfx_exhausted4, - prefixes, - "(?i-u)foobar", - C("FO"), - C("fO"), - C("Fo"), - C("fo") - ); - test_exhausted!( - pfx_exhausted5, - prefixes, - "(?:ab){100}", - C("abababababababababab") - ); - test_exhausted!( - pfx_exhausted6, - prefixes, - "(?:(?:ab){100})*cd", - C("ababababab"), - M("cd") - ); - test_exhausted!( - pfx_exhausted7, - prefixes, - "z(?:(?:ab){100})*cd", - C("zababababab"), - M("zcd") - ); - test_exhausted!( - pfx_exhausted8, - prefixes, - "aaaaaaaaaaaaaaaaaaaaz", - C("aaaaaaaaaaaaaaaaaaaa") - ); - - // ************************************************************************ - // Tests for suffix literal extraction. - // ************************************************************************ - - // Elementary tests. - test_lit!(sfx_one_lit1, suffixes, "a", M("a")); - test_lit!(sfx_one_lit2, suffixes, "abc", M("abc")); - test_lit!(sfx_one_lit3, suffixes, "(?u)☃", M("\\xe2\\x98\\x83")); - #[cfg(feature = "unicode-case")] - test_lit!(sfx_one_lit4, suffixes, "(?ui)☃", M("\\xe2\\x98\\x83")); - test_lit!(sfx_class1, suffixes, "[1-4]", M("1"), M("2"), M("3"), M("4")); - test_lit!( - sfx_class2, - suffixes, - "(?u)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x98\\x83") - ); - #[cfg(feature = "unicode-case")] - test_lit!( - sfx_class3, - suffixes, - "(?ui)[☃Ⅰ]", - M("\\xe2\\x85\\xa0"), - M("\\xe2\\x85\\xb0"), - M("\\xe2\\x98\\x83") - ); - test_lit!(sfx_one_lit_casei1, suffixes, "(?i-u)a", M("A"), M("a")); - test_lit!( - sfx_one_lit_casei2, - suffixes, - "(?i-u)abc", - M("ABC"), - M("ABc"), - M("AbC"), - M("Abc"), - M("aBC"), - M("aBc"), - M("abC"), - M("abc") - ); - test_lit!(sfx_group1, suffixes, "(a)", M("a")); - test_lit!(sfx_rep_zero_or_one1, suffixes, "a?"); - test_lit!(sfx_rep_zero_or_one2, suffixes, "(?:abc)?"); - test_lit!(sfx_rep_zero_or_more1, suffixes, 
"a*"); - test_lit!(sfx_rep_zero_or_more2, suffixes, "(?:abc)*"); - test_lit!(sfx_rep_one_or_more1, suffixes, "a+", C("a")); - test_lit!(sfx_rep_one_or_more2, suffixes, "(?:abc)+", C("abc")); - test_lit!(sfx_rep_nested_one_or_more, suffixes, "(?:a+)+", C("a")); - test_lit!(sfx_rep_range1, suffixes, "a{0}"); - test_lit!(sfx_rep_range2, suffixes, "a{0,}"); - test_lit!(sfx_rep_range3, suffixes, "a{0,1}"); - test_lit!(sfx_rep_range4, suffixes, "a{1}", M("a")); - test_lit!(sfx_rep_range5, suffixes, "a{2}", M("aa")); - test_lit!(sfx_rep_range6, suffixes, "a{1,2}", C("a")); - test_lit!(sfx_rep_range7, suffixes, "a{2,3}", C("aa")); - - // Test regexes with concatenations. - test_lit!(sfx_cat1, suffixes, "(?:a)(?:b)", M("ab")); - test_lit!(sfx_cat2, suffixes, "[ab]z", M("az"), M("bz")); - test_lit!( - sfx_cat3, - suffixes, - "(?i-u)[ab]z", - M("AZ"), - M("Az"), - M("BZ"), - M("Bz"), - M("aZ"), - M("az"), - M("bZ"), - M("bz") - ); - test_lit!( - sfx_cat4, - suffixes, - "[ab][yz]", - M("ay"), - M("az"), - M("by"), - M("bz") - ); - test_lit!(sfx_cat5, suffixes, "a*b", C("ab"), M("b")); - test_lit!(sfx_cat6, suffixes, "a*b*c", C("bc"), C("ac"), M("c")); - test_lit!(sfx_cat7, suffixes, "a*b*c+", C("c")); - test_lit!(sfx_cat8, suffixes, "a*b+c", C("bc")); - test_lit!(sfx_cat9, suffixes, "a*b+c*", C("c"), C("b")); - test_lit!(sfx_cat10, suffixes, "ab*", C("b"), M("a")); - test_lit!(sfx_cat11, suffixes, "ab*c", C("bc"), M("ac")); - test_lit!(sfx_cat12, suffixes, "ab+", C("b")); - test_lit!(sfx_cat13, suffixes, "ab+c", C("bc")); - test_lit!(sfx_cat14, suffixes, "a^"); - test_lit!(sfx_cat15, suffixes, "$a", C("a")); - test_lit!(sfx_cat16, suffixes, r"ab*c", C("bc"), M("ac")); - test_lit!(sfx_cat17, suffixes, r"ab+c", C("bc")); - test_lit!(sfx_cat18, suffixes, r"z*azb", C("zazb"), M("azb")); - test_lit!(sfx_cat19, suffixes, "a.z", C("z")); - - // Test regexes with alternations. 
- test_lit!(sfx_alt1, suffixes, "a|b", M("a"), M("b")); - test_lit!(sfx_alt2, suffixes, "[1-3]|b", M("1"), M("2"), M("3"), M("b")); - test_lit!(sfx_alt3, suffixes, "y(?:a|b)z", M("yaz"), M("ybz")); - test_lit!(sfx_alt4, suffixes, "a|b*"); - test_lit!(sfx_alt5, suffixes, "a|b+", M("a"), C("b")); - test_lit!(sfx_alt6, suffixes, "a|(?:b|c*)"); - test_lit!( - sfx_alt7, - suffixes, - "(a|b)*c|(a|ab)*c", - C("ac"), - C("bc"), - M("c"), - C("ac"), - C("abc"), - M("c") - ); - test_lit!(sfx_alt8, suffixes, "a*b|c", C("ab"), M("b"), M("c")); - - // Test regexes with empty assertions. - test_lit!(sfx_empty1, suffixes, "a$", M("a")); - test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); - - // Make sure some curious regexes have no suffixes. - test_lit!(sfx_nothing1, suffixes, "."); - test_lit!(sfx_nothing2, suffixes, "(?s)."); - test_lit!(sfx_nothing3, suffixes, "^"); - test_lit!(sfx_nothing4, suffixes, "$"); - test_lit!(sfx_nothing6, suffixes, "(?m)$"); - test_lit!(sfx_nothing7, suffixes, r"\b"); - test_lit!(sfx_nothing8, suffixes, r"\B"); - - // Test a few regexes that defeat any suffix literal detection. - test_lit!(sfx_defeated1, suffixes, "a."); - test_lit!(sfx_defeated2, suffixes, "(?s)a."); - test_lit!(sfx_defeated3, suffixes, "a*b*c*"); - test_lit!(sfx_defeated4, suffixes, "a|."); - test_lit!(sfx_defeated5, suffixes, ".|a"); - test_lit!(sfx_defeated6, suffixes, "a|^"); - test_lit!(sfx_defeated7, suffixes, "(?:a(?:b)(?:c))."); - test_lit!(sfx_defeated8, suffixes, "a^"); - test_lit!(sfx_defeated9, suffixes, "(?m)a$"); - test_lit!(sfx_defeated10, suffixes, r"a\b"); - test_lit!(sfx_defeated11, suffixes, r"a\B"); - test_lit!(sfx_defeated12, suffixes, "a^*"); - test_lit!(sfx_defeated13, suffixes, "a^+"); - - // These test use a much lower limit than the default so that we can - // write test cases of reasonable size. 
- test_exhausted!(sfx_exhausted1, suffixes, "[a-z]"); - test_exhausted!(sfx_exhausted2, suffixes, "A[a-z]*"); - test_exhausted!(sfx_exhausted3, suffixes, "A[a-z]Z", C("Z")); - test_exhausted!( - sfx_exhausted4, - suffixes, - "(?i-u)foobar", - C("AR"), - C("Ar"), - C("aR"), - C("ar") - ); - test_exhausted!( - sfx_exhausted5, - suffixes, - "(?:ab){100}", - C("abababababababababab") - ); - test_exhausted!( - sfx_exhausted6, - suffixes, - "cd(?:(?:ab){100})*", - C("ababababab"), - M("cd") - ); - test_exhausted!( - sfx_exhausted7, - suffixes, - "cd(?:(?:ab){100})*z", - C("abababababz"), - M("cdz") - ); - test_exhausted!( - sfx_exhausted8, - suffixes, - "zaaaaaaaaaaaaaaaaaaaa", - C("aaaaaaaaaaaaaaaaaaaa") - ); - - // ************************************************************************ - // Tests for generating unambiguous literal sets. - // ************************************************************************ - - macro_rules! test_unamb { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec = $given - .into_iter() - .map(|ul| { - let cut = ul.is_cut(); - Literal { v: ul.v.into_bytes(), cut: cut } - }) - .collect(); - let lits = create_lits(given); - let got = lits.unambiguous_prefixes(); - assert_eq!($expected, escape_lits(got.literals())); - } - }; - } - - test_unamb!(unambiguous1, vec![M("z"), M("azb")], vec![C("a"), C("z")]); - test_unamb!( - unambiguous2, - vec![M("zaaaaaa"), M("aa")], - vec![C("aa"), C("z")] - ); - test_unamb!( - unambiguous3, - vec![M("Sherlock"), M("Watson")], - vec![M("Sherlock"), M("Watson")] - ); - test_unamb!(unambiguous4, vec![M("abc"), M("bc")], vec![C("a"), C("bc")]); - test_unamb!(unambiguous5, vec![M("bc"), M("abc")], vec![C("a"), C("bc")]); - test_unamb!(unambiguous6, vec![M("a"), M("aa")], vec![C("a")]); - test_unamb!(unambiguous7, vec![M("aa"), M("a")], vec![C("a")]); - test_unamb!(unambiguous8, vec![M("ab"), M("a")], vec![C("a")]); - test_unamb!( - unambiguous9, - vec![M("ac"), M("bc"), 
M("c"), M("ac"), M("abc"), M("c")], - vec![C("a"), C("b"), C("c")] - ); - test_unamb!( - unambiguous10, - vec![M("Mo'"), M("Mu'"), M("Mo"), M("Mu")], - vec![C("Mo"), C("Mu")] - ); - test_unamb!( - unambiguous11, - vec![M("zazb"), M("azb")], - vec![C("a"), C("z")] - ); - test_unamb!(unambiguous12, vec![M("foo"), C("foo")], vec![C("foo")]); - test_unamb!( - unambiguous13, - vec![M("ABCX"), M("CDAX"), M("BCX")], - vec![C("A"), C("BCX"), C("CD")] - ); - test_unamb!( - unambiguous14, - vec![M("IMGX"), M("MVIX"), M("MGX"), M("DSX")], - vec![M("DSX"), C("I"), C("MGX"), C("MV")] - ); - test_unamb!( - unambiguous15, - vec![M("IMG_"), M("MG_"), M("CIMG")], - vec![C("C"), C("I"), C("MG_")] - ); - - // ************************************************************************ - // Tests for suffix trimming. - // ************************************************************************ - macro_rules! test_trim { - ($name:ident, $trim:expr, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec = $given - .into_iter() - .map(|ul| { - let cut = ul.is_cut(); - Literal { v: ul.v.into_bytes(), cut: cut } - }) - .collect(); - let lits = create_lits(given); - let got = lits.trim_suffix($trim).unwrap(); - assert_eq!($expected, escape_lits(got.literals())); - } - }; - } - - test_trim!(trim1, 1, vec![M("ab"), M("yz")], vec![C("a"), C("y")]); - test_trim!(trim2, 1, vec![M("abc"), M("abd")], vec![C("ab")]); - test_trim!(trim3, 2, vec![M("abc"), M("abd")], vec![C("a")]); - test_trim!(trim4, 2, vec![M("abc"), M("ghij")], vec![C("a"), C("gh")]); - - // ************************************************************************ - // Tests for longest common prefix. - // ************************************************************************ - - macro_rules! 
test_lcp { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec = $given - .into_iter() - .map(|s: &str| Literal { - v: s.to_owned().into_bytes(), - cut: false, - }) - .collect(); - let lits = create_lits(given); - let got = lits.longest_common_prefix(); - assert_eq!($expected, escape_bytes(got)); - } - }; - } - - test_lcp!(lcp1, vec!["a"], "a"); - test_lcp!(lcp2, vec![], ""); - test_lcp!(lcp3, vec!["a", "b"], ""); - test_lcp!(lcp4, vec!["ab", "ab"], "ab"); - test_lcp!(lcp5, vec!["ab", "a"], "a"); - test_lcp!(lcp6, vec!["a", "ab"], "a"); - test_lcp!(lcp7, vec!["ab", "b"], ""); - test_lcp!(lcp8, vec!["b", "ab"], ""); - test_lcp!(lcp9, vec!["foobar", "foobaz"], "fooba"); - test_lcp!(lcp10, vec!["foobar", "foobaz", "a"], ""); - test_lcp!(lcp11, vec!["a", "foobar", "foobaz"], ""); - test_lcp!(lcp12, vec!["foo", "flub", "flab", "floo"], "f"); - - // ************************************************************************ - // Tests for longest common suffix. - // ************************************************************************ - - macro_rules! 
test_lcs { - ($name:ident, $given:expr, $expected:expr) => { - #[test] - fn $name() { - let given: Vec = $given - .into_iter() - .map(|s: &str| Literal { - v: s.to_owned().into_bytes(), - cut: false, - }) - .collect(); - let lits = create_lits(given); - let got = lits.longest_common_suffix(); - assert_eq!($expected, escape_bytes(got)); - } - }; - } - - test_lcs!(lcs1, vec!["a"], "a"); - test_lcs!(lcs2, vec![], ""); - test_lcs!(lcs3, vec!["a", "b"], ""); - test_lcs!(lcs4, vec!["ab", "ab"], "ab"); - test_lcs!(lcs5, vec!["ab", "a"], ""); - test_lcs!(lcs6, vec!["a", "ab"], ""); - test_lcs!(lcs7, vec!["ab", "b"], "b"); - test_lcs!(lcs8, vec!["b", "ab"], "b"); - test_lcs!(lcs9, vec!["barfoo", "bazfoo"], "foo"); - test_lcs!(lcs10, vec!["barfoo", "bazfoo", "a"], ""); - test_lcs!(lcs11, vec!["a", "barfoo", "bazfoo"], ""); - test_lcs!(lcs12, vec!["flub", "bub", "boob", "dub"], "b"); -} diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 156bcc2844..a198083173 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -1,19 +1,42 @@ /*! -Defines a high-level intermediate representation for regular expressions. +Defines a high-level intermediate (HIR) representation for regular expressions. + +The HIR is represented by the [`Hir`] type, and it principally constructed via +[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users +may use the smart constructors defined on `Hir` to build their own by hand. The +smart constructors simultaneously simplify and "optimize" the HIR, and are also +the same routines used by translation. + +Most regex engines only have an HIR like this, and usually construct it +directly from the concrete syntax. This crate however first parses the +concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`, +as mentioned above. 
It's done this way to facilitate better error reporting, +and to have a structured representation of a regex that faithfully represents +its concrete syntax. Namely, while an `Hir` value can be converted back to an +equivalent regex pattern string, it is unlikely to look like the original due +to its simplified structure. */ -use std::char; -use std::cmp; -use std::error; -use std::fmt; -use std::result; -use std::u8; -use crate::ast::Span; -use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; -use crate::unicode; +use core::{char, cmp}; -pub use crate::hir::visitor::{visit, Visitor}; -pub use crate::unicode::CaseFoldError; +use alloc::{ + boxed::Box, + format, + string::{String, ToString}, + vec, + vec::Vec, +}; + +use crate::{ + ast::Span, + hir::interval::{Interval, IntervalSet, IntervalSetIter}, + unicode, +}; + +pub use crate::{ + hir::visitor::{visit, Visitor}, + unicode::CaseFoldError, +}; mod interval; pub mod literal; @@ -53,13 +76,17 @@ impl Error { } /// The type of an error that occurred while building an `Hir`. +/// +/// This error type is marked as `non_exhaustive`. This means that adding a +/// new variant is not considered a breaking change. +#[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// This error occurs when a Unicode feature is used when Unicode /// support is disabled. For example `(?-u:\pL)` would trigger this error. UnicodeNotAllowed, /// This error occurs when translating a pattern that could match a byte - /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. + /// sequence that isn't UTF-8 and `utf8` was enabled. InvalidUtf8, /// This occurs when an unrecognized Unicode property name could not /// be found. @@ -75,27 +102,22 @@ pub enum ErrorKind { /// available, and the regular expression required Unicode aware case /// insensitivity. UnicodeCaseUnavailable, - /// This occurs when the translator attempts to construct a character class - /// that is empty. 
- /// - /// Note that this restriction in the translator may be removed in the - /// future. - EmptyClassNotAllowed, - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. (Otherwise, adding a new variant - /// could break existing code.) - #[doc(hidden)] - __Nonexhaustive, } -impl ErrorKind { - // TODO: Remove this method entirely on the next breaking semver release. - #[allow(deprecated)] - fn description(&self) -> &str { +#[cfg(feature = "std")] +impl std::error::Error for Error {} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::error::Formatter::from(self).fmt(f) + } +} + +impl core::fmt::Display for ErrorKind { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::ErrorKind::*; - match *self { + + let msg = match *self { UnicodeNotAllowed => "Unicode not allowed here", InvalidUtf8 => "pattern can match invalid UTF-8", UnicodePropertyNotFound => "Unicode property not found", @@ -108,112 +130,82 @@ impl ErrorKind { "Unicode-aware case insensitivity matching is not available \ (make sure the unicode-case feature is enabled)" } - EmptyClassNotAllowed => "empty character classes are not allowed", - __Nonexhaustive => unreachable!(), - } - } -} - -impl error::Error for Error { - // TODO: Remove this method entirely on the next breaking semver release. - #[allow(deprecated)] - fn description(&self) -> &str { - self.kind.description() - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - crate::error::Formatter::from(self).fmt(f) - } -} - -impl fmt::Display for ErrorKind { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // TODO: Remove this on the next breaking semver release. 
- #[allow(deprecated)] - f.write_str(self.description()) + }; + f.write_str(msg) } } /// A high-level intermediate representation (HIR) for a regular expression. /// -/// The HIR of a regular expression represents an intermediate step between its -/// abstract syntax (a structured description of the concrete syntax) and -/// compiled byte codes. The purpose of HIR is to make regular expressions +/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`]. +/// An `HirKind` indicates what kind of regular expression it is (a literal, +/// a repetition, a look-around assertion, etc.), where as a `Properties` +/// describes various facts about the regular expression. For example, whether +/// it matches UTF-8 or if it matches the empty string. +/// +/// The HIR of a regular expression represents an intermediate step between +/// its abstract syntax (a structured description of the concrete syntax) and +/// an actual regex matcher. The purpose of HIR is to make regular expressions /// easier to analyze. In particular, the AST is much more complex than the /// HIR. For example, while an AST supports arbitrarily nested character /// classes, the HIR will flatten all nested classes into a single set. The HIR /// will also "compile away" every flag present in the concrete syntax. For /// example, users of HIR expressions never need to worry about case folding; -/// it is handled automatically by the translator (e.g., by translating `(?i)A` -/// to `[aA]`). -/// -/// If the HIR was produced by a translator that disallows invalid UTF-8, then -/// the HIR is guaranteed to match UTF-8 exclusively. -/// -/// This type defines its own destructor that uses constant stack space and -/// heap space proportional to the size of the HIR. +/// it is handled automatically by the translator (e.g., by translating +/// `(?i:A)` to `[aA]`). /// /// The specific type of an HIR expression can be accessed via its `kind` /// or `into_kind` methods. 
This extra level of indirection exists for two
/// reasons:
///
-/// 1. Construction of an HIR expression *must* use the constructor methods
-/// on this `Hir` type instead of building the `HirKind` values directly.
-/// This permits construction to enforce invariants like "concatenations
-/// always consist of two or more sub-expressions."
+/// 1. Construction of an HIR expression *must* use the constructor methods on
+/// this `Hir` type instead of building the `HirKind` values directly. This
+/// permits construction to enforce invariants like "concatenations always
+/// consist of two or more sub-expressions."
 /// 2. Every HIR expression contains attributes that are defined inductively,
-/// and can be computed cheaply during the construction process. For
-/// example, one such attribute is whether the expression must match at the
-/// beginning of the text.
+/// and can be computed cheaply during the construction process. For example,
+/// one such attribute is whether the expression must match at the beginning of
+/// the haystack.
+///
+/// In particular, if you have an `HirKind` value, then there is intentionally
+/// no way to build an `Hir` value from it. You instead need to do case
+/// analysis on the `HirKind` value and build the `Hir` value using its smart
+/// constructors.
+///
+/// # UTF-8
+///
+/// If the HIR was produced by a translator with
+/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
+/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
+/// matches.
+///
+/// For empty matches, those can occur at any position. It is the
+/// responsibility of the regex engine to determine whether empty matches are
+/// permitted between the code units of a single codepoint.
+///
+/// # Stack space
+///
+/// This type defines its own destructor that uses constant stack space and
+/// heap space proportional to the size of the HIR.
/// /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular /// expression pattern string, and uses constant stack space and heap space -/// proportional to the size of the `Hir`. -#[derive(Clone, Debug, Eq, PartialEq)] +/// proportional to the size of the `Hir`. The regex it prints is guaranteed to +/// be _semantically_ equivalent to the original concrete syntax, but it may +/// look very different. (And potentially not practically readable by a human.) +/// +/// An `Hir`'s `fmt::Debug` implementation currently does not use constant +/// stack space. The implementation will also suppress some details (such as +/// the `Properties` inlined into every `Hir` value to make it less noisy). +#[derive(Clone, Eq, PartialEq)] pub struct Hir { /// The underlying HIR kind. kind: HirKind, /// Analysis info about this HIR, computed during construction. - info: HirInfo, -} - -/// The kind of an arbitrary `Hir` expression. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum HirKind { - /// The empty regular expression, which matches everything, including the - /// empty string. - Empty, - /// A single literal character that matches exactly this character. - Literal(Literal), - /// A single character class that matches any of the characters in the - /// class. A class can either consist of Unicode scalar values as - /// characters, or it can use bytes. - Class(Class), - /// An anchor assertion. An anchor assertion match always has zero length. - Anchor(Anchor), - /// A word boundary assertion, which may or may not be Unicode aware. A - /// word boundary assertion match always has zero length. - WordBoundary(WordBoundary), - /// A repetition operation applied to a child expression. - Repetition(Repetition), - /// A possibly capturing group, which contains a child expression. - Group(Group), - /// A concatenation of expressions. A concatenation always has at least two - /// child expressions. 
- /// - /// A concatenation matches only if each of its child expression matches - /// one after the other. - Concat(Vec), - /// An alternation of expressions. An alternation always has at least two - /// child expressions. - /// - /// An alternation matches only if at least one of its child expression - /// matches. If multiple expressions match, then the leftmost is preferred. - Alternation(Vec), + props: Properties, } +/// Methods for accessing the underlying `HirKind` and `Properties`. impl Hir { /// Returns a reference to the underlying HIR kind. pub fn kind(&self) -> &HirKind { @@ -223,543 +215,560 @@ impl Hir { /// Consumes ownership of this HIR expression and returns its underlying /// `HirKind`. pub fn into_kind(mut self) -> HirKind { - use std::mem; - mem::replace(&mut self.kind, HirKind::Empty) + core::mem::replace(&mut self.kind, HirKind::Empty) + } + + /// Returns the properties computed for this `Hir`. + pub fn properties(&self) -> &Properties { + &self.props } + /// Splits this HIR into its constituent parts. + /// + /// This is useful because `let Hir { kind, props } = hir;` does not work + /// because of `Hir`'s custom `Drop` implementation. + fn into_parts(mut self) -> (HirKind, Properties) { + ( + core::mem::replace(&mut self.kind, HirKind::Empty), + core::mem::replace(&mut self.props, Properties::empty()), + ) + } +} + +/// Smart constructors for HIR values. +/// +/// These constructors are called "smart" because they do inductive work or +/// simplifications. For example, calling `Hir::repetition` with a repetition +/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind +/// since it is equivalent to an empty regex. Another example is calling +/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll +/// just get back the original `expr` since it's precisely equivalent. 
+///
+/// Smart constructors enable maintaining invariants about the HIR data type
+/// while also simultaneously keeping the representation as simple as possible.
+impl Hir {
     /// Returns an empty HIR expression.
     ///
     /// An empty HIR expression always matches, including the empty string.
+    #[inline]
     pub fn empty() -> Hir {
-        let mut info = HirInfo::new();
-        info.set_always_utf8(true);
-        info.set_all_assertions(true);
-        info.set_anchored_start(false);
-        info.set_anchored_end(false);
-        info.set_line_anchored_start(false);
-        info.set_line_anchored_end(false);
-        info.set_any_anchored_start(false);
-        info.set_any_anchored_end(false);
-        info.set_match_empty(true);
-        info.set_literal(false);
-        info.set_alternation_literal(false);
-        Hir { kind: HirKind::Empty, info }
+        let props = Properties::empty();
+        Hir { kind: HirKind::Empty, props }
+    }
+
+    /// Returns an HIR expression that can never match anything. That is,
+    /// the size of the set of strings in the language described by the HIR
+    /// returned is `0`.
+    ///
+    /// This is distinct from [`Hir::empty`] in that the empty string matches
+    /// the HIR returned by `Hir::empty`. That is, the set of strings in the
+    /// language described by `Hir::empty` is non-empty.
+    ///
+    /// Note that currently, the HIR returned uses an empty character class to
+    /// indicate that nothing can match. An equivalent expression that cannot
+    /// match is an empty alternation, but all such "fail" expressions are
+    /// normalized (via smart constructors) to empty character classes. This is
+    /// because empty character classes can be spelled in the concrete syntax
+    /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
+    /// empty alternations cannot.
+ #[inline] + pub fn fail() -> Hir { + let class = Class::Bytes(ClassBytes::empty()); + let props = Properties::class(&class); + // We can't just call Hir::class here because it defers to Hir::fail + // in order to canonicalize the Hir value used to represent "cannot + // match." + Hir { kind: HirKind::Class(class), props } } /// Creates a literal HIR expression. /// - /// If the given literal has a `Byte` variant with an ASCII byte, then this - /// method panics. This enforces the invariant that `Byte` variants are - /// only used to express matching of invalid UTF-8. - pub fn literal(lit: Literal) -> Hir { - if let Literal::Byte(b) = lit { - assert!(b > 0x7F); + /// This accepts anything that can be converted into a `Box<[u8]>`. + /// + /// Note that there is no mechanism for storing a `char` or a `Box` + /// in an HIR. Everything is "just bytes." Whether a `Literal` (or + /// any HIR node) matches valid UTF-8 exclusively can be queried via + /// [`Properties::is_utf8`]. + /// + /// # Example + /// + /// This example shows that concatenations of `Literal` HIR values will + /// automatically get flattened and combined together. So for example, even + /// if you concat multiple `Literal` values that are themselves not valid + /// UTF-8, they might add up to valid UTF-8. This also demonstrates just + /// how "smart" Hir's smart constructors are. + /// + /// ``` + /// use regex_syntax::hir::{Hir, HirKind, Literal}; + /// + /// let literals = vec![ + /// Hir::literal([0xE2]), + /// Hir::literal([0x98]), + /// Hir::literal([0x83]), + /// ]; + /// // Each literal, on its own, is invalid UTF-8. + /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8())); + /// + /// let concat = Hir::concat(literals); + /// // But the concatenation is valid UTF-8! + /// assert!(concat.properties().is_utf8()); + /// + /// // And also notice that the literals have been concatenated into a + /// // single `Literal`, to the point where there is no explicit `Concat`! 
+    /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
+    /// assert_eq!(&expected, concat.kind());
+    /// ```
+    #[inline]
+    pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
+        let bytes = lit.into();
+        if bytes.is_empty() {
+            return Hir::empty();
         }
-        let mut info = HirInfo::new();
-        info.set_always_utf8(lit.is_unicode());
-        info.set_all_assertions(false);
-        info.set_anchored_start(false);
-        info.set_anchored_end(false);
-        info.set_line_anchored_start(false);
-        info.set_line_anchored_end(false);
-        info.set_any_anchored_start(false);
-        info.set_any_anchored_end(false);
-        info.set_match_empty(false);
-        info.set_literal(true);
-        info.set_alternation_literal(true);
-        Hir { kind: HirKind::Literal(lit), info }
-    }
-
-    /// Creates a class HIR expression.
+        let lit = Literal(bytes);
+        let props = Properties::literal(&lit);
+        Hir { kind: HirKind::Literal(lit), props }
+    }
+
+    /// Creates a class HIR expression. The class may either be defined over
+    /// ranges of Unicode codepoints or ranges of raw byte values.
+    ///
+    /// Note that an empty class is permitted. An empty class is equivalent to
+    /// `Hir::fail()`.
+    #[inline]
     pub fn class(class: Class) -> Hir {
-        let mut info = HirInfo::new();
-        info.set_always_utf8(class.is_always_utf8());
-        info.set_all_assertions(false);
-        info.set_anchored_start(false);
-        info.set_anchored_end(false);
-        info.set_line_anchored_start(false);
-        info.set_line_anchored_end(false);
-        info.set_any_anchored_start(false);
-        info.set_any_anchored_end(false);
-        info.set_match_empty(false);
-        info.set_literal(false);
-        info.set_alternation_literal(false);
-        Hir { kind: HirKind::Class(class), info }
-    }
-
-    /// Creates an anchor assertion HIR expression.
- pub fn anchor(anchor: Anchor) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(true); - info.set_literal(false); - info.set_alternation_literal(false); - if let Anchor::StartText = anchor { - info.set_anchored_start(true); - info.set_line_anchored_start(true); - info.set_any_anchored_start(true); - } - if let Anchor::EndText = anchor { - info.set_anchored_end(true); - info.set_line_anchored_end(true); - info.set_any_anchored_end(true); - } - if let Anchor::StartLine = anchor { - info.set_line_anchored_start(true); - } - if let Anchor::EndLine = anchor { - info.set_line_anchored_end(true); + if class.is_empty() { + return Hir::fail(); + } else if let Some(bytes) = class.literal() { + return Hir::literal(bytes); } - Hir { kind: HirKind::Anchor(anchor), info } - } - - /// Creates a word boundary assertion HIR expression. - pub fn word_boundary(word_boundary: WordBoundary) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(false); - info.set_anchored_end(false); - info.set_line_anchored_start(false); - info.set_line_anchored_end(false); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_literal(false); - info.set_alternation_literal(false); - // A negated word boundary matches '', so that's fine. But \b does not - // match \b, so why do we say it can match the empty string? Well, - // because, if you search for \b against 'a', it will report [0, 0) and - // [1, 1) as matches, and both of those matches correspond to the empty - // string. Thus, only *certain* empty strings match \b, which similarly - // applies to \B. 
- info.set_match_empty(true); - // Negated ASCII word boundaries can match invalid UTF-8. - if let WordBoundary::AsciiNegate = word_boundary { - info.set_always_utf8(false); - } - Hir { kind: HirKind::WordBoundary(word_boundary), info } + let props = Properties::class(&class); + Hir { kind: HirKind::Class(class), props } + } + + /// Creates a look-around assertion HIR expression. + #[inline] + pub fn look(look: Look) -> Hir { + let props = Properties::look(look); + Hir { kind: HirKind::Look(look), props } } /// Creates a repetition HIR expression. + #[inline] pub fn repetition(rep: Repetition) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(rep.hir.is_always_utf8()); - info.set_all_assertions(rep.hir.is_all_assertions()); - // If this operator can match the empty string, then it can never - // be anchored. - info.set_anchored_start( - !rep.is_match_empty() && rep.hir.is_anchored_start(), - ); - info.set_anchored_end( - !rep.is_match_empty() && rep.hir.is_anchored_end(), - ); - info.set_line_anchored_start( - !rep.is_match_empty() && rep.hir.is_anchored_start(), - ); - info.set_line_anchored_end( - !rep.is_match_empty() && rep.hir.is_anchored_end(), - ); - info.set_any_anchored_start(rep.hir.is_any_anchored_start()); - info.set_any_anchored_end(rep.hir.is_any_anchored_end()); - info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Repetition(rep), info } - } - - /// Creates a group HIR expression. 
- pub fn group(group: Group) -> Hir { - let mut info = HirInfo::new(); - info.set_always_utf8(group.hir.is_always_utf8()); - info.set_all_assertions(group.hir.is_all_assertions()); - info.set_anchored_start(group.hir.is_anchored_start()); - info.set_anchored_end(group.hir.is_anchored_end()); - info.set_line_anchored_start(group.hir.is_line_anchored_start()); - info.set_line_anchored_end(group.hir.is_line_anchored_end()); - info.set_any_anchored_start(group.hir.is_any_anchored_start()); - info.set_any_anchored_end(group.hir.is_any_anchored_end()); - info.set_match_empty(group.hir.is_match_empty()); - info.set_literal(false); - info.set_alternation_literal(false); - Hir { kind: HirKind::Group(group), info } + // The regex 'a{0}' is always equivalent to the empty regex. This is + // true even when 'a' is an expression that never matches anything + // (like '\P{any}'). + // + // Additionally, the regex 'a{1}' is always equivalent to 'a'. + if rep.min == 0 && rep.max == Some(0) { + return Hir::empty(); + } else if rep.min == 1 && rep.max == Some(1) { + return *rep.sub; + } + let props = Properties::repetition(&rep); + Hir { kind: HirKind::Repetition(rep), props } + } + + /// Creates a capture HIR expression. + /// + /// Note that there is no explicit HIR value for a non-capturing group. + /// Since a non-capturing group only exists to override precedence in the + /// concrete syntax and since an HIR already does its own grouping based on + /// what is parsed, there is no need to explicitly represent non-capturing + /// groups in the HIR. + #[inline] + pub fn capture(capture: Capture) -> Hir { + let props = Properties::capture(&capture); + Hir { kind: HirKind::Capture(capture), props } } /// Returns the concatenation of the given expressions. /// - /// This flattens the concatenation as appropriate. 
-    pub fn concat(mut exprs: Vec<Hir>) -> Hir {
-        match exprs.len() {
-            0 => Hir::empty(),
-            1 => exprs.pop().unwrap(),
-            _ => {
-                let mut info = HirInfo::new();
-                info.set_always_utf8(true);
-                info.set_all_assertions(true);
-                info.set_any_anchored_start(false);
-                info.set_any_anchored_end(false);
-                info.set_match_empty(true);
-                info.set_literal(true);
-                info.set_alternation_literal(true);
-
-                // Some attributes require analyzing all sub-expressions.
-                for e in &exprs {
-                    let x = info.is_always_utf8() && e.is_always_utf8();
-                    info.set_always_utf8(x);
-
-                    let x = info.is_all_assertions() && e.is_all_assertions();
-                    info.set_all_assertions(x);
-
-                    let x = info.is_any_anchored_start()
-                        || e.is_any_anchored_start();
-                    info.set_any_anchored_start(x);
-
-                    let x =
-                        info.is_any_anchored_end() || e.is_any_anchored_end();
-                    info.set_any_anchored_end(x);
-
-                    let x = info.is_match_empty() && e.is_match_empty();
-                    info.set_match_empty(x);
-
-                    let x = info.is_literal() && e.is_literal();
-                    info.set_literal(x);
-
-                    let x = info.is_alternation_literal()
-                        && e.is_alternation_literal();
-                    info.set_alternation_literal(x);
+    /// This attempts to flatten and simplify the concatenation as appropriate.
+    ///
+    /// # Example
+    ///
+    /// This shows a simple example of basic flattening of both concatenations
+    /// and literals.
+    ///
+    /// ```
+    /// use regex_syntax::hir::Hir;
+    ///
+    /// let hir = Hir::concat(vec![
+    ///     Hir::concat(vec![
+    ///         Hir::literal([b'a']),
+    ///         Hir::literal([b'b']),
+    ///         Hir::literal([b'c']),
+    ///     ]),
+    ///     Hir::concat(vec![
+    ///         Hir::literal([b'x']),
+    ///         Hir::literal([b'y']),
+    ///         Hir::literal([b'z']),
+    ///     ]),
+    /// ]);
+    /// let expected = Hir::literal("abcxyz".as_bytes());
+    /// assert_eq!(expected, hir);
+    /// ```
+    pub fn concat(subs: Vec<Hir>) -> Hir {
+        // We rebuild the concatenation by simplifying it. Would be nice to do
+        // it in place, but that seems a little tricky?
+        let mut new = vec![];
+        // This gobbles up any adjacent literals in a concatenation and smushes
+        // them together. Basically, when we see a literal, we add its bytes
+        // to 'prior_lit', and whenever we see anything else, we first take
+        // any bytes in 'prior_lit' and add it to the 'new' concatenation.
+        let mut prior_lit: Option<Vec<u8>> = None;
+        for sub in subs {
+            let (kind, props) = sub.into_parts();
+            match kind {
+                HirKind::Literal(Literal(bytes)) => {
+                    if let Some(ref mut prior_bytes) = prior_lit {
+                        prior_bytes.extend_from_slice(&bytes);
+                    } else {
+                        prior_lit = Some(bytes.to_vec());
+                    }
+                }
+                // We also flatten concats that are direct children of another
+                // concat. We only need to do this one level deep since
+                // Hir::concat is the only way to build concatenations, and so
+                // flattening happens inductively.
+                HirKind::Concat(subs2) => {
+                    for sub2 in subs2 {
+                        let (kind2, props2) = sub2.into_parts();
+                        match kind2 {
+                            HirKind::Literal(Literal(bytes)) => {
+                                if let Some(ref mut prior_bytes) = prior_lit {
+                                    prior_bytes.extend_from_slice(&bytes);
+                                } else {
+                                    prior_lit = Some(bytes.to_vec());
+                                }
+                            }
+                            kind2 => {
+                                if let Some(prior_bytes) = prior_lit.take() {
+                                    new.push(Hir::literal(prior_bytes));
+                                }
+                                new.push(Hir { kind: kind2, props: props2 });
+                            }
+                        }
+                    }
+                }
+                // We can just skip empty HIRs.
+                HirKind::Empty => {}
+                kind => {
+                    if let Some(prior_bytes) = prior_lit.take() {
+                        new.push(Hir::literal(prior_bytes));
+                    }
+                    new.push(Hir { kind, props });
                 }
-                // Anchored attributes require something slightly more
-                // sophisticated. Normally, WLOG, to determine whether an
-                // expression is anchored to the start, we'd only need to check
-                // the first expression of a concatenation. However,
-                // expressions like `$\b^` are still anchored to the start,
-                // but the first expression in the concatenation *isn't*
-                // anchored to the start.
So the "first" expression to look at - // is actually one that is either not an assertion or is - // specifically the StartText assertion. - info.set_anchored_start( - exprs - .iter() - .take_while(|e| { - e.is_anchored_start() || e.is_all_assertions() - }) - .any(|e| e.is_anchored_start()), - ); - // Similarly for the end anchor, but in reverse. - info.set_anchored_end( - exprs - .iter() - .rev() - .take_while(|e| { - e.is_anchored_end() || e.is_all_assertions() - }) - .any(|e| e.is_anchored_end()), - ); - // Repeat the process for line anchors. - info.set_line_anchored_start( - exprs - .iter() - .take_while(|e| { - e.is_line_anchored_start() || e.is_all_assertions() - }) - .any(|e| e.is_line_anchored_start()), - ); - info.set_line_anchored_end( - exprs - .iter() - .rev() - .take_while(|e| { - e.is_line_anchored_end() || e.is_all_assertions() - }) - .any(|e| e.is_line_anchored_end()), - ); - Hir { kind: HirKind::Concat(exprs), info } } } + if let Some(prior_bytes) = prior_lit.take() { + new.push(Hir::literal(prior_bytes)); + } + if new.is_empty() { + return Hir::empty(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + let props = Properties::concat(&new); + Hir { kind: HirKind::Concat(new), props } } /// Returns the alternation of the given expressions. /// - /// This flattens the alternation as appropriate. - pub fn alternation(mut exprs: Vec) -> Hir { - match exprs.len() { - 0 => Hir::empty(), - 1 => exprs.pop().unwrap(), - _ => { - let mut info = HirInfo::new(); - info.set_always_utf8(true); - info.set_all_assertions(true); - info.set_anchored_start(true); - info.set_anchored_end(true); - info.set_line_anchored_start(true); - info.set_line_anchored_end(true); - info.set_any_anchored_start(false); - info.set_any_anchored_end(false); - info.set_match_empty(false); - info.set_literal(false); - info.set_alternation_literal(true); - - // Some attributes require analyzing all sub-expressions. 
- for e in &exprs { - let x = info.is_always_utf8() && e.is_always_utf8(); - info.set_always_utf8(x); - - let x = info.is_all_assertions() && e.is_all_assertions(); - info.set_all_assertions(x); - - let x = info.is_anchored_start() && e.is_anchored_start(); - info.set_anchored_start(x); - - let x = info.is_anchored_end() && e.is_anchored_end(); - info.set_anchored_end(x); - - let x = info.is_line_anchored_start() - && e.is_line_anchored_start(); - info.set_line_anchored_start(x); - - let x = info.is_line_anchored_end() - && e.is_line_anchored_end(); - info.set_line_anchored_end(x); - - let x = info.is_any_anchored_start() - || e.is_any_anchored_start(); - info.set_any_anchored_start(x); - - let x = - info.is_any_anchored_end() || e.is_any_anchored_end(); - info.set_any_anchored_end(x); - - let x = info.is_match_empty() || e.is_match_empty(); - info.set_match_empty(x); - - let x = info.is_alternation_literal() && e.is_literal(); - info.set_alternation_literal(x); + /// This flattens and simplifies the alternation as appropriate. This may + /// include factoring out common prefixes or even rewriting the alternation + /// as a character class. + /// + /// Note that an empty alternation is equivalent to `Hir::fail()`. (It + /// is not possible for one to write an empty alternation, or even an + /// alternation with a single sub-expression, in the concrete syntax of a + /// regex.) + /// + /// # Example + /// + /// This is a simple example showing how an alternation might get + /// simplified. 
+    ///
+    /// ```
+    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
+    ///
+    /// let hir = Hir::alternation(vec![
+    ///     Hir::literal([b'a']),
+    ///     Hir::literal([b'b']),
+    ///     Hir::literal([b'c']),
+    ///     Hir::literal([b'd']),
+    ///     Hir::literal([b'e']),
+    ///     Hir::literal([b'f']),
+    /// ]);
+    /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
+    ///     ClassUnicodeRange::new('a', 'f'),
+    /// ])));
+    /// assert_eq!(expected, hir);
+    /// ```
+    ///
+    /// And another example showing how common prefixes might get factored
+    /// out.
+    ///
+    /// ```
+    /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
+    ///
+    /// let hir = Hir::alternation(vec![
+    ///     Hir::concat(vec![
+    ///         Hir::literal("abc".as_bytes()),
+    ///         Hir::class(Class::Unicode(ClassUnicode::new([
+    ///             ClassUnicodeRange::new('A', 'Z'),
+    ///         ]))),
+    ///     ]),
+    ///     Hir::concat(vec![
+    ///         Hir::literal("abc".as_bytes()),
+    ///         Hir::class(Class::Unicode(ClassUnicode::new([
+    ///             ClassUnicodeRange::new('a', 'z'),
+    ///         ]))),
+    ///     ]),
+    /// ]);
+    /// let expected = Hir::concat(vec![
+    ///     Hir::literal("abc".as_bytes()),
+    ///     Hir::alternation(vec![
+    ///         Hir::class(Class::Unicode(ClassUnicode::new([
+    ///             ClassUnicodeRange::new('A', 'Z'),
+    ///         ]))),
+    ///         Hir::class(Class::Unicode(ClassUnicode::new([
+    ///             ClassUnicodeRange::new('a', 'z'),
+    ///         ]))),
+    ///     ]),
+    /// ]);
+    /// assert_eq!(expected, hir);
+    /// ```
+    ///
+    /// Note that these sorts of simplifications are not guaranteed.
+    pub fn alternation(subs: Vec<Hir>) -> Hir {
+        // We rebuild the alternation by simplifying it. We proceed similarly
+        // as the concatenation case. But in this case, there's no literal
+        // simplification happening. We're just flattening alternations.
+ let mut new = vec![]; + for sub in subs { + let (kind, props) = sub.into_parts(); + match kind { + HirKind::Alternation(subs2) => { + new.extend(subs2); + } + kind => { + new.push(Hir { kind, props }); } - Hir { kind: HirKind::Alternation(exprs), info } } } - } - - /// Build an HIR expression for `.`. - /// - /// A `.` expression matches any character except for `\n`. To build an - /// expression that matches any character, including `\n`, use the `any` - /// method. - /// - /// If `bytes` is `true`, then this assumes characters are limited to a - /// single byte. - pub fn dot(bytes: bool) -> Hir { - if bytes { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\x09')); - cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); - Hir::class(Class::Bytes(cls)) - } else { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\x09')); - cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) + if new.is_empty() { + return Hir::fail(); + } else if new.len() == 1 { + return new.pop().unwrap(); + } + // Now that it's completely flattened, look for the special case of + // 'char1|char2|...|charN' and collapse that into a class. Note that + // we look for 'char' first and then bytes. The issue here is that if + // we find both non-ASCII codepoints and non-ASCII singleton bytes, + // then it isn't actually possible to smush them into a single class. + // (Because classes are either "all codepoints" or "all bytes." You + // can have a class that both matches non-ASCII but valid UTF-8 and + // invalid UTF-8.) So we look for all chars and then all bytes, and + // don't handle anything else. 
+ if let Some(singletons) = singleton_chars(&new) { + let it = singletons + .into_iter() + .map(|ch| ClassUnicodeRange { start: ch, end: ch }); + return Hir::class(Class::Unicode(ClassUnicode::new(it))); + } + if let Some(singletons) = singleton_bytes(&new) { + let it = singletons + .into_iter() + .map(|b| ClassBytesRange { start: b, end: b }); + return Hir::class(Class::Bytes(ClassBytes::new(it))); } + // Similar to singleton chars, we can also look for alternations of + // classes. Those can be smushed into a single class. + if let Some(cls) = class_chars(&new) { + return Hir::class(cls); + } + if let Some(cls) = class_bytes(&new) { + return Hir::class(cls); + } + // Factor out a common prefix if we can, which might potentially + // simplify the expression and unlock other optimizations downstream. + // It also might generally make NFA matching and DFA construction + // faster by reducing the scope of branching in the regex. + new = match lift_common_prefix(new) { + Ok(hir) => return hir, + Err(unchanged) => unchanged, + }; + let props = Properties::alternation(&new); + Hir { kind: HirKind::Alternation(new), props } } - /// Build an HIR expression for `(?s).`. + /// Returns an HIR expression for `.`. /// - /// A `(?s).` expression matches any character, including `\n`. To build an - /// expression that matches any character except for `\n`, then use the - /// `dot` method. + /// * [`Dot::AnyChar`] maps to `(?su-R:.)`. + /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`. + /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`. + /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`. + /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. + /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. /// - /// If `bytes` is `true`, then this assumes characters are limited to a - /// single byte. 
- pub fn any(bytes: bool) -> Hir { - if bytes { - let mut cls = ClassBytes::empty(); - cls.push(ClassBytesRange::new(b'\0', b'\xFF')); - Hir::class(Class::Bytes(cls)) - } else { - let mut cls = ClassUnicode::empty(); - cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); - Hir::class(Class::Unicode(cls)) - } - } - - /// Return true if and only if this HIR will always match valid UTF-8. + /// # Example /// - /// When this returns false, then it is possible for this HIR expression - /// to match invalid UTF-8. - pub fn is_always_utf8(&self) -> bool { - self.info.is_always_utf8() - } - - /// Returns true if and only if this entire HIR expression is made up of - /// zero-width assertions. + /// Note that this is a convenience routine for constructing the correct + /// character class based on the value of `Dot`. There is no explicit "dot" + /// HIR value. It is just an abbreviation for a common character class. /// - /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but - /// not `^a`. - pub fn is_all_assertions(&self) -> bool { - self.info.is_all_assertions() - } - - /// Return true if and only if this HIR is required to match from the - /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, - /// `^foo|^bar` but not `^foo|bar`. - pub fn is_anchored_start(&self) -> bool { - self.info.is_anchored_start() - } - - /// Return true if and only if this HIR is required to match at the end - /// of text. This includes expressions like `foo$`, `(foo|bar)$`, - /// `foo$|bar$` but not `foo$|bar`. - pub fn is_anchored_end(&self) -> bool { - self.info.is_anchored_end() - } - - /// Return true if and only if this HIR is required to match from the - /// beginning of text or the beginning of a line. This includes expressions - /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar` - /// but not `^foo|bar` or `(?m)^foo|bar`. 
+ /// ``` + /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange}; /// - /// Note that if `is_anchored_start` is `true`, then - /// `is_line_anchored_start` will also be `true`. The reverse implication - /// is not true. For example, `(?m)^foo` is line anchored, but not - /// `is_anchored_start`. - pub fn is_line_anchored_start(&self) -> bool { - self.info.is_line_anchored_start() + /// let hir = Hir::dot(Dot::AnyByte); + /// let expected = Hir::class(Class::Bytes(ClassBytes::new([ + /// ClassBytesRange::new(0x00, 0xFF), + /// ]))); + /// assert_eq!(expected, hir); + /// ``` + #[inline] + pub fn dot(dot: Dot) -> Hir { + match dot { + Dot::AnyChar => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByte => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyCharExceptLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyCharExceptCRLF => { + let mut cls = ClassUnicode::empty(); + cls.push(ClassUnicodeRange::new('\0', '\x09')); + cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); + cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); + Hir::class(Class::Unicode(cls)) + } + Dot::AnyByteExceptLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + Dot::AnyByteExceptCRLF => { + let mut cls = ClassBytes::empty(); + cls.push(ClassBytesRange::new(b'\0', b'\x09')); + cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); + cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); + Hir::class(Class::Bytes(cls)) + } + } } +} - /// Return true if and only if this HIR is required to match at the - /// end of text or the 
end of a line. This includes expressions like
-    /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
-    /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
+/// The underlying kind of an arbitrary [`Hir`] expression.
+///
+/// An `HirKind` is principally useful for doing case analysis on the type
+/// of a regular expression. If you're looking to build new `Hir` values,
+/// then you _must_ use the smart constructors defined on `Hir`, like
+/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
+/// not expose any way of building an `Hir` directly from an `HirKind`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum HirKind {
+    /// The empty regular expression, which matches everything, including the
+    /// empty string.
+    Empty,
+    /// A literal string that matches exactly these bytes.
+    Literal(Literal),
+    /// A single character class that matches any of the characters in the
+    /// class. A class can either consist of Unicode scalar values as
+    /// characters, or it can use bytes.
     ///
-    /// Note that if `is_anchored_end` is `true`, then
-    /// `is_line_anchored_end` will also be `true`. The reverse implication
-    /// is not true. For example, `(?m)foo$` is line anchored, but not
-    /// `is_anchored_end`.
-    pub fn is_line_anchored_end(&self) -> bool {
-        self.info.is_line_anchored_end()
-    }
-
-    /// Return true if and only if this HIR contains any sub-expression that
-    /// is required to match at the beginning of text. Specifically, this
-    /// returns true if the `^` symbol (when multiline mode is disabled) or the
-    /// `\A` escape appear anywhere in the regex.
-    pub fn is_any_anchored_start(&self) -> bool {
-        self.info.is_any_anchored_start()
-    }
-
-    /// Return true if and only if this HIR contains any sub-expression that is
-    /// required to match at the end of text. Specifically, this returns true
-    /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
-    /// appear anywhere in the regex.
-    pub fn is_any_anchored_end(&self) -> bool {
-        self.info.is_any_anchored_end()
-    }
-
-    /// Return true if and only if the empty string is part of the language
-    /// matched by this regular expression.
+    /// A class may be empty. In which case, it matches nothing.
+    Class(Class),
+    /// A look-around assertion. A look-around match always has zero length.
+    Look(Look),
+    /// A repetition operation applied to a sub-expression.
+    Repetition(Repetition),
+    /// A capturing group, which contains a sub-expression.
+    Capture(Capture),
+    /// A concatenation of expressions.
     ///
-    /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
-    /// and `\B`, but not `a` or `a+`.
-    pub fn is_match_empty(&self) -> bool {
-        self.info.is_match_empty()
-    }
-
-    /// Return true if and only if this HIR is a simple literal. This is only
-    /// true when this HIR expression is either itself a `Literal` or a
-    /// concatenation of only `Literal`s.
+    /// A concatenation matches only if each of its sub-expressions match one
+    /// after the other.
     ///
-    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
-    /// `` are not (even though that contain sub-expressions that are literals).
-    pub fn is_literal(&self) -> bool {
-        self.info.is_literal()
-    }
-
-    /// Return true if and only if this HIR is either a simple literal or an
-    /// alternation of simple literals. This is only
-    /// true when this HIR expression is either itself a `Literal` or a
-    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
+    /// Concatenations are guaranteed by `Hir`'s smart constructors to always
+    /// have at least two sub-expressions.
+    Concat(Vec<Hir>),
+    /// An alternation of expressions.
     ///
-    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
-    /// literals, but `f+`, `(foo)`, `foo()`, ``
-    /// are not (even though that contain sub-expressions that are literals).
-    pub fn is_alternation_literal(&self) -> bool {
-        self.info.is_alternation_literal()
-    }
+    /// An alternation matches only if at least one of its sub-expressions
+    /// match. If multiple sub-expressions match, then the leftmost is
+    /// preferred.
+    ///
+    /// Alternations are guaranteed by `Hir`'s smart constructors to always
+    /// have at least two sub-expressions.
+    Alternation(Vec<Hir>),
 }

 impl HirKind {
-    /// Return true if and only if this HIR is the empty regular expression.
-    ///
-    /// Note that this is not defined inductively. That is, it only tests if
-    /// this kind is the `Empty` variant. To get the inductive definition,
-    /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
-    pub fn is_empty(&self) -> bool {
-        match *self {
-            HirKind::Empty => true,
-            _ => false,
-        }
-    }
+    /// Returns a slice of this kind's sub-expressions, if any.
+    pub fn subs(&self) -> &[Hir] {
+        use core::slice::from_ref;

-    /// Returns true if and only if this kind has any (including possibly
-    /// empty) subexpressions.
-    pub fn has_subexprs(&self) -> bool {
         match *self {
             HirKind::Empty
             | HirKind::Literal(_)
             | HirKind::Class(_)
-            | HirKind::Anchor(_)
-            | HirKind::WordBoundary(_) => false,
-            HirKind::Group(_)
-            | HirKind::Repetition(_)
-            | HirKind::Concat(_)
-            | HirKind::Alternation(_) => true,
+            | HirKind::Look(_) => &[],
+            HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
+            HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
+            HirKind::Concat(ref subs) => subs,
+            HirKind::Alternation(ref subs) => subs,
         }
     }
 }

+impl core::fmt::Debug for Hir {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        self.kind.fmt(f)
+    }
+}
+
 /// Print a display representation of this Hir.
 ///
 /// The result of this is a valid regular expression pattern string.
 ///
 /// This implementation uses constant stack space and heap space proportional
 /// to the size of the `Hir`.
-impl fmt::Display for Hir { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use crate::hir::print::Printer; - Printer::new().print(self, f) +impl core::fmt::Display for Hir { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + crate::hir::print::Printer::new().print(self, f) } } /// The high-level intermediate representation of a literal. /// -/// A literal corresponds to a single character, where a character is either -/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters -/// are preferred whenever possible. In particular, a `Byte` variant is only -/// ever produced when it could match invalid UTF-8. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Literal { - /// A single character represented by a Unicode scalar value. - Unicode(char), - /// A single character represented by an arbitrary byte. - Byte(u8), -} +/// A literal corresponds to `0` or more bytes that should be matched +/// literally. The smart constructors defined on `Hir` will automatically +/// concatenate adjacent literals into one literal, and will even automatically +/// replace empty literals with `Hir::empty()`. +/// +/// Note that despite a literal being represented by a sequence of bytes, its +/// `Debug` implementation will attempt to print it as a normal string. (That +/// is, not a sequence of decimal numbers.) +#[derive(Clone, Eq, PartialEq)] +pub struct Literal(pub Box<[u8]>); -impl Literal { - /// Returns true if and only if this literal corresponds to a Unicode - /// scalar value. 
- pub fn is_unicode(&self) -> bool { - match *self { - Literal::Unicode(_) => true, - Literal::Byte(b) if b <= 0x7F => true, - Literal::Byte(_) => false, - } +impl core::fmt::Debug for Literal { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + crate::debug::Bytes(&self.0).fmt(f) } } @@ -773,13 +782,12 @@ impl Literal { /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// -/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may -/// be produced even when it exclusively matches valid UTF-8. This is because -/// a `Bytes` variant represents an intention by the author of the regular -/// expression to disable Unicode mode, which in turn impacts the semantics of -/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not -/// match the same set of strings. -#[derive(Clone, Debug, Eq, PartialEq)] +/// Note that `Bytes` variant may be produced even when it exclusively matches +/// valid UTF-8. This is because a `Bytes` variant represents an intention by +/// the author of the regular expression to disable Unicode mode, which in turn +/// impacts the semantics of case insensitive matching. For example, `(?i)k` +/// and `(?i-u)k` will not match the same set of strings. +#[derive(Clone, Eq, PartialEq)] pub enum Class { /// A set of characters represented by Unicode scalar values. Unicode(ClassUnicode), @@ -795,6 +803,15 @@ impl Class { /// /// If this is a byte oriented character class, then this will be limited /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Panics + /// + /// This routine panics when the case mapping data necessary for this + /// routine to complete is unavailable. This occurs when the `unicode-case` + /// feature is not enabled and the underlying class is Unicode oriented. + /// + /// Callers should prefer using `try_case_fold_simple` instead, which will + /// return an error instead of panicking. 
pub fn case_fold_simple(&mut self) { match *self { Class::Unicode(ref mut x) => x.case_fold_simple(), @@ -802,6 +819,29 @@ impl Class { } } + /// Apply Unicode simple case folding to this character class, in place. + /// The character class will be expanded to include all simple case folded + /// character variants. + /// + /// If this is a byte oriented character class, then this will be limited + /// to the ASCII ranges `A-Z` and `a-z`. + /// + /// # Error + /// + /// This routine returns an error when the case mapping data necessary + /// for this routine to complete is unavailable. This occurs when the + /// `unicode-case` feature is not enabled and the underlying class is + /// Unicode oriented. + pub fn try_case_fold_simple( + &mut self, + ) -> core::result::Result<(), CaseFoldError> { + match *self { + Class::Unicode(ref mut x) => x.try_case_fold_simple()?, + Class::Bytes(ref mut x) => x.case_fold_simple(), + } + Ok(()) + } + /// Negate this character class in place. /// /// After completion, this character class will contain precisely the @@ -824,38 +864,177 @@ impl Class { /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete /// syntax or in the parser builder. By default, Unicode mode is /// enabled. - pub fn is_always_utf8(&self) -> bool { + pub fn is_utf8(&self) -> bool { match *self { Class::Unicode(_) => true, - Class::Bytes(ref x) => x.is_all_ascii(), + Class::Bytes(ref x) => x.is_ascii(), } } -} - -/// A set of characters represented by Unicode scalar values. -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct ClassUnicode { - set: IntervalSet, -} -impl ClassUnicode { - /// Create a new class from a sequence of ranges. + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. /// - /// The given ranges do not need to be in any specific order, and ranges - /// may overlap. 
- pub fn new(ranges: I) -> ClassUnicode - where - I: IntoIterator, - { - ClassUnicode { set: IntervalSet::new(ranges) } + /// For non-empty byte oriented classes, this always returns `1`. For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// minimum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// // The empty string has a min length of 0. + /// let hir = parse(r"")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that can match the empty string but match more is still 0. + /// let hir = parse(r"a*")?; + /// assert_eq!(Some(0), hir.properties().minimum_len()); + /// // A regex that matches nothing has no minimum defined. + /// let hir = parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().minimum_len()); + /// // Character classes usually have a minimum length of 1. + /// let hir = parse(r"\w")?; + /// assert_eq!(Some(1), hir.properties().minimum_len()); + /// // But sometimes Unicode classes might be bigger! + /// let hir = parse(r"\p{Cyrillic}")?; + /// assert_eq!(Some(2), hir.properties().minimum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn minimum_len(&self) -> Option { + match *self { + Class::Unicode(ref x) => x.minimum_len(), + Class::Bytes(ref x) => x.minimum_len(), + } } - /// Create a new class with no ranges. - pub fn empty() -> ClassUnicode { - ClassUnicode::new(vec![]) + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// For non-empty byte oriented classes, this always returns `1`. 
For + /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or + /// `4`. For empty classes, `None` is returned. It is impossible for `0` to + /// be returned. + /// + /// # Example + /// + /// This example shows some examples of regexes and their corresponding + /// maximum length, if any. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// // The empty string has a max length of 0. + /// let hir = parse(r"")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // As do other types of regexes that only match the empty string. + /// let hir = parse(r"^$\b\B")?; + /// assert_eq!(Some(0), hir.properties().maximum_len()); + /// // A regex that matches nothing has no maximum defined. + /// let hir = parse(r"[a&&b]")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // Bounded repeats work as you expect. + /// let hir = parse(r"x{2,10}")?; + /// assert_eq!(Some(10), hir.properties().maximum_len()); + /// // An unbounded repeat means there is no maximum. + /// let hir = parse(r"x{2,}")?; + /// assert_eq!(None, hir.properties().maximum_len()); + /// // With Unicode enabled, \w can match up to 4 bytes! + /// let hir = parse(r"\w")?; + /// assert_eq!(Some(4), hir.properties().maximum_len()); + /// // Without Unicode enabled, \w matches at most 1 byte. + /// let hir = parse(r"(?-u)\w")?; + /// assert_eq!(Some(1), hir.properties().maximum_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn maximum_len(&self) -> Option<usize> { + match *self { + Class::Unicode(ref x) => x.maximum_len(), + Class::Bytes(ref x) => x.maximum_len(), + } } - /// Add a new range to this set. + /// Returns true if and only if this character class is empty. That is, + /// it has no elements. + /// + /// An empty character class can never match anything, including an empty string. 
+ pub fn is_empty(&self) -> bool { + match *self { + Class::Unicode(ref x) => x.ranges().is_empty(), + Class::Bytes(ref x) => x.ranges().is_empty(), + } + } + + /// If this class consists of exactly one element (whether a codepoint or a + /// byte), then return it as a literal byte string. + /// + /// If this class is empty or contains more than one element, then `None` + /// is returned. + pub fn literal(&self) -> Option> { + match *self { + Class::Unicode(ref x) => x.literal(), + Class::Bytes(ref x) => x.literal(), + } + } +} + +impl core::fmt::Debug for Class { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use crate::debug::Byte; + + let mut fmter = f.debug_set(); + match *self { + Class::Unicode(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(r.start..=r.end)); + } + } + Class::Bytes(ref cls) => { + for r in cls.ranges().iter() { + fmter.entry(&(Byte(r.start)..=Byte(r.end))); + } + } + } + fmter.finish() + } +} + +/// A set of characters represented by Unicode scalar values. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ClassUnicode { + set: IntervalSet, +} + +impl ClassUnicode { + /// Create a new class from a sequence of ranges. + /// + /// The given ranges do not need to be in any specific order, and ranges + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. + pub fn new(ranges: I) -> ClassUnicode + where + I: IntoIterator, + { + ClassUnicode { set: IntervalSet::new(ranges) } + } + + /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. + pub fn empty() -> ClassUnicode { + ClassUnicode::new(vec![]) + } + + /// Add a new range to this set. pub fn push(&mut self, range: ClassUnicodeRange) { self.set.push(range); } @@ -903,7 +1082,7 @@ impl ClassUnicode { /// `unicode-case` feature is not enabled. 
pub fn try_case_fold_simple( &mut self, - ) -> result::Result<(), CaseFoldError> { + ) -> core::result::Result<(), CaseFoldError> { self.set.case_fold_simple() } @@ -946,9 +1125,60 @@ impl ClassUnicode { /// Returns true if and only if this character class will either match /// nothing or only ASCII bytes. Stated differently, this returns false /// if and only if this class contains a non-ASCII codepoint. - pub fn is_all_ascii(&self) -> bool { + pub fn is_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option<usize> { + let first = self.ranges().get(0)?; + // Correct because c1 < c2 implies c1.len_utf8() <= c2.len_utf8(). + Some(first.start.len_utf8()) + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option<usize> { + let last = self.ranges().last()?; + // Correct because c1 < c2 implies c1.len_utf8() <= c2.len_utf8(). + Some(last.end.len_utf8()) + } + + /// If this class consists of exactly one codepoint, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one codepoint, then `None` + /// is returned. + pub fn literal(&self) -> Option<Vec<u8>> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes()) + } else { + None + } + } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent byte class. 
+ pub fn to_byte_class(&self) -> Option { + if !self.is_ascii() { + return None; + } + Some(ClassBytes::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our codepoint range is ASCII, the + // 'u8::try_from' calls below are guaranteed to be correct. + ClassBytesRange { + // MSRV(1.59): Use 'u8::try_from(c)' instead. + start: u8::try_from(u32::from(r.start)).unwrap(), + end: u8::try_from(u32::from(r.end)).unwrap(), + } + }))) + } } /// An iterator over all ranges in a Unicode character class. @@ -975,18 +1205,18 @@ pub struct ClassUnicodeRange { end: char, } -impl fmt::Debug for ClassUnicodeRange { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Debug for ClassUnicodeRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let start = if !self.start.is_whitespace() && !self.start.is_control() { self.start.to_string() } else { - format!("0x{:X}", self.start as u32) + format!("0x{:X}", u32::from(self.start)) }; let end = if !self.end.is_whitespace() && !self.end.is_control() { self.end.to_string() } else { - format!("0x{:X}", self.end as u32) + format!("0x{:X}", u32::from(self.end)) }; f.debug_struct("ClassUnicodeRange") .field("start", &start) @@ -1023,24 +1253,13 @@ impl Interval for ClassUnicodeRange { &self, ranges: &mut Vec, ) -> Result<(), unicode::CaseFoldError> { - if !unicode::contains_simple_case_mapping(self.start, self.end)? { + let mut folder = unicode::SimpleCaseFolder::new()?; + if !folder.overlaps(self.start, self.end) { return Ok(()); } - let start = self.start as u32; - let end = (self.end as u32).saturating_add(1); - let mut next_simple_cp = None; - for cp in (start..end).filter_map(char::from_u32) { - if next_simple_cp.map_or(false, |next| cp < next) { - continue; - } - let it = match unicode::simple_fold(cp)? 
{ - Ok(it) => it, - Err(next) => { - next_simple_cp = next; - continue; - } - }; - for cp_folded in it { + let (start, end) = (u32::from(self.start), u32::from(self.end)); + for cp in (start..=end).filter_map(char::from_u32) { + for &cp_folded in folder.mapping(cp) { ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); } } @@ -1072,6 +1291,18 @@ impl ClassUnicodeRange { pub fn end(&self) -> char { self.end } + + /// Returns the number of codepoints in this range. + pub fn len(&self) -> usize { + let diff = 1 + u32::from(self.end) - u32::from(self.start); + // This is likely to panic in 16-bit targets since a usize can only fit + // 2^16. It's not clear what to do here, other than to return an error + // when building a Unicode class that contains a range whose length + // overflows usize. (Which, to be honest, is probably quite common on + // 16-bit targets. For example, this would imply that '.' and '\p{any}' + // would be impossible to build.) + usize::try_from(diff).expect("char class len fits in usize") + } } /// A set of characters represented by arbitrary bytes (where one byte @@ -1085,7 +1316,8 @@ impl ClassBytes { /// Create a new class from a sequence of ranges. /// /// The given ranges do not need to be in any specific order, and ranges - /// may overlap. + /// may overlap. Ranges will automatically be sorted into a canonical + /// non-overlapping order. pub fn new(ranges: I) -> ClassBytes where I: IntoIterator, @@ -1094,6 +1326,9 @@ impl ClassBytes { } /// Create a new class with no ranges. + /// + /// An empty class matches nothing. That is, it is equivalent to + /// [`Hir::fail`]. pub fn empty() -> ClassBytes { ClassBytes::new(vec![]) } @@ -1115,410 +1350,1535 @@ impl ClassBytes { self.set.intervals() } - /// Expand this character class such that it contains all case folded - /// characters. 
For example, if this class consists of the range `a-z`, - /// then applying case folding will result in the class containing both the - /// ranges `a-z` and `A-Z`. - /// - /// Note that this only applies ASCII case folding, which is limited to the - /// characters `a-z` and `A-Z`. - pub fn case_fold_simple(&mut self) { - self.set.case_fold_simple().expect("ASCII case folding never fails"); + /// Expand this character class such that it contains all case folded + /// characters. For example, if this class consists of the range `a-z`, + /// then applying case folding will result in the class containing both the + /// ranges `a-z` and `A-Z`. + /// + /// Note that this only applies ASCII case folding, which is limited to the + /// characters `a-z` and `A-Z`. + pub fn case_fold_simple(&mut self) { + self.set.case_fold_simple().expect("ASCII case folding never fails"); + } + + /// Negate this byte class. + /// + /// For all `b` where `b` is any byte, if `b` was in this set, then it + /// will not be in this set after negation. + pub fn negate(&mut self) { + self.set.negate(); + } + + /// Union this byte class with the given byte class, in place. + pub fn union(&mut self, other: &ClassBytes) { + self.set.union(&other.set); + } + + /// Intersect this byte class with the given byte class, in place. + pub fn intersect(&mut self, other: &ClassBytes) { + self.set.intersect(&other.set); + } + + /// Subtract the given byte class from this byte class, in place. + pub fn difference(&mut self, other: &ClassBytes) { + self.set.difference(&other.set); + } + + /// Compute the symmetric difference of the given byte classes, in place. + /// + /// This computes the symmetric difference of two byte classes. This + /// removes all elements in this class that are also in the given class, + /// but also adds all elements from the given class that aren't in this + /// class. 
That is, the class will contain all elements in either class, + /// but will not contain any elements that are in both classes. + pub fn symmetric_difference(&mut self, other: &ClassBytes) { + self.set.symmetric_difference(&other.set); + } + + /// Returns true if and only if this character class will either match + /// nothing or only ASCII bytes. Stated differently, this returns false + /// if and only if this class contains a non-ASCII byte. + pub fn is_ascii(&self) -> bool { + self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) + } + + /// Returns the length, in bytes, of the smallest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn minimum_len(&self) -> Option { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// Returns the length, in bytes, of the longest string matched by this + /// character class. + /// + /// Returns `None` when the class is empty. + pub fn maximum_len(&self) -> Option { + if self.ranges().is_empty() { + None + } else { + Some(1) + } + } + + /// If this class consists of exactly one byte, then return it as + /// a literal byte string. + /// + /// If this class is empty or contains more than one byte, then `None` + /// is returned. + pub fn literal(&self) -> Option> { + let rs = self.ranges(); + if rs.len() == 1 && rs[0].start == rs[0].end { + Some(vec![rs[0].start]) + } else { + None + } + } + + /// If this class consists of only ASCII ranges, then return its + /// corresponding and equivalent Unicode class. + pub fn to_unicode_class(&self) -> Option { + if !self.is_ascii() { + return None; + } + Some(ClassUnicode::new(self.ranges().iter().map(|r| { + // Since we are guaranteed that our byte range is ASCII, the + // 'char::from' calls below are correct and will not erroneously + // convert a raw byte value into its corresponding codepoint. 
+ ClassUnicodeRange { + start: char::from(r.start), + end: char::from(r.end), + } + }))) + } +} + +/// An iterator over all ranges in a byte character class. +/// +/// The lifetime `'a` refers to the lifetime of the underlying class. +#[derive(Debug)] +pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); + +impl<'a> Iterator for ClassBytesIter<'a> { + type Item = &'a ClassBytesRange; + + fn next(&mut self) -> Option<&'a ClassBytesRange> { + self.0.next() + } +} + +/// A single range of characters represented by arbitrary bytes. +/// +/// The range is closed. That is, the start and end of the range are included +/// in the range. +#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] +pub struct ClassBytesRange { + start: u8, + end: u8, +} + +impl Interval for ClassBytesRange { + type Bound = u8; + + #[inline] + fn lower(&self) -> u8 { + self.start + } + #[inline] + fn upper(&self) -> u8 { + self.end + } + #[inline] + fn set_lower(&mut self, bound: u8) { + self.start = bound; + } + #[inline] + fn set_upper(&mut self, bound: u8) { + self.end = bound; + } + + /// Apply simple case folding to this byte range. Only ASCII case mappings + /// (for a-z) are applied. + /// + /// Additional ranges are appended to the given vector. Canonical ordering + /// is *not* maintained in the given vector. + fn case_fold_simple( + &self, + ranges: &mut Vec, + ) -> Result<(), unicode::CaseFoldError> { + if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'a'); + let upper = cmp::min(self.end, b'z'); + ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); + } + if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { + let lower = cmp::max(self.start, b'A'); + let upper = cmp::min(self.end, b'Z'); + ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); + } + Ok(()) + } +} + +impl ClassBytesRange { + /// Create a new byte range for a character class. 
+ /// + /// The returned range is always in a canonical form. That is, the range + /// returned always satisfies the invariant that `start <= end`. + pub fn new(start: u8, end: u8) -> ClassBytesRange { + ClassBytesRange::create(start, end) + } + + /// Return the start of this range. + /// + /// The start of a range is always less than or equal to the end of the + /// range. + pub fn start(&self) -> u8 { + self.start + } + + /// Return the end of this range. + /// + /// The end of a range is always greater than or equal to the start of the + /// range. + pub fn end(&self) -> u8 { + self.end + } + + /// Returns the number of bytes in this range. + pub fn len(&self) -> usize { + usize::from(self.end.checked_sub(self.start).unwrap()) + .checked_add(1) + .unwrap() + } +} + +impl core::fmt::Debug for ClassBytesRange { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("ClassBytesRange") + .field("start", &crate::debug::Byte(self.start)) + .field("end", &crate::debug::Byte(self.end)) + .finish() + } +} + +/// The high-level intermediate representation for a look-around assertion. +/// +/// An assertion match is always zero-length. Also called an "empty match." +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Look { + /// Match the beginning of text. Specifically, this matches at the starting + /// position of the input. + Start = 1 << 0, + /// Match the end of text. Specifically, this matches at the ending + /// position of the input. + End = 1 << 1, + /// Match the beginning of a line or the beginning of text. Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following a `\n` character. + StartLF = 1 << 2, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\n` character. + EndLF = 1 << 3, + /// Match the beginning of a line or the beginning of text. 
Specifically, + /// this matches at the starting position of the input, or at the position + /// immediately following either a `\r` or `\n` character, but never after + /// a `\r` when a `\n` follows. + StartCRLF = 1 << 4, + /// Match the end of a line or the end of text. Specifically, this matches + /// at the end position of the input, or at the position immediately + /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` + /// precedes it. + EndCRLF = 1 << 5, + /// Match an ASCII-only word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordAscii = 1 << 6, + /// Match an ASCII-only negation of a word boundary. + WordAsciiNegate = 1 << 7, + /// Match a Unicode-aware word boundary. That is, this matches a position + /// where the left adjacent character and right adjacent character + /// correspond to a word and non-word or a non-word and word character. + WordUnicode = 1 << 8, + /// Match a Unicode-aware negation of a word boundary. + WordUnicodeNegate = 1 << 9, +} + +impl Look { + /// Flip the look-around assertion to its equivalent for reverse searches. + /// For example, `StartLF` gets translated to `EndLF`. + /// + /// Some assertions, such as `WordUnicode`, remain the same since they + /// match the same positions regardless of the direction of the search. + #[inline] + pub const fn reversed(self) -> Look { + match self { + Look::Start => Look::End, + Look::End => Look::Start, + Look::StartLF => Look::EndLF, + Look::EndLF => Look::StartLF, + Look::StartCRLF => Look::EndCRLF, + Look::EndCRLF => Look::StartCRLF, + Look::WordAscii => Look::WordAscii, + Look::WordAsciiNegate => Look::WordAsciiNegate, + Look::WordUnicode => Look::WordUnicode, + Look::WordUnicodeNegate => Look::WordUnicodeNegate, + } + } + + /// Return the underlying representation of this look-around enumeration + /// as an integer. 
Giving the return value to the [`Look::from_repr`] + /// constructor is guaranteed to return the same look-around variant that + /// one started with within a semver compatible release of this crate. + #[inline] + pub const fn as_repr(self) -> u16 { + // AFAIK, 'as' is the only way to zero-cost convert an int enum to an + // actual int. + self as u16 + } + + /// Given the underlying representation of a `Look` value, return the + /// corresponding `Look` value if the representation is valid. Otherwise + /// `None` is returned. + #[inline] + pub const fn from_repr(repr: u16) -> Option { + match repr { + 0b00_0000_0001 => Some(Look::Start), + 0b00_0000_0010 => Some(Look::End), + 0b00_0000_0100 => Some(Look::StartLF), + 0b00_0000_1000 => Some(Look::EndLF), + 0b00_0001_0000 => Some(Look::StartCRLF), + 0b00_0010_0000 => Some(Look::EndCRLF), + 0b00_0100_0000 => Some(Look::WordAscii), + 0b00_1000_0000 => Some(Look::WordAsciiNegate), + 0b01_0000_0000 => Some(Look::WordUnicode), + 0b10_0000_0000 => Some(Look::WordUnicodeNegate), + _ => None, + } + } + + /// Returns a convenient single codepoint representation of this + /// look-around assertion. Each assertion is guaranteed to be represented + /// by a distinct character. + /// + /// This is useful for succinctly representing a look-around assertion in + /// human friendly but succinct output intended for a programmer working on + /// regex internals. + #[inline] + pub const fn as_char(self) -> char { + match self { + Look::Start => 'A', + Look::End => 'z', + Look::StartLF => '^', + Look::EndLF => '$', + Look::StartCRLF => 'r', + Look::EndCRLF => 'R', + Look::WordAscii => 'b', + Look::WordAsciiNegate => 'B', + Look::WordUnicode => '𝛃', + Look::WordUnicodeNegate => '𝚩', + } + } +} + +/// The high-level intermediate representation for a capturing group. +/// +/// A capturing group always has an index and a child expression. It may +/// also have a name associated with it (e.g., `(?P\w)`), but it's not +/// necessary. 
+/// +/// Note that there is no explicit representation of a non-capturing group +/// in a `Hir`. Instead, non-capturing grouping is handled automatically by +/// the recursive structure of the `Hir` itself. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Capture { + /// The capture index of the capture. + pub index: u32, + /// The name of the capture, if it exists. + pub name: Option>, + /// The expression inside the capturing group, which may be empty. + pub sub: Box, +} + +/// The high-level intermediate representation of a repetition operator. +/// +/// A repetition operator permits the repetition of an arbitrary +/// sub-expression. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Repetition { + /// The minimum range of the repetition. + /// + /// Note that special cases like `?`, `+` and `*` all get translated into + /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. + /// + /// When `min` is zero, this expression can match the empty string + /// regardless of what its sub-expression is. + pub min: u32, + /// The maximum range of the repetition. + /// + /// Note that when `max` is `None`, `min` acts as a lower bound but where + /// there is no upper bound. For something like `x{5}` where the min and + /// max are equivalent, `min` will be set to `5` and `max` will be set to + /// `Some(5)`. + pub max: Option, + /// Whether this repetition operator is greedy or not. A greedy operator + /// will match as much as it can. A non-greedy operator will match as + /// little as it can. + /// + /// Typically, operators are greedy by default and are only non-greedy when + /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is + /// not. However, this can be inverted via the `U` "ungreedy" flag. + pub greedy: bool, + /// The expression being repeated. + pub sub: Box, +} + +impl Repetition { + /// Returns a new repetition with the same `min`, `max` and `greedy` + /// values, but with its sub-expression replaced with the one given. 
+ pub fn with(&self, sub: Hir) -> Repetition { + Repetition { + min: self.min, + max: self.max, + greedy: self.greedy, + sub: Box::new(sub), + } + } +} + +/// A type describing the different flavors of `.`. +/// +/// This type is meant to be used with [`Hir::dot`], which is a convenience +/// routine for building HIR values derived from the `.` regex. +#[non_exhaustive] +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Dot { + /// Matches the UTF-8 encoding of any Unicode scalar value. + /// + /// This is equivalent to `(?su:.)` and also `\p{any}`. + AnyChar, + /// Matches any byte value. + /// + /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. + AnyByte, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. + /// + /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. + AnyCharExceptLF, + /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r` + /// and `\n`. + /// + /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. + AnyCharExceptCRLF, + /// Matches any byte value except for `\n`. + /// + /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. + AnyByteExceptLF, + /// Matches any byte value except for `\r` and `\n`. + /// + /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`. + AnyByteExceptCRLF, +} + +/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack +/// space but heap space proportional to the depth of the total `Hir`. 
+impl Drop for Hir { + fn drop(&mut self) { + use core::mem; + + match *self.kind() { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => return, + HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, + HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { + return + } + HirKind::Concat(ref x) if x.is_empty() => return, + HirKind::Alternation(ref x) if x.is_empty() => return, + _ => {} + } + + let mut stack = vec![mem::replace(self, Hir::empty())]; + while let Some(mut expr) = stack.pop() { + match expr.kind { + HirKind::Empty + | HirKind::Literal(_) + | HirKind::Class(_) + | HirKind::Look(_) => {} + HirKind::Capture(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Repetition(ref mut x) => { + stack.push(mem::replace(&mut x.sub, Hir::empty())); + } + HirKind::Concat(ref mut x) => { + stack.extend(x.drain(..)); + } + HirKind::Alternation(ref mut x) => { + stack.extend(x.drain(..)); + } + } + } + } +} + +/// A type that collects various properties of an HIR value. +/// +/// Properties are always scalar values and represent meta data that is +/// computed inductively on an HIR value. Properties are defined for all +/// HIR values. +/// +/// All methods on a `Properties` value take constant time and are meant to +/// be cheap to call. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Properties(Box); + +/// The property definition. It is split out so that we can box it, and +/// there by make `Properties` use less stack size. This is kind-of important +/// because every HIR value has a `Properties` attached to it. +/// +/// This does have the unfortunate consequence that creating any HIR value +/// always leads to at least one alloc for properties, but this is generally +/// true anyway (for pretty much all HirKinds except for look-arounds). 
+#[derive(Clone, Debug, Eq, PartialEq)] +struct PropertiesI { + minimum_len: Option, + maximum_len: Option, + look_set: LookSet, + look_set_prefix: LookSet, + look_set_suffix: LookSet, + utf8: bool, + explicit_captures_len: usize, + static_explicit_captures_len: Option, + literal: bool, + alternation_literal: bool, +} + +impl Properties { + /// Returns the length (in bytes) of the smallest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when the HIR can match an + /// empty string. + /// + /// `None` is returned when there is no minimum length. This occurs in + /// precisely the cases where the HIR matches nothing. i.e., The language + /// the regex matches is empty. An example of such a regex is `\P{any}`. + #[inline] + pub fn minimum_len(&self) -> Option { + self.0.minimum_len + } + + /// Returns the length (in bytes) of the longest string matched by this + /// HIR. + /// + /// A return value of `0` is possible and occurs when nothing longer than + /// the empty string is in the language described by this HIR. + /// + /// `None` is returned when there is no longest matching string. This + /// occurs when the HIR matches nothing or when there is no upper bound on + /// the length of matching strings. Example of such regexes are `\P{any}` + /// (matches nothing) and `a+` (has no upper bound). + #[inline] + pub fn maximum_len(&self) -> Option { + self.0.maximum_len + } + + /// Returns a set of all look-around assertions that appear at least once + /// in this HIR value. + #[inline] + pub fn look_set(&self) -> LookSet { + self.0.look_set + } + + /// Returns a set of all look-around assertions that appear as a prefix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed before matching any bytes in a haystack. + /// + /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true + /// if and only if the HIR is fully anchored at the start. 
+ #[inline] + pub fn look_set_prefix(&self) -> LookSet { + self.0.look_set_prefix + } + + /// Returns a set of all look-around assertions that appear as a suffix for + /// this HIR value. That is, the set returned corresponds to the set of + /// assertions that must be passed in order to be considered a match after + /// all other consuming HIR expressions. + /// + /// For example, `hir.look_set_suffix().contains(Look::End)` returns true + /// if and only if the HIR is fully anchored at the end. + #[inline] + pub fn look_set_suffix(&self) -> LookSet { + self.0.look_set_suffix + } + + /// Return true if and only if the corresponding HIR will always match + /// valid UTF-8. + /// + /// When this returns false, then it is possible for this HIR expression to + /// match invalid UTF-8, including by matching between the code units of + /// a single UTF-8 encoded codepoint. + /// + /// Note that this returns true even when the corresponding HIR can match + /// the empty string. Since an empty string can technically appear between + /// UTF-8 code units, it is possible for a match to be reported that splits + /// a codepoint which could in turn be considered matching invalid UTF-8. + /// However, it is generally assumed that such empty matches are handled + /// specially by the search routine if it is absolutely required that + /// matches not split a codepoint. + /// + /// # Example + /// + /// This code example shows the UTF-8 property of a variety of patterns. + /// + /// ``` + /// use regex_syntax::{ParserBuilder, parse}; + /// + /// // Examples of 'is_utf8() == true'. 
+ /// assert!(parse(r"a")?.properties().is_utf8()); + /// assert!(parse(r"[^a]")?.properties().is_utf8()); + /// assert!(parse(r".")?.properties().is_utf8()); + /// assert!(parse(r"\W")?.properties().is_utf8()); + /// assert!(parse(r"\b")?.properties().is_utf8()); + /// assert!(parse(r"\B")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\b")?.properties().is_utf8()); + /// assert!(parse(r"(?-u)\B")?.properties().is_utf8()); + /// // Unicode mode is enabled by default, and in + /// // that mode, all \x hex escapes are treated as + /// // codepoints. So this actually matches the UTF-8 + /// // encoding of U+00FF. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// + /// // Now we show examples of 'is_utf8() == false'. + /// // The only way to do this is to force the parser + /// // to permit invalid UTF-8, otherwise all of these + /// // would fail to parse! + /// let parse = |pattern| { + /// ParserBuilder::new().utf8(false).build().parse(pattern) + /// }; + /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u).")?.properties().is_utf8()); + /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8()); + /// // Conversely to the equivalent example above, + /// // when Unicode mode is disabled, \x hex escapes + /// // are treated as their raw byte values. + /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8()); + /// // Note that just because we disabled UTF-8 in the + /// // parser doesn't mean we still can't use Unicode. + /// // It is enabled by default, so \xFF is still + /// // equivalent to matching the UTF-8 encoding of + /// // U+00FF by default. + /// assert!(parse(r"\xFF")?.properties().is_utf8()); + /// // Even though we use raw bytes that individually + /// // are not valid UTF-8, when combined together, the + /// // overall expression *does* match valid UTF-8! 
+ /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8()); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn is_utf8(&self) -> bool { + self.0.utf8 + } + + /// Returns the total number of explicit capturing groups in the + /// corresponding HIR. + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match that is typically included by regex + /// engines. + /// + /// # Example + /// + /// This method will return `0` for `a` and `1` for `(a)`: + /// + /// ``` + /// use regex_syntax::parse; + /// + /// assert_eq!(0, parse("a")?.properties().explicit_captures_len()); + /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn explicit_captures_len(&self) -> usize { + self.0.explicit_captures_len + } + + /// Returns the total number of explicit capturing groups that appear in + /// every possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that this does not include the implicit capturing group + /// corresponding to the entire match. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex_syntax::parse; + /// + /// let len = |pattern| { + /// parse(pattern).map(|h| { + /// h.properties().static_explicit_captures_len() + /// }) + /// }; + /// + /// assert_eq!(Some(0), len("a")?); + /// assert_eq!(Some(1), len("(a)")?); + /// assert_eq!(Some(1), len("(a)|(b)")?); + /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(1), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_explicit_captures_len(&self) -> Option { + self.0.static_explicit_captures_len + } + + /// Return true if and only if this HIR is a simple literal. This is + /// only true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s. + /// + /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and + /// the empty string are not (even though they contain sub-expressions that + /// are literals). + #[inline] + pub fn is_literal(&self) -> bool { + self.0.literal + } + + /// Return true if and only if this HIR is either a simple literal or an + /// alternation of simple literals. This is only + /// true when this HIR expression is either itself a `Literal` or a + /// concatenation of only `Literal`s or an alternation of only `Literal`s. + /// + /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation + /// literals, but `f+`, `(foo)`, `foo()`, `` + /// are not (even though that contain sub-expressions that are literals). + #[inline] + pub fn is_alternation_literal(&self) -> bool { + self.0.alternation_literal + } + + /// Returns the total amount of heap memory usage, in bytes, used by this + /// `Properties` value. + #[inline] + pub fn memory_usage(&self) -> usize { + core::mem::size_of::() + } + + /// Returns a new set of properties that corresponds to the union of the + /// iterator of properties given. 
+ /// + /// This is useful when one has multiple `Hir` expressions and wants + /// to combine them into a single alternation without constructing the + /// corresponding `Hir`. This routine provides a way of combining the + /// properties of each `Hir` expression into one set of properties + /// representing the union of those expressions. + /// + /// # Example: union with HIRs that never match + /// + /// This example shows that unioning properties together with one that + /// represents a regex that never matches will "poison" certain attributes, + /// like the minimum and maximum lengths. + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// let hir1 = parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = parse(r"[a&&b]")?; + /// assert_eq!(None, hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(None, unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// The maximum length can also be "poisoned" by a pattern that has no + /// upper bound on the length of a match. 
The minimum length remains + /// unaffected: + /// + /// ``` + /// use regex_syntax::{hir::Properties, parse}; + /// + /// let hir1 = parse("ab?c?")?; + /// assert_eq!(Some(1), hir1.properties().minimum_len()); + /// assert_eq!(Some(3), hir1.properties().maximum_len()); + /// + /// let hir2 = parse(r"a+")?; + /// assert_eq!(Some(1), hir2.properties().minimum_len()); + /// assert_eq!(None, hir2.properties().maximum_len()); + /// + /// let hir3 = parse(r"wxy?z?")?; + /// assert_eq!(Some(2), hir3.properties().minimum_len()); + /// assert_eq!(Some(4), hir3.properties().maximum_len()); + /// + /// let unioned = Properties::union([ + /// hir1.properties(), + /// hir2.properties(), + /// hir3.properties(), + /// ]); + /// assert_eq!(Some(1), unioned.minimum_len()); + /// assert_eq!(None, unioned.maximum_len()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn union(props: I) -> Properties + where + I: IntoIterator, + P: core::borrow::Borrow, + { + let mut it = props.into_iter().peekable(); + // While empty alternations aren't possible, we still behave as if they + // are. When we have an empty alternate, then clearly the look-around + // prefix and suffix is empty. Otherwise, it is the intersection of all + // prefixes and suffixes (respectively) of the branches. + let fix = if it.peek().is_none() { + LookSet::empty() + } else { + LookSet::full() + }; + // And also, an empty alternate means we have 0 static capture groups, + // but we otherwise start with the number corresponding to the first + // alternate. If any subsequent alternate has a different number of + // static capture groups, then we overall have a variation and not a + // static number of groups. + let static_explicit_captures_len = + it.peek().and_then(|p| p.borrow().static_explicit_captures_len()); + // The base case is an empty alternation, which matches nothing. 
+ // Note though that empty alternations aren't possible, because the + // Hir::alternation smart constructor rewrites those as empty character + // classes. + let mut props = PropertiesI { + minimum_len: None, + maximum_len: None, + look_set: LookSet::empty(), + look_set_prefix: fix, + look_set_suffix: fix, + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len, + literal: false, + alternation_literal: true, + }; + let (mut min_poisoned, mut max_poisoned) = (false, false); + // Handle properties that need to visit every child hir. + for prop in it { + let p = prop.borrow(); + props.look_set.set_union(p.look_set()); + props.look_set_prefix.set_intersect(p.look_set_prefix()); + props.look_set_suffix.set_intersect(p.look_set_suffix()); + props.utf8 = props.utf8 && p.is_utf8(); + props.explicit_captures_len = props + .explicit_captures_len + .saturating_add(p.explicit_captures_len()); + if props.static_explicit_captures_len + != p.static_explicit_captures_len() + { + props.static_explicit_captures_len = None; + } + props.alternation_literal = + props.alternation_literal && p.is_alternation_literal(); + if !min_poisoned { + if let Some(xmin) = p.minimum_len() { + if props.minimum_len.map_or(true, |pmin| xmin < pmin) { + props.minimum_len = Some(xmin); + } + } else { + props.minimum_len = None; + min_poisoned = true; + } + } + if !max_poisoned { + if let Some(xmax) = p.maximum_len() { + if props.maximum_len.map_or(true, |pmax| xmax > pmax) { + props.maximum_len = Some(xmax); + } + } else { + props.maximum_len = None; + max_poisoned = true; + } + } + } + Properties(Box::new(props)) + } +} + +impl Properties { + /// Create a new set of HIR properties for an empty regex. 
+ fn empty() -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + // It is debatable whether an empty regex always matches at valid + // UTF-8 boundaries. Strictly speaking, at a byte oriented view, + // it is clearly false. There are, for example, many empty strings + // between the bytes encoding a '☃'. + // + // However, when Unicode mode is enabled, the fundamental atom + // of matching is really a codepoint. And in that scenario, an + // empty regex is defined to only match at valid UTF-8 boundaries + // and to never split a codepoint. It just so happens that this + // enforcement is somewhat tricky to do for regexes that match + // the empty string inside regex engines themselves. It usually + // requires some layer above the regex engine to filter out such + // matches. + // + // In any case, 'true' is really the only coherent option. If it + // were false, for example, then 'a*' would also need to be false + // since it too can match the empty string. + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a literal regex. + fn literal(lit: &Literal) -> Properties { + let inner = PropertiesI { + minimum_len: Some(lit.0.len()), + maximum_len: Some(lit.0.len()), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + utf8: core::str::from_utf8(&lit.0).is_ok(), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: true, + alternation_literal: true, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a character class. 
+ fn class(class: &Class) -> Properties { + let inner = PropertiesI { + minimum_len: class.minimum_len(), + maximum_len: class.maximum_len(), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + utf8: class.is_utf8(), + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a look-around assertion. + fn look(look: Look) -> Properties { + let inner = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::singleton(look), + look_set_prefix: LookSet::singleton(look), + look_set_suffix: LookSet::singleton(look), + // This requires a little explanation. Basically, we don't consider + // matching an empty string to be equivalent to matching invalid + // UTF-8, even though technically matching every empty string will + // split the UTF-8 encoding of a single codepoint when treating a + // UTF-8 encoded string as a sequence of bytes. Our defense here is + // that in such a case, a codepoint should logically be treated as + // the fundamental atom for matching, and thus the only valid match + // points are between codepoints and not bytes. + // + // More practically, this is true here because it's also true + // for 'Hir::empty()', otherwise something like 'a*' would be + // considered to match invalid UTF-8. That in turn makes this + // property borderline useless. + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: false, + alternation_literal: false, + }; + Properties(Box::new(inner)) + } + + /// Create a new set of HIR properties for a repetition. 
+    fn repetition(rep: &Repetition) -> Properties {
+        let p = rep.sub.properties();
+        let minimum_len = p.minimum_len().map(|child_min| {
+            let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
+            child_min.saturating_mul(rep_min)
+        });
+        let maximum_len = rep.max.and_then(|rep_max| {
+            let rep_max = usize::try_from(rep_max).ok()?;
+            let child_max = p.maximum_len()?;
+            child_max.checked_mul(rep_max)
+        });
+
+        let mut inner = PropertiesI {
+            minimum_len,
+            maximum_len,
+            look_set: p.look_set(),
+            look_set_prefix: LookSet::empty(),
+            look_set_suffix: LookSet::empty(),
+            utf8: p.is_utf8(),
+            explicit_captures_len: p.explicit_captures_len(),
+            static_explicit_captures_len: p.static_explicit_captures_len(),
+            literal: false,
+            alternation_literal: false,
+        };
+        // If the repetition operator can match the empty string, then its
+        // lookset prefix and suffixes themselves remain empty since they are
+        // no longer required to match.
+        if rep.min > 0 {
+            inner.look_set_prefix = p.look_set_prefix();
+            inner.look_set_suffix = p.look_set_suffix();
+        }
+        // If the static captures len of the sub-expression is not known or is
+        // zero, then it automatically propagates to the repetition, regardless
+        // of the repetition. Otherwise, it might change, but only when the
+        // repetition can match 0 times.
+        if rep.min == 0
+            && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
+        {
+            // If we require a match 0 times, then our captures len is
+            // guaranteed to be zero. Otherwise, if we *can* match the empty
+            // string, then it's impossible to know how many captures will be
+            // in the resulting match.
+            if rep.max == Some(0) {
+                inner.static_explicit_captures_len = Some(0);
+            } else {
+                inner.static_explicit_captures_len = None;
+            }
+        }
+        Properties(Box::new(inner))
+    }
+
+    /// Create a new set of HIR properties for a capture.
+ fn capture(capture: &Capture) -> Properties { + let p = capture.sub.properties(); + Properties(Box::new(PropertiesI { + explicit_captures_len: p.explicit_captures_len().saturating_add(1), + static_explicit_captures_len: p + .static_explicit_captures_len() + .map(|len| len.saturating_add(1)), + literal: false, + alternation_literal: false, + ..*p.0.clone() + })) + } + + /// Create a new set of HIR properties for a concatenation. + fn concat(concat: &[Hir]) -> Properties { + // The base case is an empty concatenation, which matches the empty + // string. Note though that empty concatenations aren't possible, + // because the Hir::concat smart constructor rewrites those as + // Hir::empty. + let mut props = PropertiesI { + minimum_len: Some(0), + maximum_len: Some(0), + look_set: LookSet::empty(), + look_set_prefix: LookSet::empty(), + look_set_suffix: LookSet::empty(), + utf8: true, + explicit_captures_len: 0, + static_explicit_captures_len: Some(0), + literal: true, + alternation_literal: true, + }; + // Handle properties that need to visit every child hir. 
+        for x in concat.iter() {
+            let p = x.properties();
+            props.look_set.set_union(p.look_set());
+            props.utf8 = props.utf8 && p.is_utf8();
+            props.explicit_captures_len = props
+                .explicit_captures_len
+                .saturating_add(p.explicit_captures_len());
+            props.static_explicit_captures_len = p
+                .static_explicit_captures_len()
+                .and_then(|len1| {
+                    Some((len1, props.static_explicit_captures_len?))
+                })
+                .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
+            props.literal = props.literal && p.is_literal();
+            props.alternation_literal =
+                props.alternation_literal && p.is_alternation_literal();
+            if let Some(ref mut minimum_len) = props.minimum_len {
+                match p.minimum_len() {
+                    None => props.minimum_len = None,
+                    Some(len) => *minimum_len += len,
+                }
+            }
+            if let Some(ref mut maximum_len) = props.maximum_len {
+                match p.maximum_len() {
+                    None => props.maximum_len = None,
+                    Some(len) => *maximum_len += len,
+                }
+            }
+        }
+        // Handle the prefix properties, which only requires visiting
+        // child exprs until one matches more than the empty string.
+        let mut it = concat.iter();
+        while let Some(x) = it.next() {
+            props.look_set_prefix.set_union(x.properties().look_set_prefix());
+            if x.properties().maximum_len().map_or(true, |x| x > 0) {
+                break;
+            }
+        }
+        // Same thing for the suffix properties, but in reverse.
+        let mut it = concat.iter().rev();
+        while let Some(x) = it.next() {
+            props.look_set_suffix.set_union(x.properties().look_set_suffix());
+            if x.properties().maximum_len().map_or(true, |x| x > 0) {
+                break;
+            }
+        }
+        Properties(Box::new(props))
+    }
+
+    /// Create a new set of HIR properties for an alternation.
+    fn alternation(alts: &[Hir]) -> Properties {
+        Properties::union(alts.iter().map(|hir| hir.properties()))
+    }
+}
+
+/// A set of look-around assertions.
+///
+/// This is useful for efficiently tracking look-around assertions. For
+/// example, an [`Hir`] provides properties that return `LookSet`s.
+#[derive(Clone, Copy, Default, Eq, PartialEq)]
+pub struct LookSet {
+    /// The underlying representation this set is exposed to make it possible
+    /// to store it somewhere efficiently. The representation is that
+    /// of a bitset, where each assertion occupies bit `i` where `i =
+    /// Look::as_repr()`.
+    ///
+    /// Note that users of this internal representation must permit the full
+    /// range of `u16` values to be represented. For example, even if the
+    /// current implementation only makes use of the 10 least significant bits,
+    /// it may use more bits in a future semver compatible release.
+    pub bits: u16,
+}
+
+impl LookSet {
+    /// Create an empty set of look-around assertions.
+    #[inline]
+    pub fn empty() -> LookSet {
+        LookSet { bits: 0 }
+    }
+
+    /// Create a full set of look-around assertions.
+    ///
+    /// This set contains all possible look-around assertions.
+    #[inline]
+    pub fn full() -> LookSet {
+        LookSet { bits: !0 }
+    }
+
+    /// Create a look-around set containing the look-around assertion given.
+    ///
+    /// This is a convenience routine for creating an empty set and inserting
+    /// one look-around assertion.
+    #[inline]
+    pub fn singleton(look: Look) -> LookSet {
+        LookSet::empty().insert(look)
+    }
+
+    /// Returns the total number of look-around assertions in this set.
+    #[inline]
+    pub fn len(self) -> usize {
+        // OK because max value always fits in a u8, which in turn always
+        // fits in a usize, regardless of target.
+        usize::try_from(self.bits.count_ones()).unwrap()
+    }
+
+    /// Returns true if and only if this set is empty.
+    #[inline]
+    pub fn is_empty(self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns true if and only if the given look-around assertion is in this
+    /// set.
+    #[inline]
+    pub fn contains(self, look: Look) -> bool {
+        self.bits & look.as_repr() != 0
+    }
+
+    /// Returns true if and only if this set contains any anchor assertions.
+    /// This includes both "start/end of haystack" and "start/end of line."
+ #[inline] + pub fn contains_anchor(&self) -> bool { + self.contains_anchor_haystack() || self.contains_anchor_line() + } + + /// Returns true if and only if this set contains any "start/end of + /// haystack" anchors. This doesn't include "start/end of line" anchors. + #[inline] + pub fn contains_anchor_haystack(&self) -> bool { + self.contains(Look::Start) || self.contains(Look::End) + } + + /// Returns true if and only if this set contains any "start/end of line" + /// anchors. This doesn't include "start/end of haystack" anchors. This + /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. + #[inline] + pub fn contains_anchor_line(&self) -> bool { + self.contains(Look::StartLF) + || self.contains(Look::EndLF) + || self.contains(Look::StartCRLF) + || self.contains(Look::EndCRLF) } - /// Negate this byte class. - /// - /// For all `b` where `b` is a any byte, if `b` was in this set, then it - /// will not be in this set after negation. - pub fn negate(&mut self) { - self.set.negate(); + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that only treat `\n` as line terminators. This does not include + /// haystack anchors or CRLF aware line anchors. + #[inline] + pub fn contains_anchor_lf(&self) -> bool { + self.contains(Look::StartLF) || self.contains(Look::EndLF) } - /// Union this byte class with the given byte class, in place. - pub fn union(&mut self, other: &ClassBytes) { - self.set.union(&other.set); + /// Returns true if and only if this set contains any "start/end of line" + /// anchors that are CRLF-aware. This doesn't include "start/end of + /// haystack" or "start/end of line-feed" anchors. + #[inline] + pub fn contains_anchor_crlf(&self) -> bool { + self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) } - /// Intersect this byte class with the given byte class, in place. 
- pub fn intersect(&mut self, other: &ClassBytes) { - self.set.intersect(&other.set); + /// Returns true if and only if this set contains any word boundary or + /// negated word boundary assertions. This include both Unicode and ASCII + /// word boundaries. + #[inline] + pub fn contains_word(self) -> bool { + self.contains_word_unicode() || self.contains_word_ascii() } - /// Subtract the given byte class from this byte class, in place. - pub fn difference(&mut self, other: &ClassBytes) { - self.set.difference(&other.set); + /// Returns true if and only if this set contains any Unicode word boundary + /// or negated Unicode word boundary assertions. + #[inline] + pub fn contains_word_unicode(self) -> bool { + self.contains(Look::WordUnicode) + || self.contains(Look::WordUnicodeNegate) } - /// Compute the symmetric difference of the given byte classes, in place. - /// - /// This computes the symmetric difference of two byte classes. This - /// removes all elements in this class that are also in the given class, - /// but all adds all elements from the given class that aren't in this - /// class. That is, the class will contain all elements in either class, - /// but will not contain any elements that are in both classes. - pub fn symmetric_difference(&mut self, other: &ClassBytes) { - self.set.symmetric_difference(&other.set); + /// Returns true if and only if this set contains any ASCII word boundary + /// or negated ASCII word boundary assertions. + #[inline] + pub fn contains_word_ascii(self) -> bool { + self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) } - /// Returns true if and only if this character class will either match - /// nothing or only ASCII bytes. Stated differently, this returns false - /// if and only if this class contains a non-ASCII byte. - pub fn is_all_ascii(&self) -> bool { - self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) + /// Returns an iterator over all of the look-around assertions in this set. 
+ #[inline] + pub fn iter(self) -> LookSetIter { + LookSetIter { set: self } } -} - -/// An iterator over all ranges in a byte character class. -/// -/// The lifetime `'a` refers to the lifetime of the underlying class. -#[derive(Debug)] -pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); -impl<'a> Iterator for ClassBytesIter<'a> { - type Item = &'a ClassBytesRange; + /// Return a new set that is equivalent to the original, but with the given + /// assertion added to it. If the assertion is already in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn insert(self, look: Look) -> LookSet { + LookSet { bits: self.bits | look.as_repr() } + } - fn next(&mut self) -> Option<&'a ClassBytesRange> { - self.0.next() + /// Updates this set in place with the result of inserting the given + /// assertion into this set. + #[inline] + pub fn set_insert(&mut self, look: Look) { + *self = self.insert(look); } -} -/// A single range of characters represented by arbitrary bytes. -/// -/// The range is closed. That is, the start and end of the range are included -/// in the range. -#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] -pub struct ClassBytesRange { - start: u8, - end: u8, -} + /// Return a new set that is equivalent to the original, but with the given + /// assertion removed from it. If the assertion is not in the set, then the + /// returned set is equivalent to the original. + #[inline] + pub fn remove(self, look: Look) -> LookSet { + LookSet { bits: self.bits & !look.as_repr() } + } -impl Interval for ClassBytesRange { - type Bound = u8; + /// Updates this set in place with the result of removing the given + /// assertion from this set. + #[inline] + pub fn set_remove(&mut self, look: Look) { + *self = self.remove(look); + } + /// Returns a new set that is the result of subtracting the given set from + /// this set. 
#[inline] - fn lower(&self) -> u8 { - self.start + pub fn subtract(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & !other.bits } } + + /// Updates this set in place with the result of subtracting the given set + /// from this set. #[inline] - fn upper(&self) -> u8 { - self.end + pub fn set_subtract(&mut self, other: LookSet) { + *self = self.subtract(other); } + + /// Returns a new set that is the union of this and the one given. #[inline] - fn set_lower(&mut self, bound: u8) { - self.start = bound; + pub fn union(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits | other.bits } } + + /// Updates this set in place with the result of unioning it with the one + /// given. #[inline] - fn set_upper(&mut self, bound: u8) { - self.end = bound; + pub fn set_union(&mut self, other: LookSet) { + *self = self.union(other); } - /// Apply simple case folding to this byte range. Only ASCII case mappings - /// (for a-z) are applied. - /// - /// Additional ranges are appended to the given vector. Canonical ordering - /// is *not* maintained in the given vector. - fn case_fold_simple( - &self, - ranges: &mut Vec, - ) -> Result<(), unicode::CaseFoldError> { - if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { - let lower = cmp::max(self.start, b'a'); - let upper = cmp::min(self.end, b'z'); - ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); - } - if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { - let lower = cmp::max(self.start, b'A'); - let upper = cmp::min(self.end, b'Z'); - ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); - } - Ok(()) + /// Returns a new set that is the intersection of this and the one given. + #[inline] + pub fn intersect(self, other: LookSet) -> LookSet { + LookSet { bits: self.bits & other.bits } } -} -impl ClassBytesRange { - /// Create a new byte range for a character class. - /// - /// The returned range is always in a canonical form. 
That is, the range - /// returned always satisfies the invariant that `start <= end`. - pub fn new(start: u8, end: u8) -> ClassBytesRange { - ClassBytesRange::create(start, end) + /// Updates this set in place with the result of intersecting it with the + /// one given. + #[inline] + pub fn set_intersect(&mut self, other: LookSet) { + *self = self.intersect(other); } - /// Return the start of this range. + /// Return a `LookSet` from the slice given as a native endian 16-bit + /// integer. /// - /// The start of a range is always less than or equal to the end of the - /// range. - pub fn start(&self) -> u8 { - self.start + /// # Panics + /// + /// This panics if `slice.len() < 2`. + #[inline] + pub fn read_repr(slice: &[u8]) -> LookSet { + let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap()); + LookSet { bits } } - /// Return the end of this range. + /// Write a `LookSet` as a native endian 16-bit integer to the beginning + /// of the slice given. /// - /// The end of a range is always greater than or equal to the start of the - /// range. - pub fn end(&self) -> u8 { - self.end + /// # Panics + /// + /// This panics if `slice.len() < 2`. 
+ #[inline] + pub fn write_repr(self, slice: &mut [u8]) { + let raw = self.bits.to_ne_bytes(); + slice[0] = raw[0]; + slice[1] = raw[1]; } } -impl fmt::Debug for ClassBytesRange { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut debug = f.debug_struct("ClassBytesRange"); - if self.start <= 0x7F { - debug.field("start", &(self.start as char)); - } else { - debug.field("start", &self.start); +impl core::fmt::Debug for LookSet { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "∅"); } - if self.end <= 0x7F { - debug.field("end", &(self.end as char)); - } else { - debug.field("end", &self.end); + for look in self.iter() { + write!(f, "{}", look.as_char())?; } - debug.finish() + Ok(()) } } -/// The high-level intermediate representation for an anchor assertion. +/// An iterator over all look-around assertions in a [`LookSet`]. /// -/// A matching anchor assertion is always zero-length. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Anchor { - /// Match the beginning of a line or the beginning of text. Specifically, - /// this matches at the starting position of the input, or at the position - /// immediately following a `\n` character. - StartLine, - /// Match the end of a line or the end of text. Specifically, - /// this matches at the end position of the input, or at the position - /// immediately preceding a `\n` character. - EndLine, - /// Match the beginning of text. Specifically, this matches at the starting - /// position of the input. - StartText, - /// Match the end of text. Specifically, this matches at the ending - /// position of the input. - EndText, +/// This iterator is created by [`LookSet::iter`]. +#[derive(Clone, Debug)] +pub struct LookSetIter { + set: LookSet, } -/// The high-level intermediate representation for a word-boundary assertion. -/// -/// A matching word boundary assertion is always zero-length. 
-#[derive(Clone, Debug, Eq, PartialEq)] -pub enum WordBoundary { - /// Match a Unicode-aware word boundary. That is, this matches a position - /// where the left adjacent character and right adjacent character - /// correspond to a word and non-word or a non-word and word character. - Unicode, - /// Match a Unicode-aware negation of a word boundary. - UnicodeNegate, - /// Match an ASCII-only word boundary. That is, this matches a position - /// where the left adjacent character and right adjacent character - /// correspond to a word and non-word or a non-word and word character. - Ascii, - /// Match an ASCII-only negation of a word boundary. - AsciiNegate, -} +impl Iterator for LookSetIter { + type Item = Look; -impl WordBoundary { - /// Returns true if and only if this word boundary assertion is negated. - pub fn is_negated(&self) -> bool { - match *self { - WordBoundary::Unicode | WordBoundary::Ascii => false, - WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, + #[inline] + fn next(&mut self) -> Option { + if self.set.is_empty() { + return None; } + // We'll never have more than u8::MAX distinct look-around assertions, + // so 'repr' will always fit into a u16. + let repr = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); + let look = Look::from_repr(1 << repr)?; + self.set = self.set.remove(look); + Some(look) } } -/// The high-level intermediate representation for a group. -/// -/// This represents one of three possible group types: -/// -/// 1. A non-capturing group (e.g., `(?:expr)`). -/// 2. A capturing group (e.g., `(expr)`). -/// 3. A named capturing group (e.g., `(?Pexpr)`). -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct Group { - /// The kind of this group. If it is a capturing group, then the kind - /// contains the capture group index (and the name, if it is a named - /// group). - pub kind: GroupKind, - /// The expression inside the capturing group, which may be empty. - pub hir: Box, -} - -/// The kind of group. 
-#[derive(Clone, Debug, Eq, PartialEq)] -pub enum GroupKind { - /// A normal unnamed capturing group. - /// - /// The value is the capture index of the group. - CaptureIndex(u32), - /// A named capturing group. - CaptureName { - /// The name of the group. - name: String, - /// The capture index of the group. - index: u32, - }, - /// A non-capturing group. - NonCapturing, +/// Given a sequence of HIR values where each value corresponds to a Unicode +/// class (or an all-ASCII byte class), return a single Unicode class +/// corresponding to the union of the classes found. +fn class_chars(hirs: &[Hir]) -> Option { + let mut cls = ClassUnicode::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(cls2); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(&cls2.to_unicode_class()?); + } + _ => return None, + }; + } + Some(Class::Unicode(cls)) } -/// The high-level intermediate representation of a repetition operator. -/// -/// A repetition operator permits the repetition of an arbitrary -/// sub-expression. -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct Repetition { - /// The kind of this repetition operator. - pub kind: RepetitionKind, - /// Whether this repetition operator is greedy or not. A greedy operator - /// will match as much as it can. A non-greedy operator will match as - /// little as it can. - /// - /// Typically, operators are greedy by default and are only non-greedy when - /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is - /// not. However, this can be inverted via the `U` "ungreedy" flag. - pub greedy: bool, - /// The expression being repeated. - pub hir: Box, +/// Given a sequence of HIR values where each value corresponds to a byte class +/// (or an all-ASCII Unicode class), return a single byte class corresponding +/// to the union of the classes found. 
+fn class_bytes(hirs: &[Hir]) -> Option { + let mut cls = ClassBytes::new(vec![]); + for hir in hirs.iter() { + match *hir.kind() { + HirKind::Class(Class::Unicode(ref cls2)) => { + cls.union(&cls2.to_byte_class()?); + } + HirKind::Class(Class::Bytes(ref cls2)) => { + cls.union(cls2); + } + _ => return None, + }; + } + Some(Class::Bytes(cls)) } -impl Repetition { - /// Returns true if and only if this repetition operator makes it possible - /// to match the empty string. - /// - /// Note that this is not defined inductively. For example, while `a*` - /// will report `true`, `()+` will not, even though `()` matches the empty - /// string and one or more occurrences of something that matches the empty - /// string will always match the empty string. In order to get the - /// inductive definition, see the corresponding method on - /// [`Hir`](struct.Hir.html). - pub fn is_match_empty(&self) -> bool { - match self.kind { - RepetitionKind::ZeroOrOne => true, - RepetitionKind::ZeroOrMore => true, - RepetitionKind::OneOrMore => false, - RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, - RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, - RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is a single `char`, return that sequence of `char`s. Otherwise return +/// None. No deduplication is done. +fn singleton_chars(hirs: &[Hir]) -> Option> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + let ch = match crate::debug::utf8_decode(literal) { + None => return None, + Some(Err(_)) => return None, + Some(Ok(ch)) => ch, + }; + if literal.len() != ch.len_utf8() { + return None; } + singletons.push(ch); } + Some(singletons) } -/// The kind of a repetition operator. 
-#[derive(Clone, Debug, Eq, PartialEq)] -pub enum RepetitionKind { - /// Matches a sub-expression zero or one times. - ZeroOrOne, - /// Matches a sub-expression zero or more times. - ZeroOrMore, - /// Matches a sub-expression one or more times. - OneOrMore, - /// Matches a sub-expression within a bounded range of times. - Range(RepetitionRange), -} - -/// The kind of a counted repetition operator. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum RepetitionRange { - /// Matches a sub-expression exactly this many times. - Exactly(u32), - /// Matches a sub-expression at least this many times. - AtLeast(u32), - /// Matches a sub-expression at least `m` times and at most `n` times. - Bounded(u32, u32), -} - -/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack -/// space but heap space proportional to the depth of the total `Hir`. -impl Drop for Hir { - fn drop(&mut self) { - use std::mem; - - match *self.kind() { - HirKind::Empty - | HirKind::Literal(_) - | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) => return, - HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, - HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, - HirKind::Concat(ref x) if x.is_empty() => return, - HirKind::Alternation(ref x) if x.is_empty() => return, - _ => {} - } - - let mut stack = vec![mem::replace(self, Hir::empty())]; - while let Some(mut expr) = stack.pop() { - match expr.kind { - HirKind::Empty - | HirKind::Literal(_) - | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) => {} - HirKind::Group(ref mut x) => { - stack.push(mem::replace(&mut x.hir, Hir::empty())); - } - HirKind::Repetition(ref mut x) => { - stack.push(mem::replace(&mut x.hir, Hir::empty())); - } - HirKind::Concat(ref mut x) => { - stack.extend(x.drain(..)); - } - HirKind::Alternation(ref mut x) => { - stack.extend(x.drain(..)); - } - } +/// Given a sequence of HIR values where each value corresponds to a literal +/// that is 
a single byte, return that sequence of bytes. Otherwise return +/// None. No deduplication is done. +fn singleton_bytes(hirs: &[Hir]) -> Option> { + let mut singletons = vec![]; + for hir in hirs.iter() { + let literal = match *hir.kind() { + HirKind::Literal(Literal(ref bytes)) => bytes, + _ => return None, + }; + if literal.len() != 1 { + return None; } + singletons.push(literal[0]); } + Some(singletons) } -/// A type that documents various attributes of an HIR expression. +/// Looks for a common prefix in the list of alternation branches given. If one +/// is found, then an equivalent but (hopefully) simplified Hir is returned. +/// Otherwise, the original given list of branches is returned unmodified. /// -/// These attributes are typically defined inductively on the HIR. -#[derive(Clone, Debug, Eq, PartialEq)] -struct HirInfo { - /// Represent yes/no questions by a bitfield to conserve space, since - /// this is included in every HIR expression. - /// - /// If more attributes need to be added, it is OK to increase the size of - /// this as appropriate. - bools: u16, -} - -// A simple macro for defining bitfield accessors/mutators. -macro_rules! define_bool { - ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { - fn $is_fn_name(&self) -> bool { - self.bools & (0b1 << $bit) > 0 +/// This is not quite as good as it could be. Right now, it requires that +/// all branches are 'Concat' expressions. It also doesn't do well with +/// literals. For example, given 'foofoo|foobar', it will not refactor it to +/// 'foo(?:foo|bar)' because literals are flattened into their own special +/// concatenation. (One wonders if perhaps 'Literal' should be a single atom +/// instead of a string of bytes because of this. Otherwise, handling the +/// current representation in this routine will be pretty gnarly. Sigh.) 
+fn lift_common_prefix(hirs: Vec) -> Result> { + if hirs.len() <= 1 { + return Err(hirs); + } + let mut prefix = match hirs[0].kind() { + HirKind::Concat(ref xs) => &**xs, + _ => return Err(hirs), + }; + if prefix.is_empty() { + return Err(hirs); + } + for h in hirs.iter().skip(1) { + let concat = match h.kind() { + HirKind::Concat(ref xs) => xs, + _ => return Err(hirs), + }; + let common_len = prefix + .iter() + .zip(concat.iter()) + .take_while(|(x, y)| x == y) + .count(); + prefix = &prefix[..common_len]; + if prefix.is_empty() { + return Err(hirs); } - - fn $set_fn_name(&mut self, yes: bool) { - if yes { - self.bools |= 1 << $bit; - } else { - self.bools &= !(1 << $bit); - } + } + let len = prefix.len(); + assert_ne!(0, len); + let mut prefix_concat = vec![]; + let mut suffix_alts = vec![]; + for h in hirs { + let mut concat = match h.into_kind() { + HirKind::Concat(xs) => xs, + // We required all sub-expressions to be + // concats above, so we're only here if we + // have a concat. 
+ _ => unreachable!(), + }; + suffix_alts.push(Hir::concat(concat.split_off(len))); + if prefix_concat.is_empty() { + prefix_concat = concat; } - }; -} - -impl HirInfo { - fn new() -> HirInfo { - HirInfo { bools: 0 } - } - - define_bool!(0, is_always_utf8, set_always_utf8); - define_bool!(1, is_all_assertions, set_all_assertions); - define_bool!(2, is_anchored_start, set_anchored_start); - define_bool!(3, is_anchored_end, set_anchored_end); - define_bool!(4, is_line_anchored_start, set_line_anchored_start); - define_bool!(5, is_line_anchored_end, set_line_anchored_end); - define_bool!(6, is_any_anchored_start, set_any_anchored_start); - define_bool!(7, is_any_anchored_end, set_any_anchored_end); - define_bool!(8, is_match_empty, set_match_empty); - define_bool!(9, is_literal, set_literal); - define_bool!(10, is_alternation_literal, set_alternation_literal); + } + let mut concat = prefix_concat; + concat.push(Hir::alternation(suffix_alts)); + Ok(Hir::concat(concat)) } #[cfg(test)] @@ -2244,12 +3604,6 @@ mod tests { assert_eq!(expected, bsymdifference(&cls1, &cls2)); } - #[test] - #[should_panic] - fn hir_byte_literal_non_ascii() { - Hir::literal(Literal::Byte(b'a')); - } - // We use a thread with an explicit stack size to test that our destructor // for Hir can handle arbitrarily sized expressions in constant stack // space. 
In case we run on a platform without threads (WASM?), we limit @@ -2262,26 +3616,28 @@ mod tests { let run = || { let mut expr = Hir::empty(); for _ in 0..100 { - expr = Hir::group(Group { - kind: GroupKind::NonCapturing, - hir: Box::new(expr), + expr = Hir::capture(Capture { + index: 1, + name: None, + sub: Box::new(expr), }); expr = Hir::repetition(Repetition { - kind: RepetitionKind::ZeroOrOne, + min: 0, + max: Some(1), greedy: true, - hir: Box::new(expr), + sub: Box::new(expr), }); expr = Hir { kind: HirKind::Concat(vec![expr]), - info: HirInfo::new(), + props: Properties::empty(), }; expr = Hir { kind: HirKind::Alternation(vec![expr]), - info: HirInfo::new(), + props: Properties::empty(), }; } - assert!(!expr.kind.is_empty()); + assert!(!matches!(*expr.kind(), HirKind::Empty)); }; // We run our test on a thread with a small stack size so we can @@ -2296,4 +3652,31 @@ mod tests { .join() .unwrap(); } + + #[test] + fn look_set_iter() { + let set = LookSet::empty(); + assert_eq!(0, set.iter().count()); + + let set = LookSet::full(); + assert_eq!(10, set.iter().count()); + + let set = + LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); + assert_eq!(2, set.iter().count()); + + let set = LookSet::empty().insert(Look::StartLF); + assert_eq!(1, set.iter().count()); + + let set = LookSet::empty().insert(Look::WordAsciiNegate); + assert_eq!(1, set.iter().count()); + } + + #[test] + fn look_set_debug() { + let res = format!("{:?}", LookSet::empty()); + assert_eq!("∅", res); + let res = format!("{:?}", LookSet::full()); + assert_eq!("Az^$rRbB𝛃𝚩", res); + } } diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index b71f3897cf..fcb7cd252b 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -2,11 +2,16 @@ This module provides a regular expression printer for `Hir`. 
*/ -use std::fmt; +use core::fmt; -use crate::hir::visitor::{self, Visitor}; -use crate::hir::{self, Hir, HirKind}; -use crate::is_meta_character; +use crate::{ + hir::{ + self, + visitor::{self, Visitor}, + Hir, HirKind, + }, + is_meta_character, +}; /// A builder for constructing a printer. /// @@ -84,21 +89,54 @@ impl Visitor for Writer { fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { match *hir.kind() { - HirKind::Empty - | HirKind::Repetition(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => {} - HirKind::Literal(hir::Literal::Unicode(c)) => { - self.write_literal_char(c)?; - } - HirKind::Literal(hir::Literal::Byte(b)) => { - self.write_literal_byte(b)?; + // Empty is represented by nothing in the concrete syntax, and + // repetition operators are strictly suffix oriented. + HirKind::Empty | HirKind::Repetition(_) => {} + HirKind::Literal(hir::Literal(ref bytes)) => { + // See the comment on the 'Concat' and 'Alternation' case below + // for why we put parens here. Literals are, conceptually, + // a special case of concatenation where each element is a + // character. The HIR flattens this into a Box<[u8]>, but we + // still need to treat it like a concatenation for correct + // printing. As a special case, we don't write parens if there + // is only one character. One character means there is no + // concat so we don't need parens. Adding parens would still be + // correct, but we drop them here because it tends to create + // rather noisy regexes even in simple cases. 
+ let result = core::str::from_utf8(bytes); + let len = result.map_or(bytes.len(), |s| s.chars().count()); + if len > 1 { + self.wtr.write_str(r"(?:")?; + } + match result { + Ok(string) => { + for c in string.chars() { + self.write_literal_char(c)?; + } + } + Err(_) => { + for &b in bytes.iter() { + self.write_literal_byte(b)?; + } + } + } + if len > 1 { + self.wtr.write_str(r")")?; + } } HirKind::Class(hir::Class::Unicode(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } self.wtr.write_str("[")?; for range in cls.iter() { if range.start() == range.end() { self.write_literal_char(range.start())?; + } else if u32::from(range.start()) + 1 + == u32::from(range.end()) + { + self.write_literal_char(range.start())?; + self.write_literal_char(range.end())?; } else { self.write_literal_char(range.start())?; self.wtr.write_str("-")?; @@ -108,10 +146,16 @@ impl Visitor for Writer { self.wtr.write_str("]")?; } HirKind::Class(hir::Class::Bytes(ref cls)) => { + if cls.ranges().is_empty() { + return self.wtr.write_str("[a&&b]"); + } self.wtr.write_str("(?-u:[")?; for range in cls.iter() { if range.start() == range.end() { self.write_literal_class_byte(range.start())?; + } else if range.start() + 1 == range.end() { + self.write_literal_class_byte(range.start())?; + self.write_literal_class_byte(range.end())?; } else { self.write_literal_class_byte(range.start())?; self.wtr.write_str("-")?; @@ -120,41 +164,60 @@ impl Visitor for Writer { } self.wtr.write_str("])")?; } - HirKind::Anchor(hir::Anchor::StartLine) => { - self.wtr.write_str("(?m:^)")?; - } - HirKind::Anchor(hir::Anchor::EndLine) => { - self.wtr.write_str("(?m:$)")?; - } - HirKind::Anchor(hir::Anchor::StartText) => { - self.wtr.write_str(r"\A")?; - } - HirKind::Anchor(hir::Anchor::EndText) => { - self.wtr.write_str(r"\z")?; - } - HirKind::WordBoundary(hir::WordBoundary::Unicode) => { - self.wtr.write_str(r"\b")?; - } - HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => { - 
self.wtr.write_str(r"\B")?; - } - HirKind::WordBoundary(hir::WordBoundary::Ascii) => { - self.wtr.write_str(r"(?-u:\b)")?; - } - HirKind::WordBoundary(hir::WordBoundary::AsciiNegate) => { - self.wtr.write_str(r"(?-u:\B)")?; - } - HirKind::Group(ref x) => match x.kind { - hir::GroupKind::CaptureIndex(_) => { - self.wtr.write_str("(")?; + HirKind::Look(ref look) => match *look { + hir::Look::Start => { + self.wtr.write_str(r"\A")?; + } + hir::Look::End => { + self.wtr.write_str(r"\z")?; + } + hir::Look::StartLF => { + self.wtr.write_str("(?m:^)")?; + } + hir::Look::EndLF => { + self.wtr.write_str("(?m:$)")?; + } + hir::Look::StartCRLF => { + self.wtr.write_str("(?mR:^)")?; } - hir::GroupKind::CaptureName { ref name, .. } => { - write!(self.wtr, "(?P<{}>", name)?; + hir::Look::EndCRLF => { + self.wtr.write_str("(?mR:$)")?; } - hir::GroupKind::NonCapturing => { - self.wtr.write_str("(?:")?; + hir::Look::WordAscii => { + self.wtr.write_str(r"(?-u:\b)")?; + } + hir::Look::WordAsciiNegate => { + self.wtr.write_str(r"(?-u:\B)")?; + } + hir::Look::WordUnicode => { + self.wtr.write_str(r"\b")?; + } + hir::Look::WordUnicodeNegate => { + self.wtr.write_str(r"\B")?; } }, + HirKind::Capture(hir::Capture { ref name, .. }) => { + self.wtr.write_str("(")?; + if let Some(ref name) = *name { + write!(self.wtr, "?P<{}>", name)?; + } + } + // Why do this? Wrapping concats and alts in non-capturing groups + // is not *always* necessary, but is sometimes necessary. For + // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' + // and not 'ab|c'. The former is clearly the intended meaning, but + // the latter is actually 'alt(concat(a, b), c)'. + // + // It would be possible to only group these things in cases where + // it's strictly necessary, but it requires knowing the parent + // expression. And since this technique is simpler and always + // correct, we take this route. More to the point, it is a non-goal + // of an HIR printer to show a nice easy-to-read regex. 
Indeed, + // its construction forbids it from doing so. Therefore, inserting + // extra groups where they aren't necessary is perfectly okay. + HirKind::Concat(_) | HirKind::Alternation(_) => { + self.wtr.write_str(r"(?:")?; + } } Ok(()) } @@ -165,39 +228,42 @@ impl Visitor for Writer { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) - | HirKind::Anchor(_) - | HirKind::WordBoundary(_) - | HirKind::Concat(_) - | HirKind::Alternation(_) => {} + | HirKind::Look(_) => {} HirKind::Repetition(ref x) => { - match x.kind { - hir::RepetitionKind::ZeroOrOne => { + match (x.min, x.max) { + (0, Some(1)) => { self.wtr.write_str("?")?; } - hir::RepetitionKind::ZeroOrMore => { + (0, None) => { self.wtr.write_str("*")?; } - hir::RepetitionKind::OneOrMore => { + (1, None) => { self.wtr.write_str("+")?; } - hir::RepetitionKind::Range(ref x) => match *x { - hir::RepetitionRange::Exactly(m) => { - write!(self.wtr, "{{{}}}", m)?; - } - hir::RepetitionRange::AtLeast(m) => { - write!(self.wtr, "{{{},}}", m)?; - } - hir::RepetitionRange::Bounded(m, n) => { - write!(self.wtr, "{{{},{}}}", m, n)?; - } - }, + (1, Some(1)) => { + // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. + return Ok(()); + } + (m, None) => { + write!(self.wtr, "{{{},}}", m)?; + } + (m, Some(n)) if m == n => { + write!(self.wtr, "{{{}}}", m)?; + // a{m} and a{m}? are always exactly equivalent. 
+ return Ok(()); + } + (m, Some(n)) => { + write!(self.wtr, "{{{},{}}}", m, n)?; + } } if !x.greedy { self.wtr.write_str("?")?; } } - HirKind::Group(_) => { - self.wtr.write_str(")")?; + HirKind::Capture(_) + | HirKind::Concat(_) + | HirKind::Alternation(_) => { + self.wtr.write_str(r")")?; } } Ok(()) @@ -217,18 +283,16 @@ impl Writer { } fn write_literal_byte(&mut self, b: u8) -> fmt::Result { - let c = b as char; - if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { - self.write_literal_char(c) + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "(?-u:\\x{:02X})", b) } } fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { - let c = b as char; - if c <= 0x7F as char && !c.is_control() && !c.is_whitespace() { - self.write_literal_char(c) + if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { + self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "\\x{:02X}", b) } @@ -237,15 +301,21 @@ impl Writer { #[cfg(test)] mod tests { - use super::Printer; + use alloc::{ + boxed::Box, + string::{String, ToString}, + }; + use crate::ParserBuilder; + use super::*; + fn roundtrip(given: &str, expected: &str) { roundtrip_with(|b| b, given, expected); } fn roundtrip_bytes(given: &str, expected: &str) { - roundtrip_with(|b| b.allow_invalid_utf8(true), given, expected); + roundtrip_with(|b| b.utf8(false), given, expected); } fn roundtrip_with(mut f: F, given: &str, expected: &str) @@ -277,28 +347,35 @@ mod tests { #[test] fn print_class() { - roundtrip(r"[a]", r"[a]"); + roundtrip(r"[a]", r"a"); + roundtrip(r"[ab]", r"[ab]"); roundtrip(r"[a-z]", r"[a-z]"); roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); - roundtrip(r"[^\x01-\u{10FFFF}]", "[\u{0}]"); - roundtrip(r"[-]", r"[\-]"); + roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}"); + roundtrip(r"[-]", r"\-"); roundtrip(r"[☃-⛄]", r"[☃-⛄]"); - roundtrip(r"(?-u)[a]", r"(?-u:[a])"); + 
roundtrip(r"(?-u)[a]", r"a"); + roundtrip(r"(?-u)[ab]", r"(?-u:[ab])"); roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); // The following test that the printer escapes meta characters // in character classes. - roundtrip(r"[\[]", r"[\[]"); + roundtrip(r"[\[]", r"\["); roundtrip(r"[Z-_]", r"[Z-_]"); roundtrip(r"[Z-_--Z]", r"[\[-_]"); // The following test that the printer escapes meta characters // in byte oriented character classes. - roundtrip_bytes(r"(?-u)[\[]", r"(?-u:[\[])"); + roundtrip_bytes(r"(?-u)[\[]", r"\["); roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); + + // This tests that an empty character class is correctly roundtripped. + #[cfg(feature = "unicode-gencat")] + roundtrip(r"\P{any}", r"[a&&b]"); + roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]"); } #[test] @@ -331,37 +408,170 @@ mod tests { roundtrip("a+?", "a+?"); roundtrip("(?U)a+", "a+?"); - roundtrip("a{1}", "a{1}"); - roundtrip("a{1,}", "a{1,}"); + roundtrip("a{1}", "a"); + roundtrip("a{2}", "a{2}"); + roundtrip("a{1,}", "a+"); roundtrip("a{1,5}", "a{1,5}"); - roundtrip("a{1}?", "a{1}?"); - roundtrip("a{1,}?", "a{1,}?"); + roundtrip("a{1}?", "a"); + roundtrip("a{2}?", "a{2}"); + roundtrip("a{1,}?", "a+?"); roundtrip("a{1,5}?", "a{1,5}?"); - roundtrip("(?U)a{1}", "a{1}?"); - roundtrip("(?U)a{1,}", "a{1,}?"); + roundtrip("(?U)a{1}", "a"); + roundtrip("(?U)a{2}", "a{2}"); + roundtrip("(?U)a{1,}", "a+?"); roundtrip("(?U)a{1,5}", "a{1,5}?"); + + // Test that various zero-length repetitions always translate to an + // empty regex. This is more a property of HIR's smart constructors + // than the printer though. 
+ roundtrip("a{0}", ""); + roundtrip("(?:ab){0}", ""); + #[cfg(feature = "unicode-gencat")] + { + roundtrip(r"\p{any}{0}", ""); + roundtrip(r"\P{any}{0}", ""); + } } #[test] fn print_group() { roundtrip("()", "()"); roundtrip("(?P)", "(?P)"); - roundtrip("(?:)", "(?:)"); + roundtrip("(?:)", ""); roundtrip("(a)", "(a)"); roundtrip("(?Pa)", "(?Pa)"); - roundtrip("(?:a)", "(?:a)"); + roundtrip("(?:a)", "a"); roundtrip("((((a))))", "((((a))))"); } #[test] fn print_alternation() { - roundtrip("|", "|"); - roundtrip("||", "||"); + roundtrip("|", "(?:|)"); + roundtrip("||", "(?:||)"); + + roundtrip("a|b", "[ab]"); + roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); + roundtrip("a|b|c", "[a-c]"); + roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))"); + roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); + } - roundtrip("a|b", "a|b"); - roundtrip("a|b|c", "a|b|c"); - roundtrip("foo|bar|quux", "foo|bar|quux"); + // This is a regression test that stresses a peculiarity of how the HIR + // is both constructed and printed. Namely, it is legal for a repetition + // to directly contain a concatenation. This particular construct isn't + // really possible to build from the concrete syntax directly, since you'd + // be forced to put the concatenation into (at least) a non-capturing + // group. Concurrently, the printer doesn't consider this case and just + // kind of naively prints the child expression and tacks on the repetition + // operator. + // + // As a result, if you attached '+' to a 'concat(a, b)', the printer gives + // you 'ab+', but clearly it really should be '(?:ab)+'. + // + // This bug isn't easy to surface because most ways of building an HIR + // come directly from the concrete syntax, and as mentioned above, it just + // isn't possible to build this kind of HIR from the concrete syntax. + // Nevertheless, this is definitely a bug. 
+ // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("x".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::literal("ab".as_bytes())), + }), + Hir::literal("y".as_bytes()), + ]); + assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A(?:\A\z)+\z)", expr.to_string()); + } + + // Just like regression_repetition_concat, but with the repetition using + // an alternation as a child expression instead. + // + // See: https://github.com/rust-lang/regex/issues/731 + #[test] + fn regression_repetition_alternation() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("ab".as_bytes()), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::alternation(alloc::vec![ + Hir::literal("cd".as_bytes()), + Hir::literal("ef".as_bytes()), + ])), + }), + Hir::literal("gh".as_bytes()), + ]); + assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::repetition(hir::Repetition { + min: 1, + max: None, + greedy: true, + sub: Box::new(Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ])), + }), + Hir::look(hir::Look::End), + ]); + assert_eq!(r"(?:\A(?:\A|\z)+\z)", expr.to_string()); + } + + // This regression test is very similar in flavor to + // regression_repetition_concat in that the root of the issue lies in a + // peculiarity of how the HIR is represented and how the printer writes it + // out. 
Like the other regression, this one is also rooted in the fact that + // you can't produce the peculiar HIR from the concrete syntax. Namely, you + // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally + // be in (at least) a non-capturing group. Why? Because the '|' has very + // low precedence (lower that concatenation), and so something like 'ab|c' + // is actually 'alt(ab, c)'. + // + // See: https://github.com/rust-lang/regex/issues/516 + #[test] + fn regression_alternation_concat() { + let expr = Hir::concat(alloc::vec![ + Hir::literal("ab".as_bytes()), + Hir::alternation(alloc::vec![ + Hir::literal("mn".as_bytes()), + Hir::literal("xy".as_bytes()), + ]), + ]); + assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string()); + + let expr = Hir::concat(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::alternation(alloc::vec![ + Hir::look(hir::Look::Start), + Hir::look(hir::Look::End), + ]), + ]); + assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string()); } } diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 890e1608b3..b22861fc7c 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -2,19 +2,23 @@ Defines a translator that converts an `Ast` to an `Hir`. */ -use std::cell::{Cell, RefCell}; -use std::result; +use core::cell::{Cell, RefCell}; -use crate::ast::{self, Ast, Span, Visitor}; -use crate::hir::{self, Error, ErrorKind, Hir}; -use crate::unicode::{self, ClassQuery}; +use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; -type Result = result::Result; +use crate::{ + ast::{self, Ast, Span, Visitor}, + either::Either, + hir::{self, Error, ErrorKind, Hir, HirKind}, + unicode::{self, ClassQuery}, +}; + +type Result = core::result::Result; /// A builder for constructing an AST->HIR translator. 
#[derive(Clone, Debug)] pub struct TranslatorBuilder { - allow_invalid_utf8: bool, + utf8: bool, flags: Flags, } @@ -27,10 +31,7 @@ impl Default for TranslatorBuilder { impl TranslatorBuilder { /// Create a new translator builder with a default configuration. pub fn new() -> TranslatorBuilder { - TranslatorBuilder { - allow_invalid_utf8: false, - flags: Flags::default(), - } + TranslatorBuilder { utf8: true, flags: Flags::default() } } /// Build a translator using the current configuration. @@ -38,23 +39,27 @@ impl TranslatorBuilder { Translator { stack: RefCell::new(vec![]), flags: Cell::new(self.flags), - allow_invalid_utf8: self.allow_invalid_utf8, + utf8: self.utf8, } } - /// When enabled, translation will permit the construction of a regular + /// When disabled, translation will permit the construction of a regular /// expression that may match invalid UTF-8. /// - /// When disabled (the default), the translator is guaranteed to produce - /// an expression that will only ever match valid UTF-8 (otherwise, the - /// translator will return an error). + /// When enabled (the default), the translator is guaranteed to produce an + /// expression that, for non-empty matches, will only ever produce spans + /// that are entirely valid UTF-8 (otherwise, the translator will return an + /// error). /// - /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII - /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause - /// the parser to return an error. Namely, a negated ASCII word boundary - /// can result in matching positions that aren't valid UTF-8 boundaries. - pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { - self.allow_invalid_utf8 = yes; + /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even + /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete + /// syntax) will be allowed even though they can produce matches that split + /// a UTF-8 encoded codepoint.
This only applies to zero-width or "empty" + /// matches, and it is expected that the regex engine itself must handle + /// these cases if necessary (perhaps by suppressing any zero-width matches + /// that split a codepoint). + pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.utf8 = yes; self } @@ -80,6 +85,12 @@ impl TranslatorBuilder { self } + /// Enable or disable the CRLF mode flag (`R`) by default. + pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { + self.flags.crlf = if yes { Some(true) } else { None }; + self + } + /// Enable or disable the "swap greed" flag (`U`) by default. pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.swap_greed = if yes { Some(true) } else { None }; @@ -100,7 +111,7 @@ impl TranslatorBuilder { /// many abstract syntax trees. /// /// A `Translator` can be configured in more detail via a -/// [`TranslatorBuilder`](struct.TranslatorBuilder.html). +/// [`TranslatorBuilder`]. #[derive(Clone, Debug)] pub struct Translator { /// Our call stack, but on the heap. @@ -108,7 +119,7 @@ pub struct Translator { /// The current flag settings. flags: Cell, /// Whether we're allowed to produce HIR that can match arbitrary bytes. - allow_invalid_utf8: bool, + utf8: bool, } impl Translator { @@ -143,6 +154,12 @@ enum HirFrame { /// case in the Ast. They get popped after an inductive (i.e., recursive) /// step is complete. Expr(Hir), + /// A literal that is being constructed, character by character, from the + /// AST. We need this because the AST gives each individual character its + /// own node. So as we see characters, we peek at the top-most HirFrame. + /// If it's a literal, then we add to it. Otherwise, we push a new literal. + /// When it comes time to pop it, we convert it to an Hir via Hir::literal. + Literal(Vec), /// A Unicode character class. This frame is mutated as we descend into /// the Ast of a character class (which is itself its own mini recursive /// structure). 
@@ -152,10 +169,17 @@ enum HirFrame { /// recursive structure). /// /// Byte character classes are created when Unicode mode (`u`) is disabled. - /// If `allow_invalid_utf8` is disabled (the default), then a byte - /// character is only permitted to match ASCII text. + /// If `utf8` is enabled (the default), then a byte character is only + /// permitted to match ASCII text. ClassBytes(hir::ClassBytes), - /// This is pushed on to the stack upon first seeing any kind of group, + /// This is pushed whenever a repetition is observed. After visiting every + /// sub-expression in the repetition, the translator's stack is expected to + /// have this sentinel at the top. + /// + /// This sentinel only exists to stop other things (like flattening + /// literals) from reaching across repetition operators. + Repetition, + /// This is pushed on to the stack upon first seeing any kind of capture, /// indicated by parentheses (including non-capturing groups). It is popped /// upon leaving a group. Group { @@ -181,6 +205,14 @@ enum HirFrame { /// every sub-expression in the alternation, the translator's stack is /// popped until it sees an Alternation frame. Alternation, + /// This is pushed immediately before each sub-expression in an + /// alternation. This separates the branches of an alternation on the + /// stack and prevents literal flattening from reaching across alternation + /// branches. + /// + /// It is popped after each expression in a branch until an 'Alternation' + /// frame is observed when doing a post visit on an alternation. + AlternationBranch, } impl HirFrame { @@ -188,6 +220,7 @@ impl HirFrame { fn unwrap_expr(self) -> Hir { match self { HirFrame::Expr(expr) => expr, + HirFrame::Literal(lit) => Hir::literal(lit), _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), } } @@ -218,6 +251,20 @@ impl HirFrame { } } + /// Assert that the current stack frame is a repetition sentinel. If it + /// isn't, then panic. 
+ fn unwrap_repetition(self) { + match self { + HirFrame::Repetition => {} + _ => { + panic!( + "tried to unwrap repetition from HirFrame, got: {:?}", + self + ) + } + } + } + /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). @@ -229,6 +276,20 @@ impl HirFrame { } } } + + /// Assert that the current stack frame is an alternation pipe sentinel. If + /// it isn't, then panic. + fn unwrap_alternation_pipe(self) { + match self { + HirFrame::AlternationBranch => {} + _ => { + panic!( + "tried to unwrap alt pipe from HirFrame, got: {:?}", + self + ) + } + } + } } impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { @@ -252,6 +313,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::ClassBytes(cls)); } } + Ast::Repetition(_) => self.push(HirFrame::Repetition), Ast::Group(ref x) => { let old_flags = x .flags() @@ -266,6 +328,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ast::Alternation(ref x) if x.asts.is_empty() => {} Ast::Alternation(_) => { self.push(HirFrame::Alternation); + self.push(HirFrame::AlternationBranch); } _ => {} } @@ -291,7 +354,20 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { self.push(HirFrame::Expr(Hir::empty())); } Ast::Literal(ref x) => { - self.push(HirFrame::Expr(self.hir_literal(x)?)); + match self.ast_literal_to_scalar(x)? { + Either::Right(byte) => self.push_byte(byte), + Either::Left(ch) => { + if !self.flags().unicode() && ch.len_utf8() > 1 { + return Err(self + .error(x.span, ErrorKind::UnicodeNotAllowed)); + } + match self.case_fold_char(x.span, ch)? 
{ + None => self.push_char(ch), + Some(expr) => self.push(HirFrame::Expr(expr)), + } + } + } + // self.push(HirFrame::Expr(self.hir_literal(x)?)); } Ast::Dot(span) => { self.push(HirFrame::Expr(self.hir_dot(span)?)); @@ -305,7 +381,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { let hcls = hir::Class::Unicode(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } else { - let cls = self.hir_perl_byte_class(x); + let cls = self.hir_perl_byte_class(x)?; let hcls = hir::Class::Bytes(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } @@ -322,12 +398,6 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast.negated, &mut cls, )?; - if cls.ranges().is_empty() { - return Err(self.error( - ast.span, - ErrorKind::EmptyClassNotAllowed, - )); - } let expr = Hir::class(hir::Class::Unicode(cls)); self.push(HirFrame::Expr(expr)); } else { @@ -337,31 +407,25 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { ast.negated, &mut cls, )?; - if cls.ranges().is_empty() { - return Err(self.error( - ast.span, - ErrorKind::EmptyClassNotAllowed, - )); - } - let expr = Hir::class(hir::Class::Bytes(cls)); self.push(HirFrame::Expr(expr)); } } Ast::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); + self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } Ast::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); - self.push(HirFrame::Expr(self.hir_group(x, expr))); + self.push(HirFrame::Expr(self.hir_capture(x, expr))); } Ast::Concat(_) => { let mut exprs = vec![]; - while let Some(HirFrame::Expr(expr)) = self.pop() { - if !expr.kind().is_empty() { + while let Some(expr) = self.pop_concat_expr() { + if !matches!(*expr.kind(), HirKind::Empty) { exprs.push(expr); } } @@ -370,7 +434,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { } Ast::Alternation(_) => { let mut exprs = vec![]; - while let Some(HirFrame::Expr(expr)) = self.pop() { + 
while let Some(expr) = self.pop_alt_expr() { + self.pop().unwrap().unwrap_alternation_pipe(); exprs.push(expr); } exprs.reverse(); @@ -380,6 +445,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { Ok(()) } + fn visit_alternation_in(&mut self) -> Result<()> { + self.push(HirFrame::AlternationBranch); + Ok(()) + } + fn visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, @@ -458,7 +528,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { - let xcls = self.hir_perl_byte_class(x); + let xcls = self.hir_perl_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); @@ -602,11 +672,103 @@ impl<'t, 'p> TranslatorI<'t, 'p> { self.trans().stack.borrow_mut().push(frame); } + /// Push the given literal char on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the char + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given char is pushed to the top of the stack. + fn push_char(&self, ch: char) { + let mut buf = [0; 4]; + let bytes = ch.encode_utf8(&mut buf).as_bytes(); + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.extend_from_slice(bytes); + } else { + stack.push(HirFrame::Literal(bytes.to_vec())); + } + } + + /// Push the given literal byte on to the call stack. + /// + /// If the top-most element of the stack is a literal, then the byte + /// is appended to the end of that literal. Otherwise, a new literal + /// containing just the given byte is pushed to the top of the stack. + fn push_byte(&self, byte: u8) { + let mut stack = self.trans().stack.borrow_mut(); + if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { + literal.push(byte); + } else { + stack.push(HirFrame::Literal(vec![byte])); + } + } + /// Pop the top of the call stack. 
If the call stack is empty, return None. fn pop(&self) -> Option<HirFrame> { self.trans().stack.borrow_mut().pop() } + /// Pop an HIR expression from the top of the stack for a concatenation. + /// + /// This returns None if the stack is empty or when a concat frame is seen. + /// Otherwise, it panics if it could not find an HIR expression. + fn pop_concat_expr(&self) -> Option<Hir> { + let frame = self.pop()?; + match frame { + HirFrame::Concat => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or concat, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or concat, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or concat, got repetition") + } + HirFrame::Group { .. } => { + unreachable!("expected expr or concat, got group") + } + HirFrame::Alternation => { + unreachable!("expected expr or concat, got alt marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or concat, got alt branch marker") + } + } + } + + /// Pop an HIR expression from the top of the stack for an alternation. + /// + /// This returns None if the stack is empty or when an alternation frame is + /// seen. Otherwise, it panics if it could not find an HIR expression. + fn pop_alt_expr(&self) -> Option<Hir> { + let frame = self.pop()?; + match frame { + HirFrame::Alternation => None, + HirFrame::Expr(expr) => Some(expr), + HirFrame::Literal(lit) => Some(Hir::literal(lit)), + HirFrame::ClassUnicode(_) => { + unreachable!("expected expr or alt, got Unicode class") + } + HirFrame::ClassBytes(_) => { + unreachable!("expected expr or alt, got byte class") + } + HirFrame::Repetition => { + unreachable!("expected expr or alt, got repetition") + } + HirFrame::Group { .. 
} => { + unreachable!("expected expr or alt, got group") + } + HirFrame::Concat => { + unreachable!("expected expr or alt, got concat marker") + } + HirFrame::AlternationBranch => { + unreachable!("expected expr or alt, got alt branch marker") + } + } + } + /// Create a new error with the given span and error type. fn error(&self, span: Span, kind: ErrorKind) -> Error { Error { kind, pattern: self.pattern.to_string(), span } @@ -627,63 +789,48 @@ impl<'t, 'p> TranslatorI<'t, 'p> { old_flags } - fn hir_literal(&self, lit: &ast::Literal) -> Result { - let ch = match self.literal_to_char(lit)? { - byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)), - hir::Literal::Unicode(ch) => ch, - }; - if self.flags().case_insensitive() { - self.hir_from_char_case_insensitive(lit.span, ch) - } else { - self.hir_from_char(lit.span, ch) - } - } - /// Convert an Ast literal to its scalar representation. /// /// When Unicode mode is enabled, then this always succeeds and returns a /// `char` (Unicode scalar value). /// - /// When Unicode mode is disabled, then a raw byte is returned. If that - /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns - /// an error. - fn literal_to_char(&self, lit: &ast::Literal) -> Result { + /// When Unicode mode is disabled, then a `char` will still be returned + /// whenever possible. A byte is returned only when invalid UTF-8 is + /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte + /// will result in an error when invalid UTF-8 is not allowed. 
+ fn ast_literal_to_scalar( + &self, + lit: &ast::Literal, + ) -> Result<Either<char, u8>> { if self.flags().unicode() { - return Ok(hir::Literal::Unicode(lit.c)); + return Ok(Either::Left(lit.c)); } let byte = match lit.byte() { - None => return Ok(hir::Literal::Unicode(lit.c)), + None => return Ok(Either::Left(lit.c)), Some(byte) => byte, }; if byte <= 0x7F { - return Ok(hir::Literal::Unicode(byte as char)); + return Ok(Either::Left(char::try_from(byte).unwrap())); } - if !self.trans().allow_invalid_utf8 { + if self.trans().utf8 { return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); } - Ok(hir::Literal::Byte(byte)) + Ok(Either::Right(byte)) } - fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> { - if !self.flags().unicode() && c.len_utf8() > 1 { - return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); + fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> { + if !self.flags().case_insensitive() { + return Ok(None); } - Ok(Hir::literal(hir::Literal::Unicode(c))) - } - - fn hir_from_char_case_insensitive( - &self, - span: Span, - c: char, - ) -> Result<Hir> { if self.flags().unicode() { // If case folding won't do anything, then don't bother trying. - let map = - unicode::contains_simple_case_mapping(c, c).map_err(|_| { + let map = unicode::SimpleCaseFolder::new() + .map(|f| f.overlaps(c, c)) + .map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; if !map { - return self.hir_from_char(span, c); + return Ok(None); } let mut cls = hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( @@ -692,7 +839,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { cls.try_case_fold_simple().map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; - Ok(Hir::class(hir::Class::Unicode(cls))) + Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { if c.len_utf8() > 1 { return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); @@ -700,109 +847,104 @@ impl<'t, 'p> TranslatorI<'t, 'p> { // If case folding won't do anything, then don't bother trying. 
match c { 'A'..='Z' | 'a'..='z' => {} - _ => return self.hir_from_char(span, c), + _ => return Ok(None), } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( - c as u8, c as u8, + // OK because 'c.len_utf8() == 1' which in turn implies + // that 'c' is ASCII. + // + // MSRV(1.59): Use 'u8::try_from(c)' instead. + u8::try_from(u32::from(c)).unwrap(), + u8::try_from(u32::from(c)).unwrap(), )]); cls.case_fold_simple(); - Ok(Hir::class(hir::Class::Bytes(cls))) + Ok(Some(Hir::class(hir::Class::Bytes(cls)))) } } fn hir_dot(&self, span: Span) -> Result { - let unicode = self.flags().unicode(); - if !unicode && !self.trans().allow_invalid_utf8 { + if !self.flags().unicode() && self.trans().utf8 { return Err(self.error(span, ErrorKind::InvalidUtf8)); } - Ok(if self.flags().dot_matches_new_line() { - Hir::any(!unicode) - } else { - Hir::dot(!unicode) - }) + Ok(Hir::dot(self.flags().dot())) } fn hir_assertion(&self, asst: &ast::Assertion) -> Result { let unicode = self.flags().unicode(); let multi_line = self.flags().multi_line(); + let crlf = self.flags().crlf(); Ok(match asst.kind { - ast::AssertionKind::StartLine => Hir::anchor(if multi_line { - hir::Anchor::StartLine + ast::AssertionKind::StartLine => Hir::look(if multi_line { + if crlf { + hir::Look::StartCRLF + } else { + hir::Look::StartLF + } } else { - hir::Anchor::StartText + hir::Look::Start }), - ast::AssertionKind::EndLine => Hir::anchor(if multi_line { - hir::Anchor::EndLine + ast::AssertionKind::EndLine => Hir::look(if multi_line { + if crlf { + hir::Look::EndCRLF + } else { + hir::Look::EndLF + } } else { - hir::Anchor::EndText + hir::Look::End + }), + ast::AssertionKind::StartText => Hir::look(hir::Look::Start), + ast::AssertionKind::EndText => Hir::look(hir::Look::End), + ast::AssertionKind::WordBoundary => Hir::look(if unicode { + hir::Look::WordUnicode + } else { + hir::Look::WordAscii + }), + ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { + hir::Look::WordUnicodeNegate + } 
else { + hir::Look::WordAsciiNegate }), - ast::AssertionKind::StartText => { - Hir::anchor(hir::Anchor::StartText) - } - ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText), - ast::AssertionKind::WordBoundary => { - Hir::word_boundary(if unicode { - hir::WordBoundary::Unicode - } else { - hir::WordBoundary::Ascii - }) - } - ast::AssertionKind::NotWordBoundary => { - Hir::word_boundary(if unicode { - hir::WordBoundary::UnicodeNegate - } else { - // It is possible for negated ASCII word boundaries to - // match at invalid UTF-8 boundaries, even when searching - // valid UTF-8. - if !self.trans().allow_invalid_utf8 { - return Err( - self.error(asst.span, ErrorKind::InvalidUtf8) - ); - } - hir::WordBoundary::AsciiNegate - }) - } }) } - fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { - let kind = match group.kind { - ast::GroupKind::CaptureIndex(idx) => { - hir::GroupKind::CaptureIndex(idx) - } - ast::GroupKind::CaptureName(ref capname) => { - hir::GroupKind::CaptureName { - name: capname.name.clone(), - index: capname.index, - } + fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { + let (index, name) = match group.kind { + ast::GroupKind::CaptureIndex(index) => (index, None), + ast::GroupKind::CaptureName { ref name, .. } => { + (name.index, Some(name.name.clone().into_boxed_str())) } - ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, + // The HIR doesn't need to use non-capturing groups, since the way + // in which the data type is defined handles this automatically. 
+ ast::GroupKind::NonCapturing(_) => return expr, }; - Hir::group(hir::Group { kind, hir: Box::new(expr) }) + Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) } fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { - let kind = match rep.op.kind { - ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne, - ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore, - ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore, + let (min, max) = match rep.op.kind { + ast::RepetitionKind::ZeroOrOne => (0, Some(1)), + ast::RepetitionKind::ZeroOrMore => (0, None), + ast::RepetitionKind::OneOrMore => (1, None), ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { - hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m)) + (m, Some(m)) } ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { - hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m)) + (m, None) } ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( m, n, - )) => { - hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n)) - } + )) => (m, Some(n)), }; let greedy = if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; - Hir::repetition(hir::Repetition { kind, greedy, hir: Box::new(expr) }) + Hir::repetition(hir::Repetition { + min, + max, + greedy, + sub: Box::new(expr), + }) } fn hir_unicode_class( @@ -834,11 +976,6 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ast_class.negated, class, )?; - if class.ranges().is_empty() { - let err = self - .error(ast_class.span, ErrorKind::EmptyClassNotAllowed); - return Err(err); - } } result } @@ -848,9 +985,8 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ast: &ast::ClassAscii, ) -> Result { let mut cls = hir::ClassUnicode::new( - ascii_class(&ast.kind) - .iter() - .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), + ascii_class_as_chars(&ast.kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), ); self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) 
@@ -862,8 +998,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { ) -> Result { let mut cls = hir::ClassBytes::new( ascii_class(&ast.kind) - .iter() - .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), + .map(|(s, e)| hir::ClassBytesRange::new(s, e)), ); self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) @@ -894,7 +1029,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, - ) -> hir::ClassBytes { + ) -> Result { use crate::ast::ClassPerlKind::*; assert!(!self.flags().unicode()); @@ -908,7 +1043,13 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if ast_class.negated { class.negate(); } - class + // Negating a Perl byte class is likely to cause it to match invalid + // UTF-8. That's only OK if the translator is configured to allow such + // things. + if self.trans().utf8 && !class.is_ascii() { + return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); + } + Ok(class) } /// Converts the given Unicode specific error to an HIR translation error. @@ -918,7 +1059,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { fn convert_unicode_class_error( &self, span: &Span, - result: unicode::Result, + result: core::result::Result, ) -> Result { result.map_err(|err| { let sp = span.clone(); @@ -943,7 +1084,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { class: &mut hir::ClassUnicode, ) -> Result<()> { // Note that we must apply case folding before negation! - // Consider `(?i)[^x]`. If we applied negation field, then + // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. 
if self.flags().case_insensitive() { @@ -973,7 +1114,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> { if negated { class.negate(); } - if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { + if self.trans().utf8 && !class.is_ascii() { return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); } Ok(()) @@ -982,11 +1123,12 @@ impl<'t, 'p> TranslatorI<'t, 'p> { /// Return a scalar byte value suitable for use as a literal in a byte /// character class. fn class_literal_byte(&self, ast: &ast::Literal) -> Result { - match self.literal_to_char(ast)? { - hir::Literal::Byte(byte) => Ok(byte), - hir::Literal::Unicode(ch) => { - if ch <= 0x7F as char { - Ok(ch as u8) + match self.ast_literal_to_scalar(ast)? { + Either::Right(byte) => Ok(byte), + Either::Left(ch) => { + let cp = u32::from(ch); + if cp <= 0x7F { + Ok(u8::try_from(cp).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. Byte classes don't @@ -1010,6 +1152,7 @@ struct Flags { dot_matches_new_line: Option, swap_greed: Option, unicode: Option, + crlf: Option, // Note that `ignore_whitespace` is omitted here because it is handled // entirely in the parser. 
} @@ -1038,6 +1181,9 @@ impl Flags { ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { flags.unicode = Some(enable); } + ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { + flags.crlf = Some(enable); + } ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} } } @@ -1060,6 +1206,33 @@ impl Flags { if self.unicode.is_none() { self.unicode = previous.unicode; } + if self.crlf.is_none() { + self.crlf = previous.crlf; + } + } + + fn dot(&self) -> hir::Dot { + if self.dot_matches_new_line() { + if self.unicode() { + hir::Dot::AnyChar + } else { + hir::Dot::AnyByte + } + } else { + if self.unicode() { + if self.crlf() { + hir::Dot::AnyCharExceptCRLF + } else { + hir::Dot::AnyCharExceptLF + } + } else { + if self.crlf() { + hir::Dot::AnyByteExceptCRLF + } else { + hir::Dot::AnyByteExceptLF + } + } + } } fn case_insensitive(&self) -> bool { @@ -1081,52 +1254,63 @@ impl Flags { fn unicode(&self) -> bool { self.unicode.unwrap_or(true) } + + fn crlf(&self) -> bool { + self.crlf.unwrap_or(false) + } } fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { let ranges: Vec<_> = ascii_class(kind) - .iter() - .cloned() - .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)) + .map(|(s, e)| hir::ClassBytesRange::new(s, e)) .collect(); hir::ClassBytes::new(ranges) } -fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { +fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator { use crate::ast::ClassAsciiKind::*; - match *kind { - Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')], - Alpha => &[('A', 'Z'), ('a', 'z')], - Ascii => &[('\x00', '\x7F')], - Blank => &[('\t', '\t'), (' ', ' ')], - Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], - Digit => &[('0', '9')], - Graph => &[('!', '~')], - Lower => &[('a', 'z')], - Print => &[(' ', '~')], - Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')], + + let slice: &'static [(u8, u8)] = match *kind { + Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], + Alpha => &[(b'A', b'Z'), (b'a', 
b'z')], + Ascii => &[(b'\x00', b'\x7F')], + Blank => &[(b'\t', b'\t'), (b' ', b' ')], + Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], + Digit => &[(b'0', b'9')], + Graph => &[(b'!', b'~')], + Lower => &[(b'a', b'z')], + Print => &[(b' ', b'~')], + Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], Space => &[ - ('\t', '\t'), - ('\n', '\n'), - ('\x0B', '\x0B'), - ('\x0C', '\x0C'), - ('\r', '\r'), - (' ', ' '), + (b'\t', b'\t'), + (b'\n', b'\n'), + (b'\x0B', b'\x0B'), + (b'\x0C', b'\x0C'), + (b'\r', b'\r'), + (b' ', b' '), ], - Upper => &[('A', 'Z')], - Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], - Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')], - } + Upper => &[(b'A', b'Z')], + Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], + Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], + }; + slice.iter().copied() +} + +fn ascii_class_as_chars( + kind: &ast::ClassAsciiKind, +) -> impl Iterator { + ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) } #[cfg(test)] mod tests { - use crate::ast::parse::ParserBuilder; - use crate::ast::{self, Ast, Position, Span}; - use crate::hir::{self, Hir, HirKind}; - use crate::unicode::{self, ClassQuery}; + use crate::{ + ast::{self, parse::ParserBuilder, Ast, Position, Span}, + hir::{self, Hir, HirKind, Look, Properties}, + unicode::{self, ClassQuery}, + }; - use super::{ascii_class, TranslatorBuilder}; + use super::*; // We create these errors to compare with real hir::Errors in the tests. 
// We define equality between TestError and hir::Error to disregard the @@ -1155,7 +1339,7 @@ mod tests { fn t(pattern: &str) -> Hir { TranslatorBuilder::new() - .allow_invalid_utf8(false) + .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap() @@ -1163,7 +1347,7 @@ mod tests { fn t_err(pattern: &str) -> hir::Error { TranslatorBuilder::new() - .allow_invalid_utf8(false) + .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap_err() @@ -1171,95 +1355,73 @@ mod tests { fn t_bytes(pattern: &str) -> Hir { TranslatorBuilder::new() - .allow_invalid_utf8(true) + .utf8(false) .build() .translate(pattern, &parse(pattern)) .unwrap() } - fn hir_lit(s: &str) -> Hir { - match s.len() { - 0 => Hir::empty(), - _ => { - let lits = s - .chars() - .map(hir::Literal::Unicode) - .map(Hir::literal) - .collect(); - Hir::concat(lits) - } - } + fn props(pattern: &str) -> Properties { + t(pattern).properties().clone() } - fn hir_blit(s: &[u8]) -> Hir { - match s.len() { - 0 => Hir::empty(), - 1 => Hir::literal(hir::Literal::Byte(s[0])), - _ => { - let lits = s - .iter() - .cloned() - .map(hir::Literal::Byte) - .map(Hir::literal) - .collect(); - Hir::concat(lits) - } - } + fn props_bytes(pattern: &str) -> Properties { + t_bytes(pattern).properties().clone() } - fn hir_group(i: u32, expr: Hir) -> Hir { - Hir::group(hir::Group { - kind: hir::GroupKind::CaptureIndex(i), - hir: Box::new(expr), - }) + fn hir_lit(s: &str) -> Hir { + hir_blit(s.as_bytes()) } - fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { - Hir::group(hir::Group { - kind: hir::GroupKind::CaptureName { - name: name.to_string(), - index: i, - }, - hir: Box::new(expr), - }) + fn hir_blit(s: &[u8]) -> Hir { + Hir::literal(s) + } + + fn hir_capture(index: u32, expr: Hir) -> Hir { + Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) } - fn hir_group_nocap(expr: Hir) -> Hir { - Hir::group(hir::Group { - kind: hir::GroupKind::NonCapturing, - hir: Box::new(expr), + fn 
hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { + Hir::capture(hir::Capture { + index, + name: Some(name.into()), + sub: Box::new(expr), }) } fn hir_quest(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrOne, + min: 0, + max: Some(1), greedy, - hir: Box::new(expr), + sub: Box::new(expr), }) } fn hir_star(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + min: 0, + max: None, greedy, - hir: Box::new(expr), + sub: Box::new(expr), }) } fn hir_plus(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::OneOrMore, + min: 1, + max: None, greedy, - hir: Box::new(expr), + sub: Box::new(expr), }) } - fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir { + fn hir_range(greedy: bool, min: u32, max: Option, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::Range(range), + min, + max, greedy, - hir: Box::new(expr), + sub: Box::new(expr), }) } @@ -1281,32 +1443,25 @@ mod tests { Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) } - fn hir_uclass(ranges: &[(char, char)]) -> Hir { - let ranges: Vec = ranges - .iter() - .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) - .collect(); - Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges))) + fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( + ascii_class_as_chars(kind) + .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), + ))) } - fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { - let ranges: Vec = ranges - .iter() - .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) - .collect(); - Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { + Hir::class(hir::Class::Bytes(hir::ClassBytes::new( + ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), + ))) } - fn 
hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { - let ranges: Vec = ranges - .iter() - .map(|&(s, e)| { - assert!(s as u32 <= 0x7F); - assert!(e as u32 <= 0x7F); - hir::ClassBytesRange::new(s as u8, e as u8) - }) - .collect(); - Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) + fn hir_uclass(ranges: &[(char, char)]) -> Hir { + Hir::class(uclass(ranges)) + } + + fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { + Hir::class(bclass(ranges)) } fn hir_case_fold(expr: Hir) -> Hir { @@ -1329,6 +1484,33 @@ mod tests { } } + fn uclass(ranges: &[(char, char)]) -> hir::Class { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) + .collect(); + hir::Class::Unicode(hir::ClassUnicode::new(ranges)) + } + + fn bclass(ranges: &[(u8, u8)]) -> hir::Class { + let ranges: Vec = ranges + .iter() + .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) + .collect(); + hir::Class::Bytes(hir::ClassBytes::new(ranges)) + } + + #[cfg(feature = "unicode-case")] + fn class_case_fold(mut cls: hir::Class) -> Hir { + cls.case_fold_simple(); + Hir::class(cls) + } + + fn class_negate(mut cls: hir::Class) -> Hir { + cls.negate(); + Hir::class(cls) + } + #[allow(dead_code)] fn hir_union(expr1: Hir, expr2: Hir) -> Hir { use crate::hir::Class::{Bytes, Unicode}; @@ -1363,47 +1545,43 @@ mod tests { } } - fn hir_anchor(anchor: hir::Anchor) -> Hir { - Hir::anchor(anchor) - } - - fn hir_word(wb: hir::WordBoundary) -> Hir { - Hir::word_boundary(wb) + fn hir_look(look: hir::Look) -> Hir { + Hir::look(look) } #[test] fn empty() { assert_eq!(t(""), Hir::empty()); assert_eq!(t("(?i)"), Hir::empty()); - assert_eq!(t("()"), hir_group(1, Hir::empty())); - assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); - assert_eq!(t("(?P)"), hir_group_name(1, "wat", Hir::empty())); + assert_eq!(t("()"), hir_capture(1, Hir::empty())); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("(?P)"), hir_capture_name(1, "wat", Hir::empty())); assert_eq!(t("|"), hir_alt(vec![Hir::empty(), 
Hir::empty()])); assert_eq!( t("()|()"), hir_alt(vec![ - hir_group(1, Hir::empty()), - hir_group(2, Hir::empty()), + hir_capture(1, Hir::empty()), + hir_capture(2, Hir::empty()), ]) ); assert_eq!( t("(|b)"), - hir_group(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) + hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) ); assert_eq!( t("(a|)"), - hir_group(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) + hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) ); assert_eq!( t("(a||c)"), - hir_group( + hir_capture( 1, hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) ) ); assert_eq!( t("(||)"), - hir_group( + hir_capture( 1, hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) ) @@ -1449,10 +1627,7 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); #[cfg(feature = "unicode-case")] - assert_eq!( - t("(?i:a)"), - hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],)) - ); + assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')])); #[cfg(feature = "unicode-case")] assert_eq!( t("a(?i)a(?-i)a"), @@ -1528,14 +1703,32 @@ mod tests { fn dot() { assert_eq!( t("."), - hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),]) + hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) ); - assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}'),])); + assert_eq!( + t("(?R)."), + hir_uclass(&[ + ('\0', '\t'), + ('\x0B', '\x0C'), + ('\x0E', '\u{10FFFF}'), + ]) + ); + assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); + assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); assert_eq!( t_bytes("(?-u)."), - hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),]) + hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) + ); + assert_eq!( + t_bytes("(?R-u)."), + hir_bclass(&[ + (b'\0', b'\t'), + (b'\x0B', b'\x0C'), + (b'\x0E', b'\xFF'), + ]) ); assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); + assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); // If invalid UTF-8 
isn't allowed, then non-Unicode `.` isn't allowed. assert_eq!( @@ -1549,7 +1742,7 @@ mod tests { } ); assert_eq!( - t_err("(?s-u)."), + t_err("(?R-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( @@ -1558,94 +1751,123 @@ mod tests { ), } ); - } - - #[test] - fn assertions() { - assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText)); - assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText)); - assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine)); - assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine)); - assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText)); - assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText)); - - assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode)); - assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate)); - assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii)); assert_eq!( - t_bytes(r"(?-u)\B"), - hir_word(hir::WordBoundary::AsciiNegate) + t_err("(?s-u)."), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(7, 1, 8) + ), + } ); - assert_eq!( - t_err(r"(?-u)\B"), + t_err("(?Rs-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( - Position::new(5, 1, 6), - Position::new(7, 1, 8) + Position::new(7, 1, 8), + Position::new(8, 1, 9) ), } ); } + #[test] + fn assertions() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); + assert_eq!(t(r"\B"), 
hir_look(hir::Look::WordUnicodeNegate)); + assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); + assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); + } + #[test] fn group() { - assert_eq!(t("(a)"), hir_group(1, hir_lit("a"))); + assert_eq!(t("(a)"), hir_capture(1, hir_lit("a"))); assert_eq!( t("(a)(b)"), hir_cat(vec![ - hir_group(1, hir_lit("a")), - hir_group(2, hir_lit("b")), + hir_capture(1, hir_lit("a")), + hir_capture(2, hir_lit("b")), ]) ); assert_eq!( t("(a)|(b)"), hir_alt(vec![ - hir_group(1, hir_lit("a")), - hir_group(2, hir_lit("b")), + hir_capture(1, hir_lit("a")), + hir_capture(2, hir_lit("b")), ]) ); - assert_eq!(t("(?P)"), hir_group_name(1, "foo", Hir::empty())); - assert_eq!(t("(?Pa)"), hir_group_name(1, "foo", hir_lit("a"))); + assert_eq!(t("(?P)"), hir_capture_name(1, "foo", Hir::empty())); + assert_eq!(t("(?Pa)"), hir_capture_name(1, "foo", hir_lit("a"))); assert_eq!( t("(?Pa)(?Pb)"), hir_cat(vec![ - hir_group_name(1, "foo", hir_lit("a")), - hir_group_name(2, "bar", hir_lit("b")), + hir_capture_name(1, "foo", hir_lit("a")), + hir_capture_name(2, "bar", hir_lit("b")), ]) ); - assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); - assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a"))); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("(?:a)"), hir_lit("a")); assert_eq!( t("(?:a)(b)"), - hir_cat(vec![ - hir_group_nocap(hir_lit("a")), - hir_group(1, hir_lit("b")), - ]) + hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),]) ); assert_eq!( t("(a)(?:b)(c)"), hir_cat(vec![ - hir_group(1, hir_lit("a")), - hir_group_nocap(hir_lit("b")), - hir_group(2, hir_lit("c")), + hir_capture(1, hir_lit("a")), + hir_lit("b"), + hir_capture(2, hir_lit("c")), ]) ); assert_eq!( t("(a)(?Pb)(c)"), hir_cat(vec![ - hir_group(1, hir_lit("a")), - hir_group_name(2, "foo", hir_lit("b")), - hir_group(3, hir_lit("c")), + hir_capture(1, hir_lit("a")), + hir_capture_name(2, "foo", hir_lit("b")), + hir_capture(3, hir_lit("c")), ]) ); - assert_eq!(t("()"), 
hir_group(1, Hir::empty())); - assert_eq!(t("((?i))"), hir_group(1, Hir::empty())); - assert_eq!(t("((?x))"), hir_group(1, Hir::empty())); - assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty()))); + assert_eq!(t("()"), hir_capture(1, Hir::empty())); + assert_eq!(t("((?i))"), hir_capture(1, Hir::empty())); + assert_eq!(t("((?x))"), hir_capture(1, Hir::empty())); + assert_eq!( + t("(((?x)))"), + hir_capture(1, hir_capture(2, Hir::empty())) + ); + } + + #[test] + fn line_anchors() { + assert_eq!(t("^"), hir_look(hir::Look::Start)); + assert_eq!(t("$"), hir_look(hir::Look::End)); + assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"\z"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); + assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); + + assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); + assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); + + assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); + assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); + assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF)); + assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); } #[test] @@ -1653,46 +1875,44 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)a"), - hir_cat(vec![ - hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])), - hir_lit("a"), - ]) + hir_cat( + vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),] + ) ); assert_eq!( t("(?i-u:a)β"), hir_cat(vec![ - hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("β"), ]) ); assert_eq!( t("(?:(?i-u)a)b"), hir_cat(vec![ - hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("b"), ]) ); assert_eq!( 
t("((?i-u)a)b"), hir_cat(vec![ - hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), + hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), hir_lit("b"), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)(?-i:a)a"), - hir_cat(vec![ - hir_group_nocap(hir_lit("a")), - hir_uclass(&[('A', 'A'), ('a', 'a')]), - ]) + hir_cat( + vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),] + ) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?im)a^"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartLine), + hir_look(hir::Look::StartLF), ]) ); #[cfg(feature = "unicode-case")] @@ -1700,9 +1920,9 @@ mod tests { t("(?im)a^(?i-m)a^"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartLine), + hir_look(hir::Look::StartLF), hir_uclass(&[('A', 'A'), ('a', 'a')]), - hir_anchor(hir::Anchor::StartText), + hir_look(hir::Look::Start), ]) ); assert_eq!( @@ -1718,10 +1938,10 @@ mod tests { assert_eq!( t("(?:a(?i)a)a"), hir_cat(vec![ - hir_group_nocap(hir_cat(vec![ + hir_cat(vec![ hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]), - ])), + ]), hir_lit("a"), ]) ); @@ -1729,10 +1949,10 @@ mod tests { assert_eq!( t("(?i)(?:a(?-i)a)a"), hir_cat(vec![ - hir_group_nocap(hir_cat(vec![ + hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"), - ])), + ]), hir_uclass(&[('A', 'A'), ('a', 'a')]), ]) ); @@ -1755,46 +1975,18 @@ mod tests { assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); - assert_eq!( - t("a{1}"), - hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,}"), - hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,2}"), - hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),) - ); - assert_eq!( - t("a{1}?"), - hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,}?"), - hir_range(false, 
hir::RepetitionRange::AtLeast(1), hir_lit("a"),) - ); - assert_eq!( - t("a{1,2}?"), - hir_range( - false, - hir::RepetitionRange::Bounded(1, 2), - hir_lit("a"), - ) - ); + assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),)); + assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),)); + assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),)); + assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),)); assert_eq!( t("ab?"), hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) ); - assert_eq!( - t("(ab)?"), - hir_quest( - true, - hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) - ) - ); + assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab")))); assert_eq!( t("a|b?"), hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) @@ -1803,48 +1995,49 @@ mod tests { #[test] fn cat_alt() { + let a = || hir_look(hir::Look::Start); + let b = || hir_look(hir::Look::End); + let c = || hir_look(hir::Look::WordUnicode); + let d = || hir_look(hir::Look::WordUnicodeNegate); + + assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()]))); + assert_eq!(t("^|$"), hir_alt(vec![a(), b()])); + assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()])); assert_eq!( - t("(ab)"), - hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) - ); - assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),])); - assert_eq!( - t("a|b|c"), - hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) - ); - assert_eq!( - t("ab|bc|cd"), - hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) - ); - assert_eq!( - t("(a|b)"), - hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),])) + t(r"^$|$\b|\b\B"), + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) ); + assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()]))); 
assert_eq!( - t("(a|b|c)"), - hir_group( - 1, - hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) - ) + t(r"(^|$|\b)"), + hir_capture(1, hir_alt(vec![a(), b(), c()])) ); assert_eq!( - t("(ab|bc|cd)"), - hir_group( + t(r"(^$|$\b|\b\B)"), + hir_capture( 1, - hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) + hir_alt(vec![ + hir_cat(vec![a(), b()]), + hir_cat(vec![b(), c()]), + hir_cat(vec![c(), d()]), + ]) ) ); assert_eq!( - t("(ab|(bc|(cd)))"), - hir_group( + t(r"(^$|($\b|(\b\B)))"), + hir_capture( 1, hir_alt(vec![ - hir_lit("ab"), - hir_group( + hir_cat(vec![a(), b()]), + hir_capture( 2, hir_alt(vec![ - hir_lit("bc"), - hir_group(3, hir_lit("cd")), + hir_cat(vec![b(), c()]), + hir_capture(3, hir_cat(vec![c(), d()])), ]) ), ]) @@ -1852,68 +2045,107 @@ mod tests { ); } + // Tests the HIR transformation of things like '[a-z]|[A-Z]' into + // '[A-Za-z]'. In other words, an alternation of just classes is always + // equivalent to a single class corresponding to the union of the branches + // in that class. (Unless some branches match invalid UTF-8 and others + // match non-ASCII Unicode.) + #[test] + fn cat_class_flattened() { + assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); + // Combining all of the letter properties should give us the one giant + // letter property. + #[cfg(feature = "unicode-gencat")] + assert_eq!( + t(r"(?x) + \p{Lowercase_Letter} + |\p{Uppercase_Letter} + |\p{Titlecase_Letter} + |\p{Modifier_Letter} + |\p{Other_Letter} + "), + hir_uclass_query(ClassQuery::Binary("letter")) + ); + // Byte classes that can truly match invalid UTF-8 cannot be combined + // with Unicode classes. + assert_eq!( + t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"), + hir_alt(vec![ + hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]), + hir_bclass(&[(b'\x90', b'\xFF')]), + hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]), + ]) + ); + // Byte classes on their own can be combined, even if some are ASCII + // and others are invalid UTF-8. 
+ assert_eq!( + t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"), + hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]), + ); + } + #[test] fn class_ascii() { assert_eq!( t("[[:alnum:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)) + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum) ); assert_eq!( t("[[:alpha:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha)) + hir_ascii_uclass(&ast::ClassAsciiKind::Alpha) ); assert_eq!( t("[[:ascii:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii)) + hir_ascii_uclass(&ast::ClassAsciiKind::Ascii) ); assert_eq!( t("[[:blank:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank)) + hir_ascii_uclass(&ast::ClassAsciiKind::Blank) ); assert_eq!( t("[[:cntrl:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl)) + hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl) ); assert_eq!( t("[[:digit:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_uclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t("[[:graph:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph)) + hir_ascii_uclass(&ast::ClassAsciiKind::Graph) ); assert_eq!( t("[[:lower:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)) + hir_ascii_uclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("[[:print:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Print)) + hir_ascii_uclass(&ast::ClassAsciiKind::Print) ); assert_eq!( t("[[:punct:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct)) + hir_ascii_uclass(&ast::ClassAsciiKind::Punct) ); assert_eq!( t("[[:space:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_uclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t("[[:upper:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper)) + hir_ascii_uclass(&ast::ClassAsciiKind::Upper) ); assert_eq!( t("[[:word:]]"), - hir_uclass(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_uclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t("[[:xdigit:]]"), - 
hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit)) + hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit) ); assert_eq!( t("[[:^lower:]]"), - hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) + hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower)) ); #[cfg(feature = "unicode-case")] assert_eq!( @@ -1928,13 +2160,11 @@ mod tests { assert_eq!( t("(?-u)[[:lower:]]"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower)) + hir_ascii_bclass(&ast::ClassAsciiKind::Lower) ); assert_eq!( t("(?i-u)[[:lower:]]"), - hir_case_fold(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Lower - ))) + hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower)) ); assert_eq!( @@ -1965,14 +2195,14 @@ mod tests { assert_eq!( t("[[:alnum:][:^ascii:]]"), hir_union( - hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_ascii_uclass(&ast::ClassAsciiKind::Alnum), hir_uclass(&[('\u{80}', '\u{10FFFF}')]), ), ); assert_eq!( t_bytes("(?-u)[[:alnum:][:^ascii:]]"), hir_union( - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Alnum)), + hir_ascii_bclass(&ast::ClassAsciiKind::Alnum), hir_bclass(&[(0x80, 0xFF)]), ), ); @@ -1980,7 +2210,7 @@ mod tests { #[test] #[cfg(feature = "unicode-perl")] - fn class_perl() { + fn class_perl_unicode() { // Unicode assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit"))); assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space"))); @@ -2020,69 +2250,124 @@ mod tests { ); #[cfg(feature = "unicode-case")] assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); + } + #[test] + fn class_perl_ascii() { // ASCII only assert_eq!( t(r"(?-u)\d"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?-u)\s"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?-u)\w"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + 
hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); assert_eq!( t(r"(?i-u)\d"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t(r"(?i-u)\s"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) + hir_ascii_bclass(&ast::ClassAsciiKind::Space) ); assert_eq!( t(r"(?i-u)\w"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) + hir_ascii_bclass(&ast::ClassAsciiKind::Word) ); // ASCII only, negated assert_eq!( - t(r"(?-u)\D"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + t_bytes(r"(?-u)\D"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?-u)\S"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Space - ))) + t_bytes(r"(?-u)\S"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?-u)\W"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + t_bytes(r"(?-u)\W"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); assert_eq!( - t(r"(?i-u)\D"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + t_bytes(r"(?i-u)\D"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( - t(r"(?i-u)\S"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Space - ))) + t_bytes(r"(?i-u)\S"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space)) ); assert_eq!( - t(r"(?i-u)\W"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + t_bytes(r"(?i-u)\W"), + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) + ); + + // ASCII only, negated, with UTF-8 mode enabled. + // In this case, negating any Perl class results in an error because + // all such classes can match invalid UTF-8. 
+ assert_eq!( + t_err(r"(?-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(5, 1, 6), + Position::new(7, 1, 8), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\D"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\S"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, + ); + assert_eq!( + t_err(r"(?i-u)\W"), + TestError { + kind: hir::ErrorKind::InvalidUtf8, + span: Span::new( + Position::new(6, 1, 7), + Position::new(8, 1, 9), + ), + }, ); } @@ -2360,16 +2645,7 @@ mod tests { #[test] #[cfg(feature = "unicode-gencat")] fn class_unicode_any_empty() { - assert_eq!( - t_err(r"\P{any}"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"\P{any}"), hir_uclass(&[]),); } #[test] @@ -2389,8 +2665,9 @@ mod tests { #[test] fn class_bracketed() { - assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')])); - assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')]))); + assert_eq!(t("[a]"), hir_lit("a")); + assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')])); + assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')]))); assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')])); assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')])); assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')])); @@ -2453,11 +2730,11 @@ mod tests { ); assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); - 
assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')]))); - assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')]))); + assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')]))); + assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')]))); assert_eq!( t_bytes("(?-u)[^a]"), - hir_negate(hir_bclass(&[(b'a', b'a')])) + class_negate(bclass(&[(b'a', b'a')])) ); #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] assert_eq!( @@ -2521,27 +2798,9 @@ mod tests { } ); #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] - assert_eq!( - t_err(r"[^\s\S]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(7, 1, 8) - ), - } - ); + assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),); #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] - assert_eq!( - t_err(r"(?-u)[^\s\S]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(5, 1, 6), - Position::new(12, 1, 13) - ), - } - ); + assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),); } #[test] @@ -2663,9 +2922,9 @@ mod tests { #[test] fn class_bracketed_nested() { - assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); - assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); - assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[]))); + assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')]))); + assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[]))); assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); @@ -2673,12 +2932,12 @@ mod tests { #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a[^c]]"), - hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) ); #[cfg(feature = "unicode-case")] assert_eq!( t(r"(?i)[a-b[^c]]"), - 
hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) + hir_negate(class_case_fold(uclass(&[('c', 'c')]))) ); #[cfg(feature = "unicode-case")] @@ -2689,27 +2948,9 @@ mod tests { hir_uclass(&[('C', 'C'), ('c', 'c')]) ); - assert_eq!( - t_err(r"[^a-c[^c]]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(0, 1, 1), - Position::new(10, 1, 11) - ), - } - ); + assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),); #[cfg(feature = "unicode-case")] - assert_eq!( - t_err(r"(?i)[^a-c[^c]]"), - TestError { - kind: hir::ErrorKind::EmptyClassNotAllowed, - span: Span::new( - Position::new(4, 1, 5), - Position::new(14, 1, 15) - ), - } - ); + assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),); } #[test] @@ -2826,9 +3067,7 @@ mod tests { #[cfg(feature = "unicode-perl")] assert_eq!( t_bytes(r"(?-u)[^\w&&\d]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)[^[a-z&&a-c]]"), @@ -2836,19 +3075,15 @@ mod tests { ); assert_eq!( t_bytes(r"(?-u)[^[\w&&\d]]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Digit - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit)) ); assert_eq!( t_bytes(r"(?-u)[^[^\w&&\d]]"), - hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) + hir_ascii_bclass(&ast::ClassAsciiKind::Digit) ); assert_eq!( t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), - hir_negate(hir_bclass_from_char(ascii_class( - &ast::ClassAsciiKind::Word - ))) + hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word)) ); } @@ -2924,284 +3159,420 @@ mod tests { , # comment 10 # comment } # comment"), - hir_range( - true, - hir::RepetitionRange::Bounded(5, 10), - hir_lit("a") - ) + hir_range(true, 5, Some(10), hir_lit("a")) ); assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); } #[test] - fn analysis_is_always_utf8() { + fn analysis_is_utf8() { // Positive examples. 
- assert!(t_bytes(r"a").is_always_utf8()); - assert!(t_bytes(r"ab").is_always_utf8()); - assert!(t_bytes(r"(?-u)a").is_always_utf8()); - assert!(t_bytes(r"(?-u)ab").is_always_utf8()); - assert!(t_bytes(r"\xFF").is_always_utf8()); - assert!(t_bytes(r"\xFF\xFF").is_always_utf8()); - assert!(t_bytes(r"[^a]").is_always_utf8()); - assert!(t_bytes(r"[^a][^a]").is_always_utf8()); - assert!(t_bytes(r"\b").is_always_utf8()); - assert!(t_bytes(r"\B").is_always_utf8()); - assert!(t_bytes(r"(?-u)\b").is_always_utf8()); + assert!(props_bytes(r"a").is_utf8()); + assert!(props_bytes(r"ab").is_utf8()); + assert!(props_bytes(r"(?-u)a").is_utf8()); + assert!(props_bytes(r"(?-u)ab").is_utf8()); + assert!(props_bytes(r"\xFF").is_utf8()); + assert!(props_bytes(r"\xFF\xFF").is_utf8()); + assert!(props_bytes(r"[^a]").is_utf8()); + assert!(props_bytes(r"[^a][^a]").is_utf8()); + assert!(props_bytes(r"\b").is_utf8()); + assert!(props_bytes(r"\B").is_utf8()); + assert!(props_bytes(r"(?-u)\b").is_utf8()); + assert!(props_bytes(r"(?-u)\B").is_utf8()); // Negative examples. 
- assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8()); - assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8()); - assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8()); - assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8()); - assert!(!t_bytes(r"(?-u)\B").is_always_utf8()); + assert!(!props_bytes(r"(?-u)\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a]").is_utf8()); + assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8()); + } + + #[test] + fn analysis_captures_len() { + assert_eq!(0, props(r"a").explicit_captures_len()); + assert_eq!(0, props(r"(?:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len()); + assert_eq!(0, props(r"(?i-u)a").explicit_captures_len()); + assert_eq!(1, props(r"(a)").explicit_captures_len()); + assert_eq!(1, props(r"(?Pa)").explicit_captures_len()); + assert_eq!(1, props(r"()").explicit_captures_len()); + assert_eq!(1, props(r"()a").explicit_captures_len()); + assert_eq!(1, props(r"(a)+").explicit_captures_len()); + assert_eq!(2, props(r"(a)(b)").explicit_captures_len()); + assert_eq!(2, props(r"(a)|(b)").explicit_captures_len()); + assert_eq!(2, props(r"((a))").explicit_captures_len()); + assert_eq!(1, props(r"([a&&b])").explicit_captures_len()); + } + + #[test] + fn analysis_static_captures_len() { + let len = |pattern| props(pattern).static_explicit_captures_len(); + assert_eq!(Some(0), len(r"")); + assert_eq!(Some(0), len(r"foo|bar")); + assert_eq!(None, len(r"(foo)|bar")); + assert_eq!(None, len(r"foo|(bar)")); + assert_eq!(Some(1), len(r"(foo|bar)")); + assert_eq!(Some(1), len(r"(a|b|c|d|e|f)")); + assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)")); + assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)")); + assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)")); + assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()")); + assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)")); + assert_eq!(None, len(r"(a)(b)(extra)?")); + assert_eq!(Some(1), len(r"(foo)|(bar)")); + 
assert_eq!(Some(2), len(r"(foo)(bar)")); + assert_eq!(Some(2), len(r"(foo)+(bar)")); + assert_eq!(None, len(r"(foo)*(bar)")); + assert_eq!(Some(0), len(r"(foo)?{0}")); + assert_eq!(None, len(r"(foo)?{1}")); + assert_eq!(Some(1), len(r"(foo){1}")); + assert_eq!(Some(1), len(r"(foo){1,}")); + assert_eq!(Some(1), len(r"(foo){1,}?")); + assert_eq!(None, len(r"(foo){1,}??")); + assert_eq!(None, len(r"(foo){0,}")); + assert_eq!(Some(1), len(r"(foo)(?:bar)")); + assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))")); + assert_eq!(Some(2), len(r"(?Pfoo)(?:bar)(bal|loon)")); + assert_eq!( + Some(2), + len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#) + ); } #[test] fn analysis_is_all_assertions() { // Positive examples. - assert!(t(r"\b").is_all_assertions()); - assert!(t(r"\B").is_all_assertions()); - assert!(t(r"^").is_all_assertions()); - assert!(t(r"$").is_all_assertions()); - assert!(t(r"\A").is_all_assertions()); - assert!(t(r"\z").is_all_assertions()); - assert!(t(r"$^\z\A\b\B").is_all_assertions()); - assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions()); - assert!(t(r"^$|$^").is_all_assertions()); - assert!(t(r"((\b)+())*^").is_all_assertions()); + let p = props(r"\b"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\A"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"\z"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$^\z\A\b\B"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"$|^|\z|\A|\b|\B"); + assert!(!p.look_set().is_empty()); + 
assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"^$|$^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); + + let p = props(r"((\b)+())*^"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(0)); // Negative examples. - assert!(!t(r"^a").is_all_assertions()); + let p = props(r"^a"); + assert!(!p.look_set().is_empty()); + assert_eq!(p.minimum_len(), Some(1)); } #[test] fn analysis_is_anchored() { + let is_start = |p| props(p).look_set_prefix().contains(Look::Start); + let is_end = |p| props(p).look_set_suffix().contains(Look::End); + // Positive examples. - assert!(t(r"^").is_anchored_start()); - assert!(t(r"$").is_anchored_end()); - assert!(t(r"^").is_line_anchored_start()); - assert!(t(r"$").is_line_anchored_end()); - - assert!(t(r"^^").is_anchored_start()); - assert!(t(r"$$").is_anchored_end()); - assert!(t(r"^^").is_line_anchored_start()); - assert!(t(r"$$").is_line_anchored_end()); - - assert!(t(r"^$").is_anchored_start()); - assert!(t(r"^$").is_anchored_end()); - assert!(t(r"^$").is_line_anchored_start()); - assert!(t(r"^$").is_line_anchored_end()); - - assert!(t(r"^foo").is_anchored_start()); - assert!(t(r"foo$").is_anchored_end()); - assert!(t(r"^foo").is_line_anchored_start()); - assert!(t(r"foo$").is_line_anchored_end()); - - assert!(t(r"^foo|^bar").is_anchored_start()); - assert!(t(r"foo$|bar$").is_anchored_end()); - assert!(t(r"^foo|^bar").is_line_anchored_start()); - assert!(t(r"foo$|bar$").is_line_anchored_end()); - - assert!(t(r"^(foo|bar)").is_anchored_start()); - assert!(t(r"(foo|bar)$").is_anchored_end()); - assert!(t(r"^(foo|bar)").is_line_anchored_start()); - assert!(t(r"(foo|bar)$").is_line_anchored_end()); - - assert!(t(r"^+").is_anchored_start()); - assert!(t(r"$+").is_anchored_end()); - assert!(t(r"^+").is_line_anchored_start()); - assert!(t(r"$+").is_line_anchored_end()); - assert!(t(r"^++").is_anchored_start()); - assert!(t(r"$++").is_anchored_end()); - 
assert!(t(r"^++").is_line_anchored_start()); - assert!(t(r"$++").is_line_anchored_end()); - assert!(t(r"(^)+").is_anchored_start()); - assert!(t(r"($)+").is_anchored_end()); - assert!(t(r"(^)+").is_line_anchored_start()); - assert!(t(r"($)+").is_line_anchored_end()); - - assert!(t(r"$^").is_anchored_start()); - assert!(t(r"$^").is_anchored_start()); - assert!(t(r"$^").is_line_anchored_end()); - assert!(t(r"$^").is_line_anchored_end()); - assert!(t(r"$^|^$").is_anchored_start()); - assert!(t(r"$^|^$").is_anchored_end()); - assert!(t(r"$^|^$").is_line_anchored_start()); - assert!(t(r"$^|^$").is_line_anchored_end()); - - assert!(t(r"\b^").is_anchored_start()); - assert!(t(r"$\b").is_anchored_end()); - assert!(t(r"\b^").is_line_anchored_start()); - assert!(t(r"$\b").is_line_anchored_end()); - assert!(t(r"^(?m:^)").is_anchored_start()); - assert!(t(r"(?m:$)$").is_anchored_end()); - assert!(t(r"^(?m:^)").is_line_anchored_start()); - assert!(t(r"(?m:$)$").is_line_anchored_end()); - assert!(t(r"(?m:^)^").is_anchored_start()); - assert!(t(r"$(?m:$)").is_anchored_end()); - assert!(t(r"(?m:^)^").is_line_anchored_start()); - assert!(t(r"$(?m:$)").is_line_anchored_end()); + assert!(is_start(r"^")); + assert!(is_end(r"$")); - // Negative examples. 
- assert!(!t(r"(?m)^").is_anchored_start()); - assert!(!t(r"(?m)$").is_anchored_end()); - assert!(!t(r"(?m:^$)|$^").is_anchored_start()); - assert!(!t(r"(?m:^$)|$^").is_anchored_end()); - assert!(!t(r"$^|(?m:^$)").is_anchored_start()); - assert!(!t(r"$^|(?m:^$)").is_anchored_end()); - - assert!(!t(r"a^").is_anchored_start()); - assert!(!t(r"$a").is_anchored_start()); - assert!(!t(r"a^").is_line_anchored_start()); - assert!(!t(r"$a").is_line_anchored_start()); - - assert!(!t(r"a^").is_anchored_end()); - assert!(!t(r"$a").is_anchored_end()); - assert!(!t(r"a^").is_line_anchored_end()); - assert!(!t(r"$a").is_line_anchored_end()); - - assert!(!t(r"^foo|bar").is_anchored_start()); - assert!(!t(r"foo|bar$").is_anchored_end()); - assert!(!t(r"^foo|bar").is_line_anchored_start()); - assert!(!t(r"foo|bar$").is_line_anchored_end()); - - assert!(!t(r"^*").is_anchored_start()); - assert!(!t(r"$*").is_anchored_end()); - assert!(!t(r"^*").is_line_anchored_start()); - assert!(!t(r"$*").is_line_anchored_end()); - assert!(!t(r"^*+").is_anchored_start()); - assert!(!t(r"$*+").is_anchored_end()); - assert!(!t(r"^*+").is_line_anchored_start()); - assert!(!t(r"$*+").is_line_anchored_end()); - assert!(!t(r"^+*").is_anchored_start()); - assert!(!t(r"$+*").is_anchored_end()); - assert!(!t(r"^+*").is_line_anchored_start()); - assert!(!t(r"$+*").is_line_anchored_end()); - assert!(!t(r"(^)*").is_anchored_start()); - assert!(!t(r"($)*").is_anchored_end()); - assert!(!t(r"(^)*").is_line_anchored_start()); - assert!(!t(r"($)*").is_line_anchored_end()); - } + assert!(is_start(r"^^")); + assert!(props(r"$$").look_set_suffix().contains(Look::End)); - #[test] - fn analysis_is_line_anchored() { - assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start()); - assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end()); + assert!(is_start(r"^$")); + assert!(is_end(r"^$")); - assert!(t(r"(?m)^foo|^bar").is_line_anchored_start()); - assert!(t(r"(?m)foo$|bar$").is_line_anchored_end()); + assert!(is_start(r"^foo")); 
+ assert!(is_end(r"foo$")); + + assert!(is_start(r"^foo|^bar")); + assert!(is_end(r"foo$|bar$")); + + assert!(is_start(r"^(foo|bar)")); + assert!(is_end(r"(foo|bar)$")); + + assert!(is_start(r"^+")); + assert!(is_end(r"$+")); + assert!(is_start(r"^++")); + assert!(is_end(r"$++")); + assert!(is_start(r"(^)+")); + assert!(is_end(r"($)+")); + + assert!(is_start(r"$^")); + assert!(is_start(r"$^")); + assert!(is_start(r"$^|^$")); + assert!(is_end(r"$^|^$")); + + assert!(is_start(r"\b^")); + assert!(is_end(r"$\b")); + assert!(is_start(r"^(?m:^)")); + assert!(is_end(r"(?m:$)$")); + assert!(is_start(r"(?m:^)^")); + assert!(is_end(r"$(?m:$)")); + + // Negative examples. + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"(?m:^$)|$^")); + assert!(!is_end(r"(?m:^$)|$^")); + assert!(!is_start(r"$^|(?m:^$)")); + assert!(!is_end(r"$^|(?m:^$)")); - assert!(t(r"(?m)^").is_line_anchored_start()); - assert!(t(r"(?m)$").is_line_anchored_end()); + assert!(!is_start(r"a^")); + assert!(!is_start(r"$a")); - assert!(t(r"(?m:^$)|$^").is_line_anchored_start()); - assert!(t(r"(?m:^$)|$^").is_line_anchored_end()); + assert!(!is_end(r"a^")); + assert!(!is_end(r"$a")); - assert!(t(r"$^|(?m:^$)").is_line_anchored_start()); - assert!(t(r"$^|(?m:^$)").is_line_anchored_end()); + assert!(!is_start(r"^foo|bar")); + assert!(!is_end(r"foo|bar$")); + + assert!(!is_start(r"^*")); + assert!(!is_end(r"$*")); + assert!(!is_start(r"^*+")); + assert!(!is_end(r"$*+")); + assert!(!is_start(r"^+*")); + assert!(!is_end(r"$+*")); + assert!(!is_start(r"(^)*")); + assert!(!is_end(r"($)*")); } #[test] fn analysis_is_any_anchored() { + let is_start = |p| props(p).look_set().contains(Look::Start); + let is_end = |p| props(p).look_set().contains(Look::End); + // Positive examples. 
- assert!(t(r"^").is_any_anchored_start()); - assert!(t(r"$").is_any_anchored_end()); - assert!(t(r"\A").is_any_anchored_start()); - assert!(t(r"\z").is_any_anchored_end()); + assert!(is_start(r"^")); + assert!(is_end(r"$")); + assert!(is_start(r"\A")); + assert!(is_end(r"\z")); // Negative examples. - assert!(!t(r"(?m)^").is_any_anchored_start()); - assert!(!t(r"(?m)$").is_any_anchored_end()); - assert!(!t(r"$").is_any_anchored_start()); - assert!(!t(r"^").is_any_anchored_end()); + assert!(!is_start(r"(?m)^")); + assert!(!is_end(r"(?m)$")); + assert!(!is_start(r"$")); + assert!(!is_end(r"^")); } #[test] - fn analysis_is_match_empty() { + fn analysis_can_empty() { // Positive examples. - assert!(t(r"").is_match_empty()); - assert!(t(r"()").is_match_empty()); - assert!(t(r"()*").is_match_empty()); - assert!(t(r"()+").is_match_empty()); - assert!(t(r"()?").is_match_empty()); - assert!(t(r"a*").is_match_empty()); - assert!(t(r"a?").is_match_empty()); - assert!(t(r"a{0}").is_match_empty()); - assert!(t(r"a{0,}").is_match_empty()); - assert!(t(r"a{0,1}").is_match_empty()); - assert!(t(r"a{0,10}").is_match_empty()); + let assert_empty = + |p| assert_eq!(Some(0), props_bytes(p).minimum_len()); + assert_empty(r""); + assert_empty(r"()"); + assert_empty(r"()*"); + assert_empty(r"()+"); + assert_empty(r"()?"); + assert_empty(r"a*"); + assert_empty(r"a?"); + assert_empty(r"a{0}"); + assert_empty(r"a{0,}"); + assert_empty(r"a{0,1}"); + assert_empty(r"a{0,10}"); #[cfg(feature = "unicode-gencat")] - assert!(t(r"\pL*").is_match_empty()); - assert!(t(r"a*|b").is_match_empty()); - assert!(t(r"b|a*").is_match_empty()); - assert!(t(r"a|").is_match_empty()); - assert!(t(r"|a").is_match_empty()); - assert!(t(r"a||b").is_match_empty()); - assert!(t(r"a*a?(abcd)*").is_match_empty()); - assert!(t(r"^").is_match_empty()); - assert!(t(r"$").is_match_empty()); - assert!(t(r"(?m)^").is_match_empty()); - assert!(t(r"(?m)$").is_match_empty()); - assert!(t(r"\A").is_match_empty()); - 
assert!(t(r"\z").is_match_empty()); - assert!(t(r"\B").is_match_empty()); - assert!(t_bytes(r"(?-u)\B").is_match_empty()); - assert!(t(r"\b").is_match_empty()); - assert!(t(r"(?-u)\b").is_match_empty()); + assert_empty(r"\pL*"); + assert_empty(r"a*|b"); + assert_empty(r"b|a*"); + assert_empty(r"a|"); + assert_empty(r"|a"); + assert_empty(r"a||b"); + assert_empty(r"a*a?(abcd)*"); + assert_empty(r"^"); + assert_empty(r"$"); + assert_empty(r"(?m)^"); + assert_empty(r"(?m)$"); + assert_empty(r"\A"); + assert_empty(r"\z"); + assert_empty(r"\B"); + assert_empty(r"(?-u)\B"); + assert_empty(r"\b"); + assert_empty(r"(?-u)\b"); // Negative examples. - assert!(!t(r"a+").is_match_empty()); - assert!(!t(r"a{1}").is_match_empty()); - assert!(!t(r"a{1,}").is_match_empty()); - assert!(!t(r"a{1,2}").is_match_empty()); - assert!(!t(r"a{1,10}").is_match_empty()); - assert!(!t(r"b|a").is_match_empty()); - assert!(!t(r"a*a+(abcd)*").is_match_empty()); + let assert_non_empty = + |p| assert_ne!(Some(0), props_bytes(p).minimum_len()); + assert_non_empty(r"a+"); + assert_non_empty(r"a{1}"); + assert_non_empty(r"a{1,}"); + assert_non_empty(r"a{1,2}"); + assert_non_empty(r"a{1,10}"); + assert_non_empty(r"b|a"); + assert_non_empty(r"a*a+(abcd)*"); + #[cfg(feature = "unicode-gencat")] + assert_non_empty(r"\P{any}"); + assert_non_empty(r"[a--a]"); + assert_non_empty(r"[a&&b]"); } #[test] fn analysis_is_literal() { // Positive examples. - assert!(t(r"a").is_literal()); - assert!(t(r"ab").is_literal()); - assert!(t(r"abc").is_literal()); - assert!(t(r"(?m)abc").is_literal()); + assert!(props(r"a").is_literal()); + assert!(props(r"ab").is_literal()); + assert!(props(r"abc").is_literal()); + assert!(props(r"(?m)abc").is_literal()); + assert!(props(r"(?:a)").is_literal()); + assert!(props(r"foo(?:a)").is_literal()); + assert!(props(r"(?:a)foo").is_literal()); + assert!(props(r"[a]").is_literal()); // Negative examples. 
- assert!(!t(r"").is_literal()); - assert!(!t(r"^").is_literal()); - assert!(!t(r"a|b").is_literal()); - assert!(!t(r"(a)").is_literal()); - assert!(!t(r"a+").is_literal()); - assert!(!t(r"foo(a)").is_literal()); - assert!(!t(r"(a)foo").is_literal()); - assert!(!t(r"[a]").is_literal()); + assert!(!props(r"").is_literal()); + assert!(!props(r"^").is_literal()); + assert!(!props(r"a|b").is_literal()); + assert!(!props(r"(a)").is_literal()); + assert!(!props(r"a+").is_literal()); + assert!(!props(r"foo(a)").is_literal()); + assert!(!props(r"(a)foo").is_literal()); + assert!(!props(r"[ab]").is_literal()); } #[test] fn analysis_is_alternation_literal() { // Positive examples. - assert!(t(r"a").is_alternation_literal()); - assert!(t(r"ab").is_alternation_literal()); - assert!(t(r"abc").is_alternation_literal()); - assert!(t(r"(?m)abc").is_alternation_literal()); - assert!(t(r"a|b").is_alternation_literal()); - assert!(t(r"a|b|c").is_alternation_literal()); - assert!(t(r"foo|bar").is_alternation_literal()); - assert!(t(r"foo|bar|baz").is_alternation_literal()); + assert!(props(r"a").is_alternation_literal()); + assert!(props(r"ab").is_alternation_literal()); + assert!(props(r"abc").is_alternation_literal()); + assert!(props(r"(?m)abc").is_alternation_literal()); + assert!(props(r"foo|bar").is_alternation_literal()); + assert!(props(r"foo|bar|baz").is_alternation_literal()); + assert!(props(r"[a]").is_alternation_literal()); + assert!(props(r"(?:ab)|cd").is_alternation_literal()); + assert!(props(r"ab|(?:cd)").is_alternation_literal()); // Negative examples. 
- assert!(!t(r"").is_alternation_literal()); - assert!(!t(r"^").is_alternation_literal()); - assert!(!t(r"(a)").is_alternation_literal()); - assert!(!t(r"a+").is_alternation_literal()); - assert!(!t(r"foo(a)").is_alternation_literal()); - assert!(!t(r"(a)foo").is_alternation_literal()); - assert!(!t(r"[a]").is_alternation_literal()); - assert!(!t(r"[a]|b").is_alternation_literal()); - assert!(!t(r"a|[b]").is_alternation_literal()); - assert!(!t(r"(a)|b").is_alternation_literal()); - assert!(!t(r"a|(b)").is_alternation_literal()); + assert!(!props(r"").is_alternation_literal()); + assert!(!props(r"^").is_alternation_literal()); + assert!(!props(r"(a)").is_alternation_literal()); + assert!(!props(r"a+").is_alternation_literal()); + assert!(!props(r"foo(a)").is_alternation_literal()); + assert!(!props(r"(a)foo").is_alternation_literal()); + assert!(!props(r"[ab]").is_alternation_literal()); + assert!(!props(r"[ab]|b").is_alternation_literal()); + assert!(!props(r"a|[ab]").is_alternation_literal()); + assert!(!props(r"(a)|b").is_alternation_literal()); + assert!(!props(r"a|(b)").is_alternation_literal()); + assert!(!props(r"a|b").is_alternation_literal()); + assert!(!props(r"a|b|c").is_alternation_literal()); + assert!(!props(r"[a]|b").is_alternation_literal()); + assert!(!props(r"a|[b]").is_alternation_literal()); + assert!(!props(r"(?:a)|b").is_alternation_literal()); + assert!(!props(r"a|(?:b)").is_alternation_literal()); + } + + // This tests that the smart Hir::concat constructor simplifies the given + // exprs in a way we expect. 
+ #[test] + fn smart_concat() { + assert_eq!(t(""), Hir::empty()); + assert_eq!(t("(?:)"), Hir::empty()); + assert_eq!(t("abc"), hir_lit("abc")); + assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar")); + assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz")); + assert_eq!( + t("foo(?:bar^baz)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + assert_eq!( + t("foo(?:ba(?:r^b)az)quux"), + hir_cat(vec![ + hir_lit("foobar"), + hir_look(hir::Look::Start), + hir_lit("bazquux"), + ]) + ); + } + + // This tests that the smart Hir::alternation constructor simplifies the + // given exprs in a way we expect. + #[test] + fn smart_alternation() { + assert_eq!( + t("(?:foo)|(?:bar)"), + hir_alt(vec![hir_lit("foo"), hir_lit("bar")]) + ); + assert_eq!( + t("quux|(?:abc|def|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("quux|(?:abc|(?:def|mno)|xyz)|baz"), + hir_alt(vec![ + hir_lit("quux"), + hir_lit("abc"), + hir_lit("def"), + hir_lit("mno"), + hir_lit("xyz"), + hir_lit("baz"), + ]) + ); + assert_eq!( + t("a|b|c|d|e|f|x|y|z"), + hir_uclass(&[('a', 'f'), ('x', 'z')]), + ); + // Tests that we lift common prefixes out of an alternation. 
+ assert_eq!( + t("[A-Z]foo|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z][A-Z]|[A-Z][A-Z]quux"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![Hir::empty(), hir_lit("quux")]), + ]), + ); + assert_eq!( + t("[A-Z]foo|[A-Z]foobar"), + hir_cat(vec![ + hir_uclass(&[('A', 'Z')]), + hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]), + ]), + ); } } diff --git a/regex-syntax/src/hir/visitor.rs b/regex-syntax/src/hir/visitor.rs index 4f5a70909c..e5f15cf1c2 100644 --- a/regex-syntax/src/hir/visitor.rs +++ b/regex-syntax/src/hir/visitor.rs @@ -1,3 +1,5 @@ +use alloc::{vec, vec::Vec}; + use crate::hir::{self, Hir, HirKind}; /// A trait for visiting the high-level IR (HIR) in depth first order. @@ -9,7 +11,7 @@ use crate::hir::{self, Hir, HirKind}; /// important since the size of an HIR may be proportional to end user input. /// /// Typical usage of this trait involves providing an implementation and then -/// running it using the [`visit`](fn.visit.html) function. +/// running it using the [`visit`] function. pub trait Visitor { /// The result of visiting an HIR. type Output; @@ -44,8 +46,7 @@ pub trait Visitor { /// Executes an implementation of `Visitor` in constant stack space. /// /// This function will visit every node in the given `Hir` while calling -/// appropriate methods provided by the -/// [`Visitor`](trait.Visitor.html) trait. +/// appropriate methods provided by the [`Visitor`] trait. /// /// The primary use case for this method is when one wants to perform case /// analysis over an `Hir` without using a stack size proportional to the depth @@ -74,9 +75,9 @@ enum Frame<'a> { /// A stack frame allocated just before descending into a repetition /// operator's child node. 
Repetition(&'a hir::Repetition), - /// A stack frame allocated just before descending into a group's child + /// A stack frame allocated just before descending into a capture's child /// node. - Group(&'a hir::Group), + Capture(&'a hir::Capture), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { @@ -149,7 +150,7 @@ impl<'a> HeapVisitor<'a> { fn induct(&mut self, hir: &'a Hir) -> Option> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), - HirKind::Group(ref x) => Some(Frame::Group(x)), + HirKind::Capture(ref x) => Some(Frame::Capture(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) @@ -167,7 +168,7 @@ impl<'a> HeapVisitor<'a> { fn pop(&self, induct: Frame<'a>) -> Option> { match induct { Frame::Repetition(_) => None, - Frame::Group(_) => None, + Frame::Capture(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None @@ -194,8 +195,8 @@ impl<'a> Frame<'a> { /// child HIR node to visit. fn child(&self) -> &'a Hir { match *self { - Frame::Repetition(rep) => &rep.hir, - Frame::Group(group) => &group.hir, + Frame::Repetition(rep) => &rep.sub, + Frame::Capture(capture) => &capture.sub, Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index 1dfb38af39..4953641d73 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -3,14 +3,14 @@ This crate provides a robust regular expression parser. This crate defines two primary types: -* [`Ast`](ast/enum.Ast.html) is the abstract syntax of a regular expression. +* [`Ast`](ast::Ast) is the abstract syntax of a regular expression. An abstract syntax corresponds to a *structured representation* of the concrete syntax of a regular expression, where the concrete syntax is the pattern string itself (e.g., `foo(bar)+`). 
Given some abstract syntax, it can be converted back to the original concrete syntax (modulo some details, like whitespace). To a first approximation, the abstract syntax is complex and difficult to analyze. -* [`Hir`](hir/struct.Hir.html) is the high-level intermediate representation +* [`Hir`](hir::Hir) is the high-level intermediate representation ("HIR" or "high-level IR" for short) of regular expression. It corresponds to an intermediate state of a regular expression that sits between the abstract syntax and the low level compiled opcodes that are eventually responsible for @@ -22,14 +22,15 @@ This crate defines two primary types: These two types come with conversion routines: -* An [`ast::parse::Parser`](ast/parse/struct.Parser.html) converts concrete - syntax (a `&str`) to an [`Ast`](ast/enum.Ast.html). -* A [`hir::translate::Translator`](hir/translate/struct.Translator.html) - converts an [`Ast`](ast/enum.Ast.html) to a [`Hir`](hir/struct.Hir.html). +* An [`ast::parse::Parser`] converts concrete syntax (a `&str`) to an +[`Ast`](ast::Ast). +* A [`hir::translate::Translator`] converts an [`Ast`](ast::Ast) to a +[`Hir`](hir::Hir). As a convenience, the above two conversion routines are combined into one via -the top-level [`Parser`](struct.Parser.html) type. This `Parser` will first -convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. +the top-level [`Parser`] type. This `Parser` will first convert your pattern to +an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as top-level +[`parse`] free function. # Example @@ -37,14 +38,14 @@ convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. 
This example shows how to parse a pattern string into its HIR: ``` -use regex_syntax::Parser; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::{hir::Hir, parse}; -let hir = Parser::new().parse("a|b").unwrap(); +let hir = parse("a|b")?; assert_eq!(hir, Hir::alternation(vec![ - Hir::literal(hir::Literal::Unicode('a')), - Hir::literal(hir::Literal::Unicode('b')), + Hir::literal("a".as_bytes()), + Hir::literal("b".as_bytes()), ])); +# Ok::<(), Box>(()) ``` @@ -81,10 +82,9 @@ in a monospace font. # Literal extraction -This crate provides limited support for -[literal extraction from `Hir` values](hir/literal/struct.Literals.html). -Be warned that literal extraction currently uses recursion, and therefore, -stack size proportional to the size of the `Hir`. +This crate provides limited support for [literal extraction from `Hir` +values](hir::literal). Be warned that literal extraction uses recursion, and +therefore, stack size proportional to the size of the `Hir`. The purpose of literal extraction is to speed up searches. That is, if you know a regular expression must match a prefix or suffix literal, then it is @@ -116,6 +116,11 @@ match semantics of a regular expression. The following features are available: +* **std** - + Enables support for the standard library. This feature is enabled by default. + When disabled, only `core` and `alloc` are used. Otherwise, enabling `std` + generally just enables `std::error::Error` trait impls for the various error + types. * **unicode** - Enables all Unicode features. This feature is enabled by default, and will always cover all Unicode features, even if more are added in the future. @@ -154,19 +159,32 @@ The following features are available: `\p{sb=ATerm}`. 
*/ -#![deny(missing_docs)] -#![warn(missing_debug_implementations)] +#![no_std] #![forbid(unsafe_code)] +#![deny(missing_docs, rustdoc::broken_intra_doc_links)] +#![warn(missing_debug_implementations)] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] + +#[cfg(any(test, feature = "std"))] +extern crate std; -pub use crate::error::{Error, Result}; -pub use crate::parser::{Parser, ParserBuilder}; -pub use crate::unicode::UnicodeWordError; +extern crate alloc; + +pub use crate::{ + error::Error, + parser::{parse, Parser, ParserBuilder}, + unicode::UnicodeWordError, +}; + +use alloc::string::String; pub mod ast; +mod debug; mod either; mod error; pub mod hir; mod parser; +mod rank; mod unicode; mod unicode_tables; pub mod utf8; @@ -197,13 +215,43 @@ pub fn escape_into(text: &str, buf: &mut String) { /// Returns true if the given character has significance in a regex. /// -/// These are the only characters that are allowed to be escaped, with one -/// exception: an ASCII space character may be escaped when extended mode (with -/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns -/// `false`. +/// Generally speaking, these are the only characters which _must_ be escaped +/// in order to match their literal meaning. For example, to match a literal +/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For +/// example, `-` is treated as a meta character because of its significance +/// for writing ranges inside of character classes, but the regex `-` will +/// match a literal `-` because `-` has no special meaning outside of character +/// classes. +/// +/// In order to determine whether a character may be escaped at all, the +/// [`is_escapeable_character`] routine should be used. The difference between +/// `is_meta_character` and `is_escapeable_character` is that the latter will +/// return true for some characters that are _not_ meta characters. For +/// example, `%` and `\%` both match a literal `%` in all contexts. 
In other +/// words, `is_escapeable_character` includes "superfluous" escapes. /// /// Note that the set of characters for which this function returns `true` or -/// `false` is fixed and won't change in a semver compatible release. +/// `false` is fixed and won't change in a semver compatible release. (In this +/// case, "semver compatible release" actually refers to the `regex` crate +/// itself, since reducing or expanding the set of meta characters would be a +/// breaking change for not just `regex-syntax` but also `regex` itself.) +/// +/// # Example +/// +/// ``` +/// use regex_syntax::is_meta_character; +/// +/// assert!(is_meta_character('?')); +/// assert!(is_meta_character('-')); +/// assert!(is_meta_character('&')); +/// assert!(is_meta_character('#')); +/// +/// assert!(!is_meta_character('%')); +/// assert!(!is_meta_character('/')); +/// assert!(!is_meta_character('!')); +/// assert!(!is_meta_character('"')); +/// assert!(!is_meta_character('e')); +/// ``` pub fn is_meta_character(c: char) -> bool { match c { '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' @@ -212,6 +260,68 @@ pub fn is_meta_character(c: char) -> bool { } } +/// Returns true if the given character can be escaped in a regex. +/// +/// This returns true in all cases that `is_meta_character` returns true, but +/// also returns true in some cases where `is_meta_character` returns false. +/// For example, `%` is not a meta character, but it is escapeable. That is, +/// `%` and `\%` both match a literal `%` in all contexts. +/// +/// The purpose of this routine is to provide knowledge about what characters +/// may be escaped. Namely, most regex engines permit "superfluous" escapes +/// where characters without any special significance may be escaped even +/// though there is no actual _need_ to do so. +/// +/// This will return false for some characters. For example, `e` is not +/// escapeable. 
Therefore, `\e` will either result in a parse error (which is +/// true today), or it could backwards compatibly evolve into a new construct +/// with its own meaning. Indeed, that is the purpose of banning _some_ +/// superfluous escapes: it provides a way to evolve the syntax in a compatible +/// manner. +/// +/// # Example +/// +/// ``` +/// use regex_syntax::is_escapeable_character; +/// +/// assert!(is_escapeable_character('?')); +/// assert!(is_escapeable_character('-')); +/// assert!(is_escapeable_character('&')); +/// assert!(is_escapeable_character('#')); +/// assert!(is_escapeable_character('%')); +/// assert!(is_escapeable_character('/')); +/// assert!(is_escapeable_character('!')); +/// assert!(is_escapeable_character('"')); +/// +/// assert!(!is_escapeable_character('e')); +/// ``` +pub fn is_escapeable_character(c: char) -> bool { + // Certainly escapeable if it's a meta character. + if is_meta_character(c) { + return true; + } + // Any character that isn't ASCII is definitely not escapeable. There's + // no real need to allow things like \☃ right? + if !c.is_ascii() { + return false; + } + // Otherwise, we basically say that everything is escapeable unless it's a + // letter or digit. Things like \3 are either octal (when enabled) or an + // error, and we should keep it that way. Otherwise, letters are reserved + // for adding new syntax in a backwards compatible way. + match c { + '0'..='9' | 'A'..='Z' | 'a'..='z' => false, + // While not currently supported, we keep these as not escapeable to + // give us some flexibility with respect to supporting the \< and + // \> word boundary assertions in the future. By rejecting them as + // escapeable, \< and \> will result in a parse error. Thus, we can + // turn them into something else in the future without it being a + // backwards incompatible change. + '<' | '>' => false, + _ => true, + } +} + /// Returns true if and only if the given character is a Unicode word /// character. 
/// @@ -224,10 +334,9 @@ pub fn is_meta_character(c: char) -> bool { /// /// # Panics /// -/// If the `unicode-perl` feature is not enabled, then this function panics. -/// For this reason, it is recommended that callers use -/// [`try_is_word_character`](fn.try_is_word_character.html) -/// instead. +/// If the `unicode-perl` feature is not enabled, then this function +/// panics. For this reason, it is recommended that callers use +/// [`try_is_word_character`] instead. pub fn is_word_character(c: char) -> bool { try_is_word_character(c).expect("unicode-perl feature must be enabled") } @@ -248,7 +357,7 @@ pub fn is_word_character(c: char) -> bool { /// returns an error. pub fn try_is_word_character( c: char, -) -> std::result::Result { +) -> core::result::Result { unicode::is_word_character(c) } @@ -265,6 +374,8 @@ pub fn is_word_byte(c: u8) -> bool { #[cfg(test)] mod tests { + use alloc::string::ToString; + use super::*; #[test] diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index ded95b280a..2e7a2bb80c 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -1,16 +1,26 @@ -use crate::ast; -use crate::hir; +use crate::{ast, hir, Error}; -use crate::Result; +/// A convenience routine for parsing a regex using default options. +/// +/// This is equivalent to `Parser::new().parse(pattern)`. +/// +/// If you need to set non-default options, then use a [`ParserBuilder`]. +/// +/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically +/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator +/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then +/// you should use a [`ast::parse::Parser`]. +pub fn parse(pattern: &str) -> Result { + Parser::new().parse(pattern) +} /// A builder for a regular expression parser. /// /// This builder permits modifying configuration options for the parser. 
/// -/// This type combines the builder options for both the -/// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html) -/// and the -/// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html). +/// This type combines the builder options for both the [AST +/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR +/// `TranslatorBuilder`](hir::translate::TranslatorBuilder). #[derive(Clone, Debug, Default)] pub struct ParserBuilder { ast: ast::parse::ParserBuilder, @@ -78,19 +88,23 @@ impl ParserBuilder { self } - /// When enabled, the parser will permit the construction of a regular + /// When disabled, translation will permit the construction of a regular /// expression that may match invalid UTF-8. /// - /// When disabled (the default), the parser is guaranteed to produce - /// an expression that will only ever match valid UTF-8 (otherwise, the - /// parser will return an error). - /// - /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII - /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause - /// the parser to return an error. Namely, a negated ASCII word boundary - /// can result in matching positions that aren't valid UTF-8 boundaries. - pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder { - self.hir.allow_invalid_utf8(yes); + /// When enabled (the default), the translator is guaranteed to produce an + /// expression that, for non-empty matches, will only ever produce spans + /// that are entirely valid UTF-8 (otherwise, the translator will return an + /// error). + /// + /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even + /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete + /// syntax) will be allowed even though they can produce matches that split + /// a UTF-8 encoded codepoint. 
This only applies to zero-width or "empty" + /// matches, and it is expected that the regex engine itself must handle + /// these cases if necessary (perhaps by suppressing any zero-width matches + /// that split a codepoint). + pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.utf8(yes); self } @@ -134,6 +148,23 @@ impl ParserBuilder { self } + /// Enable or disable the CRLF mode flag by default. + /// + /// By default this is disabled. It may alternatively be selectively + /// enabled in the regular expression itself via the `R` flag. + /// + /// When CRLF mode is enabled, the following happens: + /// + /// * Unless `dot_matches_new_line` is enabled, `.` will match any character + /// except for `\r` and `\n`. + /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, + /// `\r` and `\n` as line terminators. And in particular, neither will + /// match between a `\r` and a `\n`. + pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder { + self.hir.crlf(yes); + self + } + /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively @@ -148,9 +179,9 @@ impl ParserBuilder { /// By default this is **enabled**. It may alternatively be selectively /// disabled in the regular expression itself via the `u` flag. /// - /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by - /// default), a regular expression will fail to parse if Unicode mode is - /// disabled and a sub-expression could possibly match invalid UTF-8. + /// Note that unless `utf8` is disabled (it's enabled by default), a + /// regular expression will fail to parse if Unicode mode is disabled and a + /// sub-expression could possibly match invalid UTF-8. pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.unicode(yes); self @@ -167,10 +198,9 @@ impl ParserBuilder { /// convenience for never having to deal with it at all. 
/// /// If callers have more fine grained use cases that need an AST, then please -/// see the [`ast::parse`](ast/parse/index.html) module. +/// see the [`ast::parse`] module. /// -/// A `Parser` can be configured in more detail via a -/// [`ParserBuilder`](struct.ParserBuilder.html). +/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. #[derive(Clone, Debug)] pub struct Parser { ast: ast::parse::Parser, @@ -184,15 +214,14 @@ impl Parser { /// a high level intermediate representation of the given regular /// expression. /// - /// To set configuration options on the parser, use - /// [`ParserBuilder`](struct.ParserBuilder.html). + /// To set configuration options on the parser, use [`ParserBuilder`]. pub fn new() -> Parser { ParserBuilder::new().build() } /// Parse the regular expression into a high level intermediate /// representation. - pub fn parse(&mut self, pattern: &str) -> Result { + pub fn parse(&mut self, pattern: &str) -> Result { let ast = self.ast.parse(pattern)?; let hir = self.hir.translate(pattern, &ast)?; Ok(hir) diff --git a/regex-syntax/src/rank.rs b/regex-syntax/src/rank.rs new file mode 100644 index 0000000000..ccb25a20ae --- /dev/null +++ b/regex-syntax/src/rank.rs @@ -0,0 +1,258 @@ +pub(crate) const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' 
+ 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' 
+ 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/regex-syntax/src/unicode.rs b/regex-syntax/src/unicode.rs index 8194d7f55b..91bd4b1203 100644 --- a/regex-syntax/src/unicode.rs +++ b/regex-syntax/src/unicode.rs @@ -1,12 +1,10 @@ -use std::error; -use std::fmt; -use std::result; +use alloc::{ + string::{String, ToString}, + vec::Vec, +}; use crate::hir; -/// A type alias for errors specific to Unicode handling of classes. -pub type Result = result::Result; - /// An inclusive range of codepoints from a generated file (hence the static /// lifetime). type Range = &'static [(char, char)]; @@ -24,9 +22,6 @@ pub enum Error { PerlClassNotFound, } -/// A type alias for errors specific to Unicode case folding. -pub type FoldResult = result::Result; - /// An error that occurs when Unicode-aware simple case folding fails. 
/// /// This error can occur when the case mapping tables necessary for Unicode @@ -35,10 +30,11 @@ pub type FoldResult = result::Result; #[derive(Debug)] pub struct CaseFoldError(()); -impl error::Error for CaseFoldError {} +#[cfg(feature = "std")] +impl std::error::Error for CaseFoldError {} -impl fmt::Display for CaseFoldError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for CaseFoldError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Unicode-aware case folding is not available \ @@ -55,10 +51,11 @@ impl fmt::Display for CaseFoldError { #[derive(Debug)] pub struct UnicodeWordError(()); -impl error::Error for UnicodeWordError {} +#[cfg(feature = "std")] +impl std::error::Error for UnicodeWordError {} -impl fmt::Display for UnicodeWordError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl core::fmt::Display for UnicodeWordError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Unicode-aware \\w class is not available \ @@ -67,74 +64,122 @@ impl fmt::Display for UnicodeWordError { } } -/// Return an iterator over the equivalence class of simple case mappings -/// for the given codepoint. The equivalence class does not include the -/// given codepoint. +/// A state oriented traverser of the simple case folding table. /// -/// If the equivalence class is empty, then this returns the next scalar -/// value that has a non-empty equivalence class, if it exists. If no such -/// scalar value exists, then `None` is returned. The point of this behavior -/// is to permit callers to avoid calling `simple_fold` more than they need -/// to, since there is some cost to fetching the equivalence class. +/// A case folder can be constructed via `SimpleCaseFolder::new()`, which will +/// return an error if the underlying case folding table is unavailable. /// -/// This returns an error if the Unicode case folding tables are not available. 
-pub fn simple_fold( - c: char, -) -> FoldResult, Option>> { - #[cfg(not(feature = "unicode-case"))] - fn imp( - _: char, - ) -> FoldResult, Option>> - { - use std::option::IntoIter; - Err::, _>, _>(CaseFoldError(())) - } +/// After construction, it is expected that callers will use +/// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly +/// increasing order. For example, calling it on `b` and then on `a` is illegal +/// and will result in a panic. +/// +/// The main idea of this type is that it tries hard to make mapping lookups +/// fast by exploiting the structure of the underlying table, and the ordering +/// assumption enables this. +#[derive(Debug)] +pub struct SimpleCaseFolder { + /// The simple case fold table. It's a sorted association list, where the + /// keys are Unicode scalar values and the values are the corresponding + /// equivalence class (not including the key) of the "simple" case folded + /// Unicode scalar values. + table: &'static [(char, &'static [char])], + /// The last codepoint that was used for a lookup. + last: Option, + /// The index to the entry in `table` corresponding to the smallest key `k` + /// such that `k > k0`, where `k0` is the most recent key lookup. Note that + /// in particular, `k0` may not be in the table! + next: usize, +} - #[cfg(feature = "unicode-case")] - fn imp( - c: char, - ) -> FoldResult, Option>> - { - use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - - Ok(CASE_FOLDING_SIMPLE - .binary_search_by_key(&c, |&(c1, _)| c1) - .map(|i| CASE_FOLDING_SIMPLE[i].1.iter().copied()) - .map_err(|i| { - if i >= CASE_FOLDING_SIMPLE.len() { - None - } else { - Some(CASE_FOLDING_SIMPLE[i].0) - } - })) +impl SimpleCaseFolder { + /// Create a new simple case folder, returning an error if the underlying + /// case folding table is unavailable. 
+ pub fn new() -> Result { + #[cfg(not(feature = "unicode-case"))] + { + Err(CaseFoldError(())) + } + #[cfg(feature = "unicode-case")] + { + Ok(SimpleCaseFolder { + table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, + last: None, + next: 0, + }) + } } - imp(c) -} - -/// Returns true if and only if the given (inclusive) range contains at least -/// one Unicode scalar value that has a non-empty non-trivial simple case -/// mapping. -/// -/// This function panics if `end < start`. -/// -/// This returns an error if the Unicode case folding tables are not available. -pub fn contains_simple_case_mapping( - start: char, - end: char, -) -> FoldResult { - #[cfg(not(feature = "unicode-case"))] - fn imp(_: char, _: char) -> FoldResult { - Err(CaseFoldError(())) + /// Return the equivalence class of case folded codepoints for the given + /// codepoint. The equivalence class returned never includes the codepoint + /// given. If the given codepoint has no case folded codepoints (i.e., + /// no entry in the underlying case folding table), then this returns an + /// empty slice. + /// + /// # Panics + /// + /// This panics when called with a `c` that is less than or equal to the + /// previous call. In other words, callers need to use this method with + /// strictly increasing values of `c`. + pub fn mapping(&mut self, c: char) -> &'static [char] { + if let Some(last) = self.last { + assert!( + last < c, + "got codepoint U+{:X} which occurs before \ + last codepoint U+{:X}", + u32::from(c), + u32::from(last), + ); + } + self.last = Some(c); + if self.next >= self.table.len() { + return &[]; + } + let (k, v) = self.table[self.next]; + if k == c { + self.next += 1; + return v; + } + match self.get(c) { + Err(i) => { + self.next = i; + &[] + } + Ok(i) => { + // Since we require lookups to proceed + // in order, anything we find should be + // after whatever we thought might be + // next. 
Otherwise, the caller is either + // going out of order or we would have + // found our next key at 'self.next'. + assert!(i > self.next); + self.next = i + 1; + self.table[i].1 + } + } } - #[cfg(feature = "unicode-case")] - fn imp(start: char, end: char) -> FoldResult { - use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE; - use std::cmp::Ordering; + /// Returns true if and only if the given range overlaps with any region + /// of the underlying case folding table. That is, when true, there exists + /// at least one codepoint in the inclusive range `[start, end]` that has + /// a non-trivial equivalence class of case folded codepoints. Conversely, + /// when this returns false, all codepoints in the range `[start, end]` + /// correspond to the trivial equivalence class of case folded codepoints, + /// i.e., itself. + /// + /// This is useful to call before iterating over the codepoints in the + /// range and looking up the mapping for each. If you know none of the + /// mappings will return anything, then you might be able to skip doing it + /// altogether. + /// + /// # Panics + /// + /// This panics when `end < start`. + pub fn overlaps(&self, start: char, end: char) -> bool { + use core::cmp::Ordering; assert!(start <= end); - Ok(CASE_FOLDING_SIMPLE + self.table .binary_search_by(|&(c, _)| { if start <= c && c <= end { Ordering::Equal @@ -144,10 +189,15 @@ pub fn contains_simple_case_mapping( Ordering::Less } }) - .is_ok()) + .is_ok() } - imp(start, end) + /// Returns the index at which `c` occurs in the simple case fold table. If + /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < + /// c` and `table[i].0 > c`. + fn get(&self, c: char) -> Result { + self.table.binary_search_by_key(&c, |&(c1, _)| c1) + } } /// A query for finding a character class defined by Unicode. 
This supports @@ -185,7 +235,7 @@ pub enum ClassQuery<'a> { } impl<'a> ClassQuery<'a> { - fn canonicalize(&self) -> Result { + fn canonicalize(&self) -> Result { match *self { ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), ClassQuery::Binary(name) => self.canonical_binary(name), @@ -234,7 +284,10 @@ impl<'a> ClassQuery<'a> { } } - fn canonical_binary(&self, name: &str) -> Result { + fn canonical_binary( + &self, + name: &str, + ) -> Result { let norm = symbolic_name_normalize(name); // This is a special case where 'cf' refers to the 'Format' general @@ -243,7 +296,17 @@ impl<'a> ClassQuery<'a> { // a general category. (Currently, we don't even support the // 'Case_Folding' property. But if we do in the future, users will be // required to spell it out.) - if norm != "cf" { + // + // Also 'sc' refers to the 'Currency_Symbol' general category, but is + // also the abbreviation for the 'Script' property. So we avoid calling + // 'canonical_prop' for it too, which would erroneously normalize it + // to 'Script'. + // + // Another case: 'lc' is an abbreviation for the 'Cased_Letter' + // general category, but is also an abbreviation for the 'Lowercase_Mapping' + // property. We don't currently support the latter, so as with 'cf' + // above, we treat 'lc' as 'Cased_Letter'. + if norm != "cf" && norm != "sc" && norm != "lc" { if let Some(canon) = canonical_prop(&norm)? { return Ok(CanonicalClassQuery::Binary(canon)); } @@ -285,7 +348,7 @@ enum CanonicalClassQuery { /// Looks up a Unicode class given a query. If one doesn't exist, then /// `None` is returned. -pub fn class(query: ClassQuery<'_>) -> Result { +pub fn class(query: ClassQuery<'_>) -> Result { use self::CanonicalClassQuery::*; match query.canonicalize()? { @@ -322,14 +385,14 @@ pub fn class(query: ClassQuery<'_>) -> Result { /// Returns a Unicode aware class for \w. /// /// This returns an error if the data is not available for \w. 
-pub fn perl_word() -> Result { +pub fn perl_word() -> Result { #[cfg(not(feature = "unicode-perl"))] - fn imp() -> Result { + fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(feature = "unicode-perl")] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::perl_word::PERL_WORD; Ok(hir_class(PERL_WORD)) } @@ -340,20 +403,20 @@ pub fn perl_word() -> Result { /// Returns a Unicode aware class for \s. /// /// This returns an error if the data is not available for \s. -pub fn perl_space() -> Result { +pub fn perl_space() -> Result { #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] - fn imp() -> Result { + fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::perl_space::WHITE_SPACE; Ok(hir_class(WHITE_SPACE)) } #[cfg(feature = "unicode-bool")] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::property_bool::WHITE_SPACE; Ok(hir_class(WHITE_SPACE)) } @@ -364,20 +427,20 @@ pub fn perl_space() -> Result { /// Returns a Unicode aware class for \d. /// /// This returns an error if the data is not available for \d. -pub fn perl_digit() -> Result { +pub fn perl_digit() -> Result { #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] - fn imp() -> Result { + fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; Ok(hir_class(DECIMAL_NUMBER)) } #[cfg(feature = "unicode-gencat")] - fn imp() -> Result { + fn imp() -> Result { use crate::unicode_tables::general_category::DECIMAL_NUMBER; Ok(hir_class(DECIMAL_NUMBER)) } @@ -397,23 +460,24 @@ pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { /// Returns true only if the given codepoint is in the `\w` character class. 
/// /// If the `unicode-perl` feature is not enabled, then this returns an error. -pub fn is_word_character(c: char) -> result::Result { +pub fn is_word_character(c: char) -> Result { #[cfg(not(feature = "unicode-perl"))] - fn imp(_: char) -> result::Result { + fn imp(_: char) -> Result { Err(UnicodeWordError(())) } #[cfg(feature = "unicode-perl")] - fn imp(c: char) -> result::Result { - use crate::is_word_byte; - use crate::unicode_tables::perl_word::PERL_WORD; - use std::cmp::Ordering; + fn imp(c: char) -> Result { + use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD}; - if c <= 0x7F as char && is_word_byte(c as u8) { + // MSRV(1.59): Use 'u8::try_from(c)' instead. + if u8::try_from(u32::from(c)).map_or(false, is_word_byte) { return Ok(true); } Ok(PERL_WORD .binary_search_by(|&(start, end)| { + use core::cmp::Ordering; + if start <= c && c <= end { Ordering::Equal } else if start > c { @@ -435,7 +499,9 @@ pub fn is_word_character(c: char) -> result::Result { /// value. type PropertyValues = &'static [(&'static str, &'static str)]; -fn canonical_gencat(normalized_value: &str) -> Result> { +fn canonical_gencat( + normalized_value: &str, +) -> Result, Error> { Ok(match normalized_value { "any" => Some("Any"), "assigned" => Some("Assigned"), @@ -447,7 +513,9 @@ fn canonical_gencat(normalized_value: &str) -> Result> { }) } -fn canonical_script(normalized_value: &str) -> Result> { +fn canonical_script( + normalized_value: &str, +) -> Result, Error> { let scripts = property_values("Script")?.unwrap(); Ok(canonical_value(scripts, normalized_value)) } @@ -460,7 +528,9 @@ fn canonical_script(normalized_value: &str) -> Result> { /// UAX44 LM3, which can be done using `symbolic_name_normalize`. /// /// If the property names data is not available, then an error is returned. 
-fn canonical_prop(normalized_name: &str) -> Result> { +fn canonical_prop( + normalized_name: &str, +) -> Result, Error> { #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", @@ -469,7 +539,7 @@ fn canonical_prop(normalized_name: &str) -> Result> { feature = "unicode-script", feature = "unicode-segment", )))] - fn imp(_: &str) -> Result> { + fn imp(_: &str) -> Result, Error> { Err(Error::PropertyNotFound) } @@ -481,7 +551,7 @@ fn canonical_prop(normalized_name: &str) -> Result> { feature = "unicode-script", feature = "unicode-segment", ))] - fn imp(name: &str) -> Result> { + fn imp(name: &str) -> Result, Error> { use crate::unicode_tables::property_names::PROPERTY_NAMES; Ok(PROPERTY_NAMES @@ -517,7 +587,7 @@ fn canonical_value( /// If the property values data is not available, then an error is returned. fn property_values( canonical_property_name: &'static str, -) -> Result> { +) -> Result, Error> { #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", @@ -526,7 +596,7 @@ fn property_values( feature = "unicode-script", feature = "unicode-segment", )))] - fn imp(_: &'static str) -> Result> { + fn imp(_: &'static str) -> Result, Error> { Err(Error::PropertyValueNotFound) } @@ -538,7 +608,7 @@ fn property_values( feature = "unicode-script", feature = "unicode-segment", ))] - fn imp(name: &'static str) -> Result> { + fn imp(name: &'static str) -> Result, Error> { use crate::unicode_tables::property_values::PROPERTY_VALUES; Ok(PROPERTY_VALUES @@ -569,15 +639,15 @@ fn property_set( /// /// If the given age value isn't valid or if the data isn't available, then an /// error is returned instead. 
-fn ages(canonical_age: &str) -> Result> { +fn ages(canonical_age: &str) -> Result, Error> { #[cfg(not(feature = "unicode-age"))] - fn imp(_: &str) -> Result> { - use std::option::IntoIter; + fn imp(_: &str) -> Result, Error> { + use core::option::IntoIter; Err::, _>(Error::PropertyNotFound) } #[cfg(feature = "unicode-age")] - fn imp(canonical_age: &str) -> Result> { + fn imp(canonical_age: &str) -> Result, Error> { use crate::unicode_tables::age; const AGES: &[(&str, Range)] = &[ @@ -625,14 +695,14 @@ fn ages(canonical_age: &str) -> Result> { /// /// If the given general category could not be found, or if the general /// category data is not available, then an error is returned. -fn gencat(canonical_name: &'static str) -> Result { +fn gencat(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-gencat"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-gencat")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::general_category::BY_NAME; match name { "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), @@ -660,14 +730,14 @@ fn gencat(canonical_name: &'static str) -> Result { /// /// If the given script could not be found, or if the script data is not /// available, then an error is returned. -fn script(canonical_name: &'static str) -> Result { +fn script(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-script"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-script")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::script::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -685,14 +755,14 @@ fn script(canonical_name: &'static str) -> Result { /// not available, then an error is returned. 
fn script_extension( canonical_name: &'static str, -) -> Result { +) -> Result { #[cfg(not(feature = "unicode-script"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-script")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::script_extension::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -709,14 +779,16 @@ fn script_extension( /// /// If the given boolean property could not be found, or if the boolean /// property data is not available, then an error is returned. -fn bool_property(canonical_name: &'static str) -> Result { +fn bool_property( + canonical_name: &'static str, +) -> Result { #[cfg(not(feature = "unicode-bool"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-bool")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::property_bool::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -737,14 +809,14 @@ fn bool_property(canonical_name: &'static str) -> Result { /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. -fn gcb(canonical_name: &'static str) -> Result { +fn gcb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::grapheme_cluster_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -761,14 +833,14 @@ fn gcb(canonical_name: &'static str) -> Result { /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. 
-fn wb(canonical_name: &'static str) -> Result { +fn wb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::word_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -785,14 +857,14 @@ fn wb(canonical_name: &'static str) -> Result { /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. -fn sb(canonical_name: &'static str) -> Result { +fn sb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] - fn imp(_: &'static str) -> Result { + fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] - fn imp(name: &'static str) -> Result { + fn imp(name: &'static str) -> Result { use crate::unicode_tables::sentence_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) @@ -873,72 +945,45 @@ fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { #[cfg(test)] mod tests { - use super::{ - contains_simple_case_mapping, simple_fold, symbolic_name_normalize, - symbolic_name_normalize_bytes, - }; + use super::*; #[cfg(feature = "unicode-case")] fn simple_fold_ok(c: char) -> impl Iterator { - simple_fold(c).unwrap().unwrap() - } - - #[cfg(feature = "unicode-case")] - fn simple_fold_err(c: char) -> Option { - match simple_fold(c).unwrap() { - Ok(_) => unreachable!("simple_fold returned Ok iterator"), - Err(next) => next, - } + SimpleCaseFolder::new().unwrap().mapping(c).iter().copied() } #[cfg(feature = "unicode-case")] fn contains_case_map(start: char, end: char) -> bool { - contains_simple_case_mapping(start, end).unwrap() + SimpleCaseFolder::new().unwrap().overlaps(start, end) } #[test] #[cfg(feature = "unicode-case")] fn 
simple_fold_k() { let xs: Vec = simple_fold_ok('k').collect(); - assert_eq!(xs, vec!['K', 'K']); + assert_eq!(xs, alloc::vec!['K', 'K']); let xs: Vec = simple_fold_ok('K').collect(); - assert_eq!(xs, vec!['k', 'K']); + assert_eq!(xs, alloc::vec!['k', 'K']); let xs: Vec = simple_fold_ok('K').collect(); - assert_eq!(xs, vec!['K', 'k']); + assert_eq!(xs, alloc::vec!['K', 'k']); } #[test] #[cfg(feature = "unicode-case")] fn simple_fold_a() { let xs: Vec = simple_fold_ok('a').collect(); - assert_eq!(xs, vec!['A']); + assert_eq!(xs, alloc::vec!['A']); let xs: Vec = simple_fold_ok('A').collect(); - assert_eq!(xs, vec!['a']); - } - - #[test] - #[cfg(feature = "unicode-case")] - fn simple_fold_empty() { - assert_eq!(Some('A'), simple_fold_err('?')); - assert_eq!(Some('A'), simple_fold_err('@')); - assert_eq!(Some('a'), simple_fold_err('[')); - assert_eq!(Some('Ⰰ'), simple_fold_err('☃')); - } - - #[test] - #[cfg(feature = "unicode-case")] - fn simple_fold_max() { - assert_eq!(None, simple_fold_err('\u{10FFFE}')); - assert_eq!(None, simple_fold_err('\u{10FFFF}')); + assert_eq!(xs, alloc::vec!['a']); } #[test] #[cfg(not(feature = "unicode-case"))] fn simple_fold_disabled() { - assert!(simple_fold('a').is_err()); + assert!(SimpleCaseFolder::new().is_err()); } #[test] @@ -957,12 +1002,6 @@ mod tests { assert!(!contains_case_map('☃', '☃')); } - #[test] - #[cfg(not(feature = "unicode-case"))] - fn range_contains_disabled() { - assert!(contains_simple_case_mapping('a', 'a').is_err()); - } - #[test] #[cfg(feature = "unicode-gencat")] fn regression_466() { diff --git a/regex-syntax/src/utf8.rs b/regex-syntax/src/utf8.rs index b9c8655320..e13b55abf0 100644 --- a/regex-syntax/src/utf8.rs +++ b/regex-syntax/src/utf8.rs @@ -3,7 +3,7 @@ Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. This is sub-module is useful for constructing byte based automatons that need to embed UTF-8 decoding. 
The most common use of this module is in conjunction -with the [`hir::ClassUnicodeRange`](../hir/struct.ClassUnicodeRange.html) type. +with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type. See the documentation on the `Utf8Sequences` iterator for more details and an example. @@ -80,12 +80,9 @@ I also got the idea from which uses it for executing automata on their term index. */ -#![deny(missing_docs)] +use core::{char, fmt, iter::FusedIterator, slice}; -use std::char; -use std::fmt; -use std::iter::FusedIterator; -use std::slice; +use alloc::{vec, vec::Vec}; const MAX_UTF8_BYTES: usize = 4; @@ -306,7 +303,7 @@ impl Utf8Sequences { /// given. pub fn new(start: char, end: char) -> Self { let mut it = Utf8Sequences { range_stack: vec![] }; - it.push(start as u32, end as u32); + it.push(u32::from(start), u32::from(end)); it } @@ -317,7 +314,7 @@ impl Utf8Sequences { #[doc(hidden)] pub fn reset(&mut self, start: char, end: char) { self.range_stack.clear(); - self.push(start as u32, end as u32); + self.push(u32::from(start), u32::from(end)); } fn push(&mut self, start: u32, end: u32) { @@ -416,7 +413,9 @@ impl ScalarRange { /// values in this range can be encoded as a single byte. 
fn as_ascii(&self) -> Option { if self.is_ascii() { - Some(Utf8Range::new(self.start as u8, self.end as u8)) + let start = u8::try_from(self.start).unwrap(); + let end = u8::try_from(self.end).unwrap(); + Some(Utf8Range::new(start, end)) } else { None } @@ -455,7 +454,9 @@ fn max_scalar_value(nbytes: usize) -> u32 { #[cfg(test)] mod tests { - use std::char; + use core::char; + + use alloc::{vec, vec::Vec}; use crate::utf8::{Utf8Range, Utf8Sequences}; @@ -472,7 +473,11 @@ mod tests { "Sequence ({:X}, {:X}) contains range {:?}, \ which matches surrogate code point {:X} \ with encoded bytes {:?}", - start as u32, end as u32, r, cp, buf, + u32::from(start), + u32::from(end), + r, + cp, + buf, ); } } @@ -579,9 +584,9 @@ mod tests { assert!(0xD800 <= cp && cp < 0xE000); let mut dst = [0; 3]; - dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B; - dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT; - dst[2] = (cp & 0x3F) as u8 | TAG_CONT; + dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B; + dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT; + dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT; dst } } diff --git a/regex-syntax/test b/regex-syntax/test index 4b1b9fb1a9..a4d6cfaba5 100755 --- a/regex-syntax/test +++ b/regex-syntax/test @@ -7,6 +7,7 @@ echo "===== DEFAULT FEATURES ===" cargo test features=( + std unicode unicode-age unicode-bool @@ -17,6 +18,9 @@ features=( unicode-segment ) for f in "${features[@]}"; do - echo "===== FEATURE: $f ===" - cargo test --no-default-features --features "$f" + echo "=== FEATURE: $f ===" + # We only run library tests because I couldn't figure out how to easily + # make doc tests run in 'no_std' mode. In particular, without the Error + # trait, using '?' in doc tests seems tricky. 
+ cargo test --no-default-features --lib --features "$f" done diff --git a/src/compile.rs b/src/compile.rs index 90ca25015f..0030cfb108 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -4,7 +4,7 @@ use std::iter; use std::result; use std::sync::Arc; -use regex_syntax::hir::{self, Hir}; +use regex_syntax::hir::{self, Hir, Look}; use regex_syntax::is_word_byte; use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; @@ -142,8 +142,10 @@ impl Compiler { // Other matching engines handle this by baking the logic into the // matching engine itself. let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; - self.compiled.is_anchored_start = expr.is_anchored_start(); - self.compiled.is_anchored_end = expr.is_anchored_end(); + self.compiled.is_anchored_start = + expr.properties().look_set_prefix().contains(Look::Start); + self.compiled.is_anchored_end = + expr.properties().look_set_suffix().contains(Look::End); if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; self.compiled.start = dotstar_patch.entry; @@ -159,6 +161,8 @@ impl Compiler { self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; self.push_compiled(Inst::Match(0)); + self.compiled.static_captures_len = + expr.properties().static_explicit_captures_len(); self.compile_finish() } @@ -168,10 +172,12 @@ impl Compiler { ) -> result::Result { debug_assert!(exprs.len() > 1); - self.compiled.is_anchored_start = - exprs.iter().all(|e| e.is_anchored_start()); - self.compiled.is_anchored_end = - exprs.iter().all(|e| e.is_anchored_end()); + self.compiled.is_anchored_start = exprs + .iter() + .all(|e| e.properties().look_set_prefix().contains(Look::Start)); + self.compiled.is_anchored_end = exprs + .iter() + .all(|e| e.properties().look_set_suffix().contains(Look::End)); let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 }; if self.compiled.needs_dotstar() { dotstar_patch = self.c_dotstar()?; @@ -272,17 +278,21 @@ impl Compiler { self.check_size()?; match 
*expr.kind() { Empty => self.c_empty(), - Literal(hir::Literal::Unicode(c)) => self.c_char(c), - Literal(hir::Literal::Byte(b)) => { - assert!(self.compiled.uses_bytes()); - self.c_byte(b) + Literal(hir::Literal(ref bytes)) => { + if self.compiled.is_reverse { + let mut bytes = bytes.to_vec(); + bytes.reverse(); + self.c_literal(&bytes) + } else { + self.c_literal(bytes) + } } Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()), Class(hir::Class::Bytes(ref cls)) => { if self.compiled.uses_bytes() { self.c_class_bytes(cls.ranges()) } else { - assert!(cls.is_all_ascii()); + assert!(cls.is_ascii()); let mut char_ranges = vec![]; for r in cls.iter() { let (s, e) = (r.start() as char, r.end() as char); @@ -291,92 +301,94 @@ impl Compiler { self.c_class(&char_ranges) } } - Anchor(hir::Anchor::StartLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) if self.compiled.is_reverse => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::StartLine) - } - Anchor(hir::Anchor::EndLine) => { - self.byte_classes.set_range(b'\n', b'\n'); - self.c_empty_look(prog::EmptyLook::EndLine) - } - Anchor(hir::Anchor::StartText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::EndText) - } - Anchor(hir::Anchor::StartText) => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) if self.compiled.is_reverse => { - self.c_empty_look(prog::EmptyLook::StartText) - } - Anchor(hir::Anchor::EndText) => { - self.c_empty_look(prog::EmptyLook::EndText) - } - WordBoundary(hir::WordBoundary::Unicode) => { - if !cfg!(feature = "unicode-perl") { - return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" - 
.to_string(), - )); + Look(ref look) => match *look { + hir::Look::Start if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::EndText) } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // We also make sure that all ASCII bytes are in a different - // class from non-ASCII bytes. Otherwise, it's possible for - // ASCII bytes to get lumped into the same class as non-ASCII - // bytes. This in turn may cause the lazy DFA to falsely start - // when it sees an ASCII byte that maps to a byte class with - // non-ASCII bytes. This ensures that never happens. - self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::WordBoundary) - } - WordBoundary(hir::WordBoundary::UnicodeNegate) => { - if !cfg!(feature = "unicode-perl") { + hir::Look::Start => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End if self.compiled.is_reverse => { + self.c_empty_look(prog::EmptyLook::StartText) + } + hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText), + hir::Look::StartLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF if self.compiled.is_reverse => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::StartLine) + } + hir::Look::EndLF => { + self.byte_classes.set_range(b'\n', b'\n'); + self.c_empty_look(prog::EmptyLook::EndLine) + } + hir::Look::StartCRLF | hir::Look::EndCRLF => { return Err(Error::Syntax( - "Unicode word boundaries are unavailable when \ - the unicode-perl feature is disabled" + "CRLF-aware line anchors are not supported yet" .to_string(), )); } - self.compiled.has_unicode_word_boundary = true; - self.byte_classes.set_word_boundary(); - // See comments above for why we set the ASCII range here. 
- self.byte_classes.set_range(0, 0x7F); - self.c_empty_look(prog::EmptyLook::NotWordBoundary) - } - WordBoundary(hir::WordBoundary::Ascii) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) - } - WordBoundary(hir::WordBoundary::AsciiNegate) => { - self.byte_classes.set_word_boundary(); - self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) - } - Group(ref g) => match g.kind { - hir::GroupKind::NonCapturing => self.c(&g.hir), - hir::GroupKind::CaptureIndex(index) => { - if index as usize >= self.compiled.captures.len() { - self.compiled.captures.push(None); + hir::Look::WordAscii => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::WordBoundaryAscii) + } + hir::Look::WordAsciiNegate => { + self.byte_classes.set_word_boundary(); + self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii) + } + hir::Look::WordUnicode => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); } - self.c_capture(2 * index as usize, &g.hir) + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // We also make sure that all ASCII bytes are in a different + // class from non-ASCII bytes. Otherwise, it's possible for + // ASCII bytes to get lumped into the same class as non-ASCII + // bytes. This in turn may cause the lazy DFA to falsely start + // when it sees an ASCII byte that maps to a byte class with + // non-ASCII bytes. This ensures that never happens. 
+ self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::WordBoundary) } - hir::GroupKind::CaptureName { index, ref name } => { - if index as usize >= self.compiled.captures.len() { - let n = name.to_string(); - self.compiled.captures.push(Some(n.clone())); - self.capture_name_idx.insert(n, index as usize); + hir::Look::WordUnicodeNegate => { + if !cfg!(feature = "unicode-perl") { + return Err(Error::Syntax( + "Unicode word boundaries are unavailable when \ + the unicode-perl feature is disabled" + .to_string(), + )); } - self.c_capture(2 * index as usize, &g.hir) + self.compiled.has_unicode_word_boundary = true; + self.byte_classes.set_word_boundary(); + // See comments above for why we set the ASCII range here. + self.byte_classes.set_range(0, 0x7F); + self.c_empty_look(prog::EmptyLook::NotWordBoundary) } }, + Capture(hir::Capture { index, ref name, ref sub }) => { + if index as usize >= self.compiled.captures.len() { + let name = match *name { + None => None, + Some(ref boxed_str) => Some(boxed_str.to_string()), + }; + self.compiled.captures.push(name.clone()); + if let Some(name) = name { + self.capture_name_idx.insert(name, index as usize); + } + } + self.c_capture(2 * index as usize, sub) + } Concat(ref es) => { if self.compiled.is_reverse { self.c_concat(es.iter().rev()) @@ -420,21 +432,19 @@ impl Compiler { } fn c_dotstar(&mut self) -> Result { - Ok(if !self.compiled.only_utf8() { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, - greedy: false, - hir: Box::new(Hir::any(true)), - }))? - .unwrap() + let hir = if self.compiled.only_utf8() { + Hir::dot(hir::Dot::AnyChar) } else { - self.c(&Hir::repetition(hir::Repetition { - kind: hir::RepetitionKind::ZeroOrMore, + Hir::dot(hir::Dot::AnyByte) + }; + Ok(self + .c(&Hir::repetition(hir::Repetition { + min: 0, + max: None, greedy: false, - hir: Box::new(Hir::any(false)), + sub: Box::new(hir), }))? 
- .unwrap() - }) + .unwrap()) } fn c_char(&mut self, c: char) -> ResultOrEmpty { @@ -457,7 +467,11 @@ impl Compiler { fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty { use std::mem::size_of; - assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } if self.compiled.uses_bytes() { Ok(Some(CompileClass { c: self, ranges }.compile()?)) } else { @@ -482,7 +496,11 @@ impl Compiler { &mut self, ranges: &[hir::ClassBytesRange], ) -> ResultOrEmpty { - debug_assert!(!ranges.is_empty()); + if ranges.is_empty() { + return Err(Error::Syntax( + "empty character classes are not allowed".to_string(), + )); + } let first_split_entry = self.insts.len(); let mut holes = vec![]; @@ -513,6 +531,52 @@ impl Compiler { Ok(Some(Patch { hole, entry: self.insts.len() - 1 })) } + fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty { + match core::str::from_utf8(bytes) { + Ok(string) => { + let mut it = string.chars(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(ch) => { + if let Some(p) = self.c_char(ch)? { + break p; + } + } + } + }; + for ch in it { + if let Some(p) = self.c_char(ch)? { + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + Err(_) => { + assert!(self.compiled.uses_bytes()); + let mut it = bytes.iter().copied(); + let Patch { mut hole, entry } = loop { + match it.next() { + None => return self.c_empty(), + Some(byte) => { + if let Some(p) = self.c_byte(byte)? { + break p; + } + } + } + }; + for byte in it { + if let Some(p) = self.c_byte(byte)? 
{ + self.fill(hole, p.entry); + hole = p.hole; + } + } + Ok(Some(Patch { hole, entry })) + } + } + } + fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty where I: IntoIterator, @@ -587,19 +651,15 @@ impl Compiler { } fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty { - use regex_syntax::hir::RepetitionKind::*; - match rep.kind { - ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy), - ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy), - OneOrMore => self.c_repeat_one_or_more(&rep.hir, rep.greedy), - Range(hir::RepetitionRange::Exactly(min_max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min_max, min_max) - } - Range(hir::RepetitionRange::AtLeast(min)) => { - self.c_repeat_range_min_or_more(&rep.hir, rep.greedy, min) + match (rep.min, rep.max) { + (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy), + (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy), + (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy), + (min, None) => { + self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min) } - Range(hir::RepetitionRange::Bounded(min, max)) => { - self.c_repeat_range(&rep.hir, rep.greedy, min, max) + (min, Some(max)) => { + self.c_repeat_range(&rep.sub, rep.greedy, min, max) } } } diff --git a/src/dfa.rs b/src/dfa.rs index dc9952120e..78ed71021e 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1576,7 +1576,7 @@ impl<'a> Fsm<'a> { /// inputs, a new state could be created for every byte of input. (This is /// bad for memory use, so we bound it with a cache.) fn approximate_size(&self) -> usize { - self.cache.size + self.prog.approximate_size() + self.cache.size } } diff --git a/src/error.rs b/src/error.rs index 3e0ec75210..6c341f604b 100644 --- a/src/error.rs +++ b/src/error.rs @@ -6,8 +6,26 @@ use std::iter::repeat; pub enum Error { /// A syntax error. Syntax(String), - /// The compiled program exceeded the set size limit. - /// The argument is the size limit imposed. 
+ /// The compiled program exceeded the set size + /// limit. The argument is the size limit imposed by + /// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit). Even + /// when not configured explicitly, it defaults to a reasonable limit. + /// + /// If you're getting this error, it occurred because your regex has been + /// compiled to an intermediate state that is too big. It is important to + /// note that exceeding this limit does _not_ mean the regex is too big to + /// _work_, but rather, the regex is big enough that it may wind up being + /// surprisingly slow when used in a search. In other words, this error is + /// meant to be a practical heuristic for avoiding a performance footgun, + /// and especially so for the case where the regex pattern is coming from + /// an untrusted source. + /// + /// There are generally two ways to move forward if you hit this error. + /// The first is to find some way to use a smaller regex. The second is to + /// increase the size limit via `RegexBuilder::size_limit`. However, if + /// your regex pattern is not from a trusted source, then neither of these + /// approaches may be appropriate. Instead, you'll have to determine just + /// how big of a regex you want to allow. CompiledTooBig(usize), /// Hints that destructuring should not be exhaustive. /// diff --git a/src/exec.rs b/src/exec.rs index b9abcdc040..778a39d4c3 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -4,9 +4,9 @@ use std::panic::AssertUnwindSafe; use std::sync::Arc; #[cfg(feature = "perf-literal")] -use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; -use regex_syntax::hir::literal::Literals; -use regex_syntax::hir::Hir; +use aho_corasick::{AhoCorasick, MatchKind}; +use regex_syntax::hir::literal; +use regex_syntax::hir::{Hir, Look}; use regex_syntax::ParserBuilder; use crate::backtrack; @@ -78,15 +78,18 @@ struct ExecReadOnly { /// not supported.) 
Note that this program contains an embedded `.*?` /// preceding the first capture group, unless the regex is anchored at the /// beginning. + #[allow(dead_code)] dfa: Program, /// The same as above, except the program is reversed (and there is no /// preceding `.*?`). This is used by the DFA to find the starting location /// of matches. + #[allow(dead_code)] dfa_reverse: Program, /// A set of suffix literals extracted from the regex. /// /// Prefix literals are stored on the `Program`, since they are used inside /// the matching engines. + #[allow(dead_code)] suffixes: LiteralSearcher, /// An Aho-Corasick automaton with leftmost-first match semantics. /// @@ -98,7 +101,7 @@ struct ExecReadOnly { /// if we were to exhaust the ID space, we probably would have long /// surpassed the compilation size limit. #[cfg(feature = "perf-literal")] - ac: Option>, + ac: Option, /// match_type encodes as much upfront knowledge about how we're going to /// execute a search as possible. match_type: MatchType, @@ -121,8 +124,8 @@ pub struct ExecBuilder { /// literals. struct Parsed { exprs: Vec, - prefixes: Literals, - suffixes: Literals, + prefixes: literal::Seq, + suffixes: literal::Seq, bytes: bool, } @@ -228,8 +231,8 @@ impl ExecBuilder { /// Parse the current set of patterns into their AST and extract literals. 
fn parse(&self) -> Result { let mut exprs = Vec::with_capacity(self.options.pats.len()); - let mut prefixes = Some(Literals::empty()); - let mut suffixes = Some(Literals::empty()); + let mut prefixes = Some(literal::Seq::empty()); + let mut suffixes = Some(literal::Seq::empty()); let mut bytes = false; let is_set = self.options.pats.len() > 1; // If we're compiling a regex set and that set has any anchored @@ -243,54 +246,102 @@ impl ExecBuilder { .swap_greed(self.options.swap_greed) .ignore_whitespace(self.options.ignore_whitespace) .unicode(self.options.unicode) - .allow_invalid_utf8(!self.only_utf8) + .utf8(self.only_utf8) .nest_limit(self.options.nest_limit) .build(); let expr = parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; - bytes = bytes || !expr.is_always_utf8(); + let props = expr.properties(); + // This used to just check whether the HIR matched valid UTF-8 + // or not, but in regex-syntax 0.7, we changed our definition of + // "matches valid UTF-8" to exclude zero-width matches. And in + // particular, previously, we considered WordAsciiNegate (that + // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our + // matcher engines were built under this assumption and fixing + // them is not worth it with the imminent plan to switch over to + // regex-automata. So for now, we retain the previous behavior by + // just explicitly treating the presence of a negated ASCII word + // boundary as forcing us to use a byte oriented automaton. + bytes = bytes + || !props.is_utf8() + || props.look_set().contains(Look::WordAsciiNegate); if cfg!(feature = "perf-literal") { - if !expr.is_anchored_start() && expr.is_any_anchored_start() { + if !props.look_set_prefix().contains(Look::Start) + && props.look_set().contains(Look::Start) + { // Partial anchors unfortunately make it hard to use // prefixes, so disable them. 
prefixes = None; - } else if is_set && expr.is_anchored_start() { + } else if is_set + && props.look_set_prefix().contains(Look::Start) + { // Regex sets with anchors do not go well with literal // optimizations. prefixes = None; + } else if props.look_set_prefix().contains_word() { + // The new literal extractor ignores look-around while + // the old one refused to extract prefixes from regexes + // that began with a \b. These old creaky regex internals + // can't deal with it, so we drop it. + prefixes = None; + } else if props.look_set().contains(Look::StartLF) { + // Similar to the reasoning for word boundaries, this old + // regex engine can't handle literal prefixes with '(?m:^)' + // at the beginning of a regex. + prefixes = None; } - prefixes = prefixes.and_then(|mut prefixes| { - if !prefixes.union_prefixes(&expr) { - None - } else { - Some(prefixes) - } - }); - if !expr.is_anchored_end() && expr.is_any_anchored_end() { + if !props.look_set_suffix().contains(Look::End) + && props.look_set().contains(Look::End) + { // Partial anchors unfortunately make it hard to use // suffixes, so disable them. suffixes = None; - } else if is_set && expr.is_anchored_end() { + } else if is_set && props.look_set_suffix().contains(Look::End) + { // Regex sets with anchors do not go well with literal // optimizations. suffixes = None; + } else if props.look_set_suffix().contains_word() { + // See the prefix case for reasoning here. + suffixes = None; + } else if props.look_set().contains(Look::EndLF) { + // See the prefix case for reasoning here. 
+ suffixes = None; } - suffixes = suffixes.and_then(|mut suffixes| { - if !suffixes.union_suffixes(&expr) { - None + + let (mut pres, mut suffs) = + if prefixes.is_none() && suffixes.is_none() { + (literal::Seq::infinite(), literal::Seq::infinite()) } else { - Some(suffixes) - } + literal_analysis(&expr) + }; + // These old creaky regex internals can't handle cases where + // the literal sequences are exact but there are look-around + // assertions. So we make sure the sequences are inexact if + // there are look-around assertions anywhere. This forces the + // regex engines to run instead of assuming that a literal + // match implies an overall match. + if !props.look_set().is_empty() { + pres.make_inexact(); + suffs.make_inexact(); + } + prefixes = prefixes.and_then(|mut prefixes| { + prefixes.union(&mut pres); + Some(prefixes) + }); + suffixes = suffixes.and_then(|mut suffixes| { + suffixes.union(&mut suffs); + Some(suffixes) }); } exprs.push(expr); } Ok(Parsed { exprs, - prefixes: prefixes.unwrap_or_else(Literals::empty), - suffixes: suffixes.unwrap_or_else(Literals::empty), + prefixes: prefixes.unwrap_or_else(literal::Seq::empty), + suffixes: suffixes.unwrap_or_else(literal::Seq::empty), bytes, }) } @@ -356,7 +407,7 @@ impl ExecBuilder { } #[cfg(feature = "perf-literal")] - fn build_aho_corasick(&self, parsed: &Parsed) -> Option> { + fn build_aho_corasick(&self, parsed: &Parsed) -> Option { if parsed.exprs.len() != 1 { return None; } @@ -370,10 +421,9 @@ impl ExecBuilder { return None; } Some( - AhoCorasickBuilder::new() + AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) - .auto_configure(&lits) - .build_with_size::(&lits) + .build(&lits) // This should never happen because we'd long exceed the // compilation limit for regexes first. 
.expect("AC automaton too big"), @@ -1311,6 +1361,12 @@ impl Exec { pub fn capture_name_idx(&self) -> &Arc> { &self.ro.nfa.capture_name_idx } + + /// If the number of capture groups in every match is always the same, then + /// return that number. Otherwise return `None`. + pub fn static_captures_len(&self) -> Option { + self.ro.nfa.static_captures_len + } } impl Clone for Exec { @@ -1557,7 +1613,7 @@ fn alternation_literals(expr: &Hir) -> Option>> { // optimization pipeline, because this is a terribly inflexible way to go // about things. - if !expr.is_alternation_literal() { + if !expr.properties().is_alternation_literal() { return None; } let alts = match *expr.kind() { @@ -1565,25 +1621,19 @@ fn alternation_literals(expr: &Hir) -> Option>> { _ => return None, // one literal isn't worth it }; - let extendlit = |lit: &Literal, dst: &mut Vec| match *lit { - Literal::Unicode(c) => { - let mut buf = [0; 4]; - dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); - } - Literal::Byte(b) => { - dst.push(b); - } - }; - let mut lits = vec![]; for alt in alts { let mut lit = vec![]; match *alt.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes) + } HirKind::Concat(ref exprs) => { for e in exprs { match *e.kind() { - HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes); + } _ => unreachable!("expected literal, got {:?}", e), } } @@ -1595,6 +1645,48 @@ fn alternation_literals(expr: &Hir) -> Option>> { Some(lits) } +#[cfg(not(feature = "perf-literal"))] +fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) { + (literal::Seq::infinite(), literal::Seq::infinite()) +} + +#[cfg(feature = "perf-literal")] +fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) { + const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)]; + + let mut prefixes = literal::Extractor::new() + 
.kind(literal::ExtractKind::Prefix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match prefixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + prefixes.keep_first_bytes(keep); + prefixes.minimize_by_preference(); + } + + let mut suffixes = literal::Extractor::new() + .kind(literal::ExtractKind::Suffix) + .extract(expr); + for (keep, limit) in ATTEMPTS { + let len = match suffixes.len() { + None => break, + Some(len) => len, + }; + if len <= limit { + break; + } + suffixes.keep_last_bytes(keep); + suffixes.minimize_by_preference(); + } + + (prefixes, suffixes) +} + #[cfg(test)] mod test { #[test] diff --git a/src/expand.rs b/src/expand.rs index 67b514926a..98fafc949f 100644 --- a/src/expand.rs +++ b/src/expand.rs @@ -182,7 +182,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { }) } -/// Returns true if and only if the given byte is allowed in a capture name. +/// Returns true if and only if the given byte is allowed in a capture name +/// written in non-brace form. fn is_valid_cap_letter(b: u8) -> bool { match b { b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, @@ -236,4 +237,11 @@ mod tests { find!(find_cap_ref17, "$x_$y", c!("x_", 3)); find!(find_cap_ref18, "${#}", c!("#", 4)); find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); + find!(find_cap_ref20, "${¾}", c!("¾", 5)); + find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); + find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); + find!(find_cap_ref23, "${☃}", c!("☃", 6)); + find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); + find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); + find!(find_cap_ref26, "${名字}", c!("名字", 9)); } diff --git a/src/lib.rs b/src/lib.rs index 6b95739c5c..82c1b77ad8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -199,6 +199,8 @@ instead.) This implementation executes regular expressions **only** on valid UTF-8 while exposing match locations as byte indices into the search string. 
(To relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) +Conceptually, the regex engine works by matching a haystack as if it were a +sequence of Unicode scalar values. Only simple case folding is supported. Namely, when matching case-insensitively, the characters are first mapped using the "simple" case @@ -285,9 +287,9 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). . any character except new line (includes new line with s flag) \d digit (\p{Nd}) \D not digit -\pN One-letter name Unicode character class +\pX Unicode character class identified by a one-letter name \p{Greek} Unicode character class (general category or script) -\PN Negated one-letter name Unicode character class +\PX Negated Unicode character class identified by a one-letter name \P{Greek} negated Unicode character class (general category or script) @@ -325,6 +327,25 @@ xy concatenation (x followed by y) x|y alternation (x or y, prefer x) +This example shows how an alternation works, and what it means to prefer a +branch in the alternation over subsequent branches. + +``` +use regex::Regex; + +let haystack = "samwise"; +// If 'samwise' comes first in our alternation, then it is +// preferred as a match, even if the regex engine could +// technically detect that 'sam' led to a match earlier. +let re = Regex::new(r"samwise|sam").unwrap(); +assert_eq!("samwise", re.find(haystack).unwrap().as_str()); +// But if 'sam' comes first, then it will match instead. +// In this case, it is impossible for 'samwise' to match +// because 'sam' is a prefix of it. +let re = Regex::new(r"sam|samwise").unwrap(); +assert_eq!("sam", re.find(haystack).unwrap().as_str()); +``` + ## Repetitions
@@ -360,12 +381,19 @@ regex matches `abc` at positions `0`, `1`, `2` and `3`.
 
 
 (exp)          numbered capture group (indexed by opening parenthesis)
-(?P<name>exp)  named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
+(?P<name>exp)  named (also numbered) capture group (names must be alpha-numeric)
+(?<name>exp)   named (also numbered) capture group (names must be alpha-numeric)
 (?:exp)        non-capturing group
 (?flags)       set flags within current group
 (?flags:exp)   set flags for exp (non-capturing)
 
+Capture group names must be any sequence of alpha-numeric Unicode codepoints, +in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or +an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic` +Unicode property, while numeric codepoints correspond to the union of the +`Decimal_Number`, `Letter_Number` and `Other_Number` general categories. + Flags are each a single character. For example, `(?x)` sets the flag `x` and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets @@ -379,9 +407,13 @@ m multi-line mode: ^ and $ match begin/end of line s allow . to match \n U swap the meaning of x* and x*? u Unicode support (enabled by default) -x ignore whitespace and allow line comments (starting with `#`) +x verbose mode, ignores whitespace and allows line comments (starting with `#`)
+Note that in verbose mode, whitespace is ignored everywhere, including within +character classes. To insert whitespace, use its escaped form or a hex literal. +For example, `\ ` or `\x20` for an ASCII space. + Flags can be toggled within a pattern. Here's an example that matches case-insensitively for the first part but case-sensitively for the second part: diff --git a/src/literal/imp.rs b/src/literal/imp.rs index 90b2f11606..75fa6e37b2 100644 --- a/src/literal/imp.rs +++ b/src/literal/imp.rs @@ -1,8 +1,8 @@ use std::mem; -use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; +use aho_corasick::{self, packed, AhoCorasick}; use memchr::{memchr, memchr2, memchr3, memmem}; -use regex_syntax::hir::literal::{Literal, Literals}; +use regex_syntax::hir::literal::{Literal, Seq}; /// A prefix extracted from a compiled regular expression. /// @@ -26,7 +26,7 @@ enum Matcher { /// A single substring, using vector accelerated routines when available. Memmem(Memmem), /// An Aho-Corasick automaton. - AC { ac: AhoCorasick, lits: Vec }, + AC { ac: AhoCorasick, lits: Vec }, /// A packed multiple substring searcher, using SIMD. /// /// Note that Aho-Corasick will actually use this packed searcher @@ -39,27 +39,26 @@ enum Matcher { impl LiteralSearcher { /// Returns a matcher that never matches and never advances the input. pub fn empty() -> Self { - Self::new(Literals::empty(), Matcher::Empty) + Self::new(Seq::infinite(), Matcher::Empty) } /// Returns a matcher for literal prefixes from the given set. - pub fn prefixes(lits: Literals) -> Self { + pub fn prefixes(lits: Seq) -> Self { let matcher = Matcher::prefixes(&lits); Self::new(lits, matcher) } /// Returns a matcher for literal suffixes from the given set. 
- pub fn suffixes(lits: Literals) -> Self { + pub fn suffixes(lits: Seq) -> Self { let matcher = Matcher::suffixes(&lits); Self::new(lits, matcher) } - fn new(lits: Literals, matcher: Matcher) -> Self { - let complete = lits.all_complete(); + fn new(lits: Seq, matcher: Matcher) -> Self { LiteralSearcher { - complete, - lcp: Memmem::new(lits.longest_common_prefix()), - lcs: Memmem::new(lits.longest_common_suffix()), + complete: lits.is_exact(), + lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")), + lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")), matcher, } } @@ -150,7 +149,7 @@ impl LiteralSearcher { Empty => 0, Bytes(ref sset) => sset.dense.len(), Memmem(_) => 1, - AC { ref ac, .. } => ac.pattern_count(), + AC { ref ac, .. } => ac.patterns_len(), Packed { ref lits, .. } => lits.len(), } } @@ -162,27 +161,31 @@ impl LiteralSearcher { Empty => 0, Bytes(ref sset) => sset.approximate_size(), Memmem(ref single) => single.approximate_size(), - AC { ref ac, .. } => ac.heap_bytes(), - Packed { ref s, .. } => s.heap_bytes(), + AC { ref ac, .. } => ac.memory_usage(), + Packed { ref s, .. } => s.memory_usage(), } } } impl Matcher { - fn prefixes(lits: &Literals) -> Self { + fn prefixes(lits: &Seq) -> Self { let sset = SingleByteSet::prefixes(lits); Matcher::new(lits, sset) } - fn suffixes(lits: &Literals) -> Self { + fn suffixes(lits: &Seq) -> Self { let sset = SingleByteSet::suffixes(lits); Matcher::new(lits, sset) } - fn new(lits: &Literals, sset: SingleByteSet) -> Self { - if lits.literals().is_empty() { + fn new(lits: &Seq, sset: SingleByteSet) -> Self { + if lits.is_empty() || lits.min_literal_len() == Some(0) { return Matcher::Empty; } + let lits = match lits.literals() { + None => return Matcher::Empty, + Some(members) => members, + }; if sset.dense.len() >= 26 { // Avoid trying to match a large number of single bytes. 
// This is *very* sensitive to a frequency analysis comparison @@ -195,26 +198,26 @@ impl Matcher { if sset.complete { return Matcher::Bytes(sset); } - if lits.literals().len() == 1 { - return Matcher::Memmem(Memmem::new(&lits.literals()[0])); + if lits.len() == 1 { + return Matcher::Memmem(Memmem::new(lits[0].as_bytes())); } - let pats = lits.literals().to_owned(); + let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect(); let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; - if lits.literals().len() <= 100 && !is_aho_corasick_fast { + if lits.len() <= 100 && !is_aho_corasick_fast { let mut builder = packed::Config::new() .match_kind(packed::MatchKind::LeftmostFirst) .builder(); if let Some(s) = builder.extend(&pats).build() { - return Matcher::Packed { s, lits: pats }; + return Matcher::Packed { s, lits: lits.to_owned() }; } } - let ac = AhoCorasickBuilder::new() + let ac = AhoCorasick::builder() .match_kind(aho_corasick::MatchKind::LeftmostFirst) - .dfa(true) - .build_with_size::(&pats) + .kind(Some(aho_corasick::AhoCorasickKind::DFA)) + .build(&pats) .unwrap(); - Matcher::AC { ac, lits: pats } + Matcher::AC { ac, lits: lits.to_owned() } } } @@ -257,7 +260,7 @@ impl<'a> Iterator for LiteralIter<'a> { } else { let next = &lits[0]; *lits = &lits[1..]; - Some(&**next) + Some(next.as_bytes()) } } LiteralIter::Packed(ref mut lits) => { @@ -266,7 +269,7 @@ impl<'a> Iterator for LiteralIter<'a> { } else { let next = &lits[0]; *lits = &lits[1..]; - Some(&**next) + Some(next.as_bytes()) } } } @@ -291,11 +294,15 @@ impl SingleByteSet { } } - fn prefixes(lits: &Literals) -> SingleByteSet { + fn prefixes(lits: &Seq) -> SingleByteSet { let mut sset = SingleByteSet::new(); - for lit in lits.literals() { + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { sset.complete = sset.complete && lit.len() == 1; - if let Some(&b) = lit.get(0) { + if let Some(&b) = lit.as_bytes().get(0) { if 
!sset.sparse[b as usize] { if b > 0x7F { sset.all_ascii = false; @@ -308,11 +315,15 @@ impl SingleByteSet { sset } - fn suffixes(lits: &Literals) -> SingleByteSet { + fn suffixes(lits: &Seq) -> SingleByteSet { let mut sset = SingleByteSet::new(); - for lit in lits.literals() { + let lits = match lits.literals() { + None => return sset, + Some(lits) => lits, + }; + for lit in lits.iter() { sset.complete = sset.complete && lit.len() == 1; - if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { + if let Some(&b) = lit.as_bytes().last() { if !sset.sparse[b as usize] { if b > 0x7F { sset.all_ascii = false; diff --git a/src/literal/mod.rs b/src/literal/mod.rs index 980f523309..b9fb77aed9 100644 --- a/src/literal/mod.rs +++ b/src/literal/mod.rs @@ -6,7 +6,7 @@ mod imp; #[allow(missing_docs)] #[cfg(not(feature = "perf-literal"))] mod imp { - use regex_syntax::hir::literal::Literals; + use regex_syntax::hir::literal::Seq; #[derive(Clone, Debug)] pub struct LiteralSearcher(()); @@ -16,11 +16,11 @@ mod imp { LiteralSearcher(()) } - pub fn prefixes(_: Literals) -> Self { + pub fn prefixes(_: Seq) -> Self { LiteralSearcher(()) } - pub fn suffixes(_: Literals) -> Self { + pub fn suffixes(_: Seq) -> Self { LiteralSearcher(()) } diff --git a/src/prog.rs b/src/prog.rs index c211f71d8a..100862cf1b 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -27,6 +27,9 @@ pub struct Program { pub captures: Vec>, /// Pointers to all named capture groups into `captures`. pub capture_name_idx: Arc>, + /// If the number of capture groups is the same for all possible matches, + /// then this is that number. + pub static_captures_len: Option, /// A pointer to the start instruction. This can vary depending on how /// the program was compiled. 
For example, programs for use with the DFA /// engine have a `.*?` inserted at the beginning of unanchored regular @@ -83,6 +86,7 @@ impl Program { matches: vec![], captures: vec![], capture_name_idx: Arc::new(HashMap::new()), + static_captures_len: None, start: 0, byte_classes: vec![0; 256], only_utf8: true, diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 07e9f98acc..e3a3b019b5 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -17,7 +17,7 @@ use crate::re_trait::{self, RegularExpression, SubCapturesPosIter}; /// Match represents a single match of a regex in a haystack. /// /// The lifetime parameter `'t` refers to the lifetime of the matched text. -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Eq, PartialEq)] pub struct Match<'t> { text: &'t [u8], start: usize, @@ -37,6 +37,18 @@ impl<'t> Match<'t> { self.end } + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. #[inline] @@ -57,6 +69,24 @@ impl<'t> Match<'t> { } } + +impl<'t> std::fmt::Debug for Match<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let mut fmt = f.debug_struct("Match"); + fmt.field("start", &self.start).field("end", &self.end); + if let Ok(s) = std::str::from_utf8(self.as_bytes()) { + fmt.field("bytes", &s); + } else { + // FIXME: It would be nice if this could be printed as a string + // with invalid UTF-8 replaced with hex escapes. An alloc would + // probably be okay if that makes it easier, but regex-automata does + // (at time of writing) have internal routines that do this. So + // maybe we should expose them. 
+ fmt.field("bytes", &self.as_bytes()); + } + fmt.finish() + } +} + impl<'t> From> for Range { fn from(m: Match<'t>) -> Range { m.range() @@ -253,12 +283,7 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t [u8]) -> Option> { - let mut locs = self.capture_locations(); - self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { - text, - locs: locs.0, - named_groups: self.0.capture_name_idx().clone(), - }) + self.captures_at(text, 0) } /// Returns an iterator over all the non-overlapping capture groups matched @@ -537,7 +562,14 @@ impl Regex { /// This method may have the same performance characteristics as /// `is_match`, except it provides an end location for a match. In /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match. + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. /// /// # Example /// @@ -598,6 +630,25 @@ impl Regex { .map(|(s, e)| Match::new(text, s, e)) } + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. 
+ pub fn captures_at<'t>( + &self, + text: &'t [u8], + start: usize, + ) -> Option> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, start).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + /// This is like `captures`, but uses /// [`CaptureLocations`](struct.CaptureLocations.html) /// instead of @@ -667,6 +718,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { @@ -856,6 +947,27 @@ impl<'r> FusedIterator for CaptureNames<'r> {} /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. 
+/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` #[derive(Clone, Debug)] pub struct CaptureLocations(re_trait::Locations); diff --git a/src/re_set.rs b/src/re_set.rs index a6d886d761..7c8253f0ca 100644 --- a/src/re_set.rs +++ b/src/re_set.rs @@ -289,6 +289,12 @@ impl RegexSet { } } +impl Default for RegexSet { + fn default() -> Self { + RegexSet::empty() + } +} + /// A set of matches returned by a regex set. #[derive(Clone, Debug)] pub struct SetMatches { @@ -315,6 +321,11 @@ impl SetMatches { } /// The total number of regexes in the set that created these matches. + /// + /// **WARNING:** This always returns the same value as [`RegexSet::len`]. + /// In particular, it does *not* return the number of elements yielded by + /// [`SetMatches::iter`]. The only way to determine the total number of + /// matched regexes is to iterate over them. pub fn len(&self) -> usize { self.matches.len() } diff --git a/src/re_trait.rs b/src/re_trait.rs index d0c717df5a..505810c848 100644 --- a/src/re_trait.rs +++ b/src/re_trait.rs @@ -20,7 +20,7 @@ impl Locations { /// not match anything. The positions returned are *always* byte indices /// with respect to the original string matched. pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - let (s, e) = (i * 2, i * 2 + 1); + let (s, e) = (i.checked_mul(2)?, i.checked_mul(2)?.checked_add(1)?); match (self.0.get(s), self.0.get(e)) { (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), _ => None, diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 197510ea0d..57689086dc 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -25,7 +25,7 @@ pub fn escape(text: &str) -> String { /// Match represents a single match of a regex in a haystack. /// /// The lifetime parameter `'t` refers to the lifetime of the matched text. 
-#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Eq, PartialEq)] pub struct Match<'t> { text: &'t str, start: usize, @@ -45,6 +45,18 @@ impl<'t> Match<'t> { self.end } + /// Returns true if and only if this match has a length of zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. #[inline] @@ -65,6 +77,16 @@ impl<'t> Match<'t> { } } +impl<'t> std::fmt::Debug for Match<'t> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Match") + .field("start", &self.start) + .field("end", &self.end) + .field("string", &self.as_str()) + .finish() + } +} + impl<'t> From> for &'t str { fn from(m: Match<'t>) -> &'t str { m.as_str() @@ -309,12 +331,7 @@ impl Regex { /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. pub fn captures<'t>(&self, text: &'t str) -> Option> { - let mut locs = self.capture_locations(); - self.captures_read_at(&mut locs, text, 0).map(move |_| Captures { - text, - locs: locs.0, - named_groups: self.0.capture_name_idx().clone(), - }) + self.captures_at(text, 0) } /// Returns an iterator over all the non-overlapping capture groups matched @@ -595,7 +612,14 @@ impl Regex { /// This method may have the same performance characteristics as /// `is_match`, except it provides an end location for a match. In /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match. + /// of the leftmost-first match that you would find via `Regex::find`. + /// + /// Note that it is not guaranteed that this routine finds the shortest or + /// "earliest" possible match. 
Instead, the main idea of this API is that + /// it returns the offset at the point at which the internal regex engine + /// has determined that a match has occurred. This may vary depending on + /// which internal regex engine is used, and thus, the offset itself may + /// change. /// /// # Example /// @@ -615,12 +639,12 @@ impl Regex { self.shortest_match_at(text, 0) } - /// Returns the same as shortest_match, but starts the search at the given - /// offset. + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. /// /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. + /// context into consideration. For example, the `\A` anchor can only match + /// when `start == 0`. pub fn shortest_match_at( &self, text: &str, @@ -656,6 +680,25 @@ impl Regex { .map(|(s, e)| Match::new(text, s, e)) } + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + pub fn captures_at<'t>( + &self, + text: &'t str, + start: usize, + ) -> Option> { + let mut locs = self.capture_locations(); + self.captures_read_at(&mut locs, text, start).map(move |_| Captures { + text, + locs: locs.0, + named_groups: self.0.capture_name_idx().clone(), + }) + } + /// This is like `captures`, but uses /// [`CaptureLocations`](struct.CaptureLocations.html) /// instead of @@ -725,6 +768,46 @@ impl Regex { self.0.capture_names().len() } + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. 
That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option { + self.0.static_captures_len().map(|len| len.saturating_add(1)) + } + /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { @@ -866,6 +949,27 @@ impl<'r, 't> FusedIterator for SplitN<'r, 't> {} /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. 
+/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new(r"(?\w+)\s+(?\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` #[derive(Clone, Debug)] pub struct CaptureLocations(re_trait::Locations); diff --git a/tests/replace.rs b/tests/replace.rs index d65be072ff..f23c575515 100644 --- a/tests/replace.rs +++ b/tests/replace.rs @@ -15,7 +15,7 @@ replace!(all, replace_all, r"[0-9]", "age: 26", t!("Z"), "age: ZZ"); replace!( groups, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", t!("$2 $1"), "w2 w1" @@ -23,7 +23,7 @@ replace!( replace!( double_dollar, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", t!("$2 $$1"), "w2 $1" @@ -33,7 +33,7 @@ replace!( replace!( named, replace_all, - r"(?-u)(?P\S+)\s+(?P\S+)(?P\s*)", + r"(?P[^ ]+)[ ]+(?P[^ ]+)(?P[ ]*)", "w1 w2 w3 w4", t!("$last $first$space"), "w2 w1 w4 w3" @@ -51,7 +51,7 @@ replace!(number_hypen, replace, r"(.)(.)", "ab", t!("$1-$2"), "a-b"); replace!( simple_expand, replace_all, - r"(?-u)(\w) (\w)", + r"([a-z]) ([a-z])", "a b", t!("$2 $1"), "b a" @@ -59,7 +59,7 @@ replace!( replace!( literal_dollar1, replace_all, - r"(?-u)(\w+) (\w+)", + r"([a-z]+) ([a-z]+)", "a b", t!("$$1"), "$1" @@ -67,7 +67,7 @@ replace!( replace!( literal_dollar2, replace_all, - r"(?-u)(\w+) (\w+)", + r"([a-z]+) ([a-z]+)", "a b", t!("$2 $$c $1"), "b $c a" @@ -75,7 +75,7 @@ replace!( replace!( no_expand1, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", no_expand!("$2 $1"), "$2 $1" @@ -83,7 +83,7 @@ replace!( replace!( 
no_expand2, replace, - r"(?-u)(\S+)\s+(\S+)", + r"([^ ]+)[ ]+([^ ]+)", "w1 w2", no_expand!("$$1"), "$$1" diff --git a/tests/set.rs b/tests/set.rs index 37fcf8700c..d1144d6623 100644 --- a/tests/set.rs +++ b/tests/set.rs @@ -65,3 +65,10 @@ fn len_and_empty() { assert_eq!(not_empty.len(), 2); assert!(!not_empty.is_empty()); } + +#[test] +fn default_set_is_empty() { + let set: regex::bytes::RegexSet = Default::default(); + assert_eq!(set.len(), 0); + assert!(set.is_empty()); +} diff --git a/tests/unicode.rs b/tests/unicode.rs index 9b32286247..d7dbdd31b8 100644 --- a/tests/unicode.rs +++ b/tests/unicode.rs @@ -35,6 +35,8 @@ mat!(uni_not_boundary_ogham, r"\d\B", "6 ", None); // We should test more, but there's a lot. Write a script to generate more of // these tests. mat!(uni_class_gencat_cased_letter, r"\p{Cased_Letter}", "A", Some((0, 3))); +mat!(uni_class_gencat_cased_letter2, r"\p{gc=LC}", "A", Some((0, 3))); +mat!(uni_class_gencat_cased_letter3, r"\p{LC}", "A", Some((0, 3))); mat!( uni_class_gencat_close_punctuation, r"\p{Close_Punctuation}", @@ -77,6 +79,7 @@ mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4))); // See: https://github.com/rust-lang/regex/issues/719 mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4))); mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4))); +mat!(uni_class_gencat_format_abbrev3, r"\p{Sc}", "$", Some((0, 1))); mat!( uni_class_gencat_initial_punctuation, r"\p{Initial_Punctuation}",