diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
index 7e7fed6cab65f..40897343cc471 100644
--- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
+++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -732,6 +732,11 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w
EmitIndexOfString_RightToLeft();
break;
+ case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
+ case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
+ EmitIndexOfStrings_LeftToRight();
+ break;
+
case FindNextStartingPositionMode.LeadingSet_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight:
EmitFixedSet_LeftToRight();
@@ -1041,6 +1046,37 @@ UnicodeCategory.NonSpacingMark or
}
}
+ // Emits a left-to-right search for any one of multiple leading prefixes, either case-sensitive or ordinal case-insensitive depending on the find mode.
+ void EmitIndexOfStrings_LeftToRight()
+ {
+ RegexFindOptimizations opts = regexTree.FindOptimizations;
+ Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);
+
+ string prefixes = string.Join(", ", opts.LeadingPrefixes.Select(prefix => Literal(prefix))); // comma-separated literals, used both in the field name hash and in the emitted initializer
+ StringComparison stringComparison = opts.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight ?
+ StringComparison.OrdinalIgnoreCase :
+ StringComparison.Ordinal;
+ string fieldName = GetSHA256FieldName($"s_indexOfAnyStrings_{stringComparison}_", prefixes);
+
+ if (!requiredHelpers.ContainsKey(fieldName)) // emit the shared helper field only once per distinct field name
+ {
+ requiredHelpers.Add(fieldName,
+ [
+ $"/// Supports searching for the specified strings.",
+ $"internal static readonly SearchValues<string> {fieldName} = SearchValues.Create([{prefixes}], StringComparison.{stringComparison});", // explicitly using an array in case prefixes is large
+ ]);
+ }
+
+ writer.WriteLine($"// The pattern has multiple strings that could begin the match. Search for any of them.");
+ writer.WriteLine($"// If none can be found, there's no match.");
+ writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOfAny({HelpersTypeName}.{fieldName});");
+ using (EmitBlock(writer, "if (i >= 0)"))
+ {
+ writer.WriteLine("base.runtextpos = pos + i;"); // i is relative to the slice, so add pos back
+ writer.WriteLine("return true;");
+ }
+ }
+
// Emits a case-sensitive right-to-left search for a substring.
void EmitIndexOfString_RightToLeft()
{
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
index c56ad4b5b6e05..ed67df6819023 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs
@@ -1054,6 +1054,21 @@ public static bool IsAscii(ReadOnlySpan s)
#endif
}
+ /// <summary>Gets whether the set description string is for two ASCII letters that case to each other under OrdinalIgnoreCase rules.</summary>
+ public static bool SetContainsAsciiOrdinalIgnoreCaseCharacter(string set, Span<char> twoChars)
+ {
+ Debug.Assert(twoChars.Length >= 2); // scratch space that receives the set's characters
+ return
+ !IsNegated(set) &&
+ GetSetChars(set, twoChars) == 2 && // the set must enumerate to exactly two characters...
+ twoChars[0] < 128 &&
+ twoChars[1] < 128 && // ...both ASCII...
+ twoChars[0] != twoChars[1] &&
+ char.IsLetter(twoChars[0]) &&
+ char.IsLetter(twoChars[1]) && // ...both letters...
+ (twoChars[0] | 0x20) == (twoChars[1] | 0x20); // ...that differ only in the 0x20 bit
+ }
+
/// Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.
/// This may enumerate negated characters if the set is negated. This will return false if the set has subtraction.
private static bool CanEasilyEnumerateSetContents(string set) =>
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
index bac950c6db2f9..caf8479199d36 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs
@@ -460,6 +460,8 @@ protected void EmitTryFindNextPossibleStartingPosition()
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
+ case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
+ case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
EmitIndexOfString_LeftToRight();
break;
@@ -745,15 +747,19 @@ bool EmitAnchors()
return false;
}
- // Emits a case-sensitive left-to-right search for a substring.
+ // Emits a left-to-right search for a substring or substrings, either case-sensitive or ordinal case-insensitive depending on the find mode.
void EmitIndexOfString_LeftToRight()
{
RegexFindOptimizations opts = _regexTree.FindOptimizations;
- Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight);
+ Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or
+ FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or
+ FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
+ FindNextStartingPositionMode.LeadingStrings_LeftToRight or
+ FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);
using RentedLocalBuilder i = RentInt32Local();
- // int i = inputSpan.Slice(pos).IndexOf(prefix);
+ // int i = inputSpan.Slice(pos)...
Ldloca(inputSpan);
Ldloc(pos);
if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight &&
@@ -763,11 +769,21 @@ void EmitIndexOfString_LeftToRight()
Add();
}
Call(s_spanSliceIntMethod);
- string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
- opts.LeadingPrefix :
- opts.FixedDistanceLiteral.String!;
- LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
- Call(s_spanIndexOfAnySearchValuesString);
+
+ // ...IndexOfAny(searchValues);
+ if (opts.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight)
+ {
+ LoadSearchValues(opts.LeadingPrefixes, opts.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
+ Call(s_spanIndexOfAnySearchValuesString);
+ }
+ else
+ {
+ string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
+ opts.LeadingPrefix :
+ opts.FixedDistanceLiteral.String!;
+ LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
+ Call(s_spanIndexOfAnySearchValuesString);
+ }
Stloc(i);
// if (i < 0) goto ReturnFalse;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
index f40f48e35a6d9..a8dc9f4fd0e58 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs
@@ -137,7 +137,28 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
return;
}
- // We're now left-to-right only and looking for sets.
+ // We're now left-to-right only and looking for multiple prefixes and/or sets.
+
+ // If there are multiple leading strings, we can search for any of them.
+ if (compiled)
+ {
+ if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: true) is { Length: > 1 } caseInsensitivePrefixes)
+ {
+ LeadingPrefixes = caseInsensitivePrefixes;
+ FindMode = FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight;
+ return;
+ }
+
+ // TODO: While some benchmarks benefit from this significantly, others regressed a bit (in particular those with few
+ // matches). Before enabling this, we need to investigate the performance impact on real-world scenarios,
+ // and see if there are ways to reduce the impact.
+ //if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: false) is { Length: > 1 } caseSensitivePrefixes)
+ //{
+ // LeadingPrefixes = caseSensitivePrefixes;
+ // FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight;
+ // return;
+ //}
+ }
// Build up a list of all of the sets that are a fixed distance from the start of the expression.
List? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter);
@@ -244,6 +265,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
/// Gets the leading prefix. May be an empty string.
public string LeadingPrefix { get; } = string.Empty;
+ /// <summary>Gets the leading prefixes. May be an empty array.</summary>
+ public string[] LeadingPrefixes { get; } = Array.Empty<string>();
+
/// When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern.
public (char Char, string? String, int Distance) FixedDistanceLiteral { get; }
@@ -767,10 +791,16 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan,
return false;
}
+ // Not supported in the interpreter, but we could end up here for patterns so complex the compiler gave up on them.
+
+ case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
+ case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
+ return true;
+
// Nothing special to look for. Just return true indicating this is a valid position to try to match.
default:
- Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
+ Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch, $"Unexpected FindMode {FindMode}");
return true;
}
}
@@ -810,6 +840,11 @@ internal enum FindNextStartingPositionMode
/// A multi-character ordinal case-insensitive substring at the beginning of the pattern.
LeadingString_OrdinalIgnoreCase_LeftToRight,
+ /// Multiple leading prefix strings
+ LeadingStrings_LeftToRight,
+ /// Multiple leading ordinal case-insensitive prefix strings
+ LeadingStrings_OrdinalIgnoreCase_LeftToRight,
+
/// A set starting the pattern.
LeadingSet_LeftToRight,
/// A set starting the right-to-left pattern.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index 5445f696423e4..335f9165856ff 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -2561,14 +2561,7 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
{
// In particular we want to look for sets that contain only the upper and lowercase variant
// of the same ASCII letter.
- if (RegexCharClass.IsNegated(child.Str!) ||
- RegexCharClass.GetSetChars(child.Str!, twoChars) != 2 ||
- twoChars[0] >= 128 ||
- twoChars[1] >= 128 ||
- twoChars[0] == twoChars[1] ||
- !char.IsLetter(twoChars[0]) ||
- !char.IsLetter(twoChars[1]) ||
- ((twoChars[0] | 0x20) != (twoChars[1] | 0x20)))
+ if (!RegexCharClass.SetContainsAsciiOrdinalIgnoreCaseCharacter(child.Str!, twoChars))
{
break;
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
index 1658e5bcdf2ad..35956b449390d 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs
@@ -11,6 +11,292 @@ namespace System.Text.RegularExpressions
/// Detects various forms of prefixes in the regular expression that can help FindFirstChars optimize its search.
internal static class RegexPrefixAnalyzer
{
+ /// <summary>Finds an array of multiple prefixes that a node can begin with.</summary>
+ /// <param name="node">The node to search.</param>
+ /// <param name="ignoreCase"><see langword="true"/> to find ordinal ignore-case prefixes; <see langword="false"/> for case-sensitive.</param>
+ /// <returns>
+ /// If a fixed set of prefixes is found, such that a match for this node is guaranteed to begin
+ /// with one of those prefixes, an array of those prefixes is returned. Otherwise, <see langword="null"/>.
+ /// </returns>
+ public static string[]? FindPrefixes(RegexNode node, bool ignoreCase)
+ {
+ // Minimum string length for prefixes to be useful. If any prefix has length 1,
+ // then we're generally better off just using IndexOfAny with chars.
+ const int MinPrefixLength = 2;
+
+ // Arbitrary string length limit (with some wiggle room) to avoid creating strings that are longer than is useful and consuming too much memory.
+ const int MaxPrefixLength = 8;
+
+ // Arbitrary limit on the number of prefixes to find. If we find more than this, we're likely to be spending too much time finding prefixes that won't be useful.
+ const int MaxPrefixes = 16;
+
+ // Analyze the node to find prefixes.
+ List<StringBuilder> results = [new StringBuilder()];
+ FindPrefixesCore(node, results, ignoreCase);
+
+ // If we found too many prefixes or if any found is too short, fail.
+ if (results.Count > MaxPrefixes || !results.TrueForAll(sb => sb.Length >= MinPrefixLength))
+ {
+ return null;
+ }
+
+ // Return the prefixes.
+ string[] resultStrings = new string[results.Count];
+ for (int i = 0; i < results.Count; i++)
+ {
+ resultStrings[i] = results[i].ToString();
+ }
+ return resultStrings;
+
+ //
+ // Updates the results list with found prefixes. All existing strings in the list are treated as existing
+ // discovered prefixes prior to the node being processed. The method returns true if subsequent nodes after
+ // this one should be examined, or returns false if they shouldn't be because the node wasn't guaranteed
+ // to be fully processed.
+ //
+ static bool FindPrefixesCore(RegexNode node, List<StringBuilder> results, bool ignoreCase)
+ {
+ // If we're too deep to analyze further, we can't trust what we've already computed, so stop iterating.
+ // Also bail if any of our results is already hitting the length threshold, or if we've already found too many prefixes (the caller will fail the operation anyway).
+ if (!StackHelper.TryEnsureSufficientExecutionStack() || results.Count > MaxPrefixes ||
+ !results.TrueForAll(sb => sb.Length < MaxPrefixLength))
+ {
+ return false;
+ }
+
+ // These limits are approximations. We'll stop trying to make strings longer once we exceed the max length,
+ // and if we exceed the max number of prefixes by a non-trivial amount, we'll fail the operation.
+ Span<char> setChars = stackalloc char[MaxPrefixes]; // limit how many chars we get from a set based on the max prefixes we care about
+
+ // Loop down the left side of the tree through atomic and capture nodes, looking for a starting node we can handle.
+ while (true)
+ {
+ switch (node.Kind)
+ {
+ // The child of these nodes is guaranteed to execute once, so we can just skip through them to their child.
+ case RegexNodeKind.Atomic:
+ case RegexNodeKind.Capture:
+ node = node.Child(0);
+ continue;
+
+ // A loop with a positive minimum has one guaranteed iteration, so its child contributes to the prefix. But unless it iterates exactly once, whatever follows the loop isn't adjacent to that first iteration, so stop there.
+ case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M > 0:
+ return FindPrefixesCore(node.Child(0), results, ignoreCase) && node.M == 1 && node.N == 1;
+
+ // Zero-width anchors and assertions don't impact a prefix and may be skipped over.
+ case RegexNodeKind.Bol:
+ case RegexNodeKind.Eol:
+ case RegexNodeKind.Boundary:
+ case RegexNodeKind.ECMABoundary:
+ case RegexNodeKind.NonBoundary:
+ case RegexNodeKind.NonECMABoundary:
+ case RegexNodeKind.Beginning:
+ case RegexNodeKind.Start:
+ case RegexNodeKind.EndZ:
+ case RegexNodeKind.End:
+ case RegexNodeKind.Empty:
+ case RegexNodeKind.UpdateBumpalong:
+ case RegexNodeKind.PositiveLookaround:
+ case RegexNodeKind.NegativeLookaround:
+ return true;
+
+ // If we hit a single character, we can just return that character.
+ // This is only relevant for case-sensitive searches, as for case-insensitive we'd have sets for anything
+ // that produces a different result when case-folded, or for strings composed entirely of characters that
+ // don't participate in case conversion. Single character loops are handled the same as single characters
+ // up to the min iteration limit. We can continue processing after them as well if they're repeaters such
+ // that their min and max are the same.
+ case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Onelazy or RegexNodeKind.Oneloopatomic when !ignoreCase || !RegexCharClass.ParticipatesInCaseConversion(node.Ch):
+ {
+ int reps = node.Kind is RegexNodeKind.One ? 1 : Math.Min(node.M, MaxPrefixLength); // cap so a huge minimum can't allocate a huge string
+ foreach (StringBuilder sb in results)
+ {
+ sb.Append(node.Ch, reps);
+ }
+ }
+ return node.Kind is RegexNodeKind.One || (node.M == node.N && node.M <= MaxPrefixLength); // only continue if every iteration was appended
+
+ // If we hit a string, we can just return that string.
+ // As with One above, this is only relevant for case-sensitive searches.
+ case RegexNodeKind.Multi:
+ if (!ignoreCase)
+ {
+ foreach (StringBuilder sb in results)
+ {
+ sb.Append(node.Str);
+ }
+ }
+ else
+ {
+ // If we're ignoring case, then only append up through characters that don't participate in case conversion.
+ // If there are any beyond that, we can't go further and need to stop with what we have.
+ foreach (char c in node.Str!)
+ {
+ if (RegexCharClass.ParticipatesInCaseConversion(c))
+ {
+ return false;
+ }
+
+ foreach (StringBuilder sb in results)
+ {
+ sb.Append(c);
+ }
+ }
+ }
+ return true;
+
+ // For case-sensitive, try to extract the characters that comprise it, and if there are
+ // any and there aren't more than the max number of prefixes, we can return
+ // them each as a prefix. Effectively, this is an alternation of the characters
+ // that comprise the set. For case-insensitive, we need the set to be two ASCII letters that case fold to the same thing.
+ // As with One and loops, set loops are handled the same as sets up to the min iteration limit.
+ case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setlazy or RegexNodeKind.Setloopatomic when !RegexCharClass.IsNegated(node.Str!): // negated sets are too complex to analyze
+ {
+ int charCount = RegexCharClass.GetSetChars(node.Str!, setChars);
+ if (charCount == 0 || (node.Kind is not RegexNodeKind.Set && node.M == 0)) // no chars obtained, or a loop with no guaranteed iteration (which would only duplicate the existing results)
+ {
+ return false;
+ }
+
+ int reps = node.Kind is RegexNodeKind.Set ? 1 : Math.Min(node.M, MaxPrefixLength); // cap so a huge minimum can't allocate huge strings
+ if (!ignoreCase)
+ {
+ int existingCount = results.Count;
+
+ // Duplicate all of the existing strings for all of the new suffixes, other than the first.
+ foreach (char suffix in setChars.Slice(1, charCount - 1))
+ {
+ for (int existing = 0; existing < existingCount; existing++)
+ {
+ StringBuilder newSb = new StringBuilder().Append(results[existing]);
+ newSb.Append(suffix, reps);
+ results.Add(newSb);
+ }
+ }
+
+ // Then append the first suffix to all of the existing strings.
+ for (int existing = 0; existing < existingCount; existing++)
+ {
+ results[existing].Append(setChars[0], reps);
+ }
+ }
+ else
+ {
+ // For ignore-case, we currently only handle the simple (but common) case of a set of exactly two
+ // ASCII letters that are the upper- and lowercase forms of the same letter.
+ if (!RegexCharClass.SetContainsAsciiOrdinalIgnoreCaseCharacter(node.Str!, setChars))
+ {
+ return false;
+ }
+
+ // Append the set's second char to each; the two chars differ only by case, which the ordinal ignore-case search disregards.
+ foreach (StringBuilder sb in results)
+ {
+ sb.Append(setChars[1], reps);
+ }
+ }
+ }
+ return node.Kind is RegexNodeKind.Set || (node.N == node.M && node.M <= MaxPrefixLength); // only continue if every iteration was appended
+
+ case RegexNodeKind.Concatenate:
+ {
+ int childCount = node.ChildCount();
+ for (int i = 0; i < childCount; i++)
+ {
+ // Atomic and Capture nodes don't impact prefixes, so skip through them.
+ // Loops aren't skipped here: a loop with more than one iteration impacts the
+ // matched sequence for the concatenation, so the recursive call handles a loop
+ // by incorporating its guaranteed first iteration and then reporting that nothing
+ // after it may be examined unless the loop has both a min and max of 1 (which in
+ // general is removed as superfluous during tree optimization). We could instead
+ // unroll up to the minimum iteration count, but that complexity isn't
+ // currently worthwhile.
+ if (!FindPrefixesCore(SkipThroughAtomicAndCapture(node.Child(i)), results, ignoreCase))
+ {
+ return false;
+ }
+ }
+ }
+ return true;
+
+ // For alternations, we need to find a prefix for every branch; if we can't compute a
+ // prefix for any one branch, we can't trust the results and need to give up, since we don't
+ // know if our set of prefixes is complete.
+ case RegexNodeKind.Alternate:
+ {
+ // If there are more children than our maximum, just give up immediately, as we
+ // won't be able to get a prefix for every branch and have it be within our max.
+ int childCount = node.ChildCount();
+ Debug.Assert(childCount >= 2); // otherwise it would have been optimized out
+ if (childCount > MaxPrefixes)
+ {
+ return false;
+ }
+
+ // Build up the list of all prefixes across all branches.
+ List<StringBuilder>? allBranchResults = null;
+ List<StringBuilder>? alternateBranchResults = [new StringBuilder()];
+ for (int i = 0; i < childCount; i++)
+ {
+ _ = FindPrefixesCore(node.Child(i), alternateBranchResults, ignoreCase);
+
+ Debug.Assert(alternateBranchResults.Count > 0);
+ foreach (StringBuilder sb in alternateBranchResults)
+ {
+ if (sb.Length == 0)
+ {
+ return false;
+ }
+ }
+
+ if (allBranchResults is null)
+ {
+ allBranchResults = alternateBranchResults;
+ alternateBranchResults = [new StringBuilder()];
+ }
+ else
+ {
+ allBranchResults.AddRange(alternateBranchResults);
+ alternateBranchResults.Clear();
+ alternateBranchResults.Add(new StringBuilder());
+ }
+ }
+
+ // At this point, we know we can successfully incorporate the alternation's results
+ // into the main results.
+
+ // Duplicate all of the existing strings for all of the new suffixes, other than the first.
+ int existingCount = results.Count;
+ for (int i = 1; i < allBranchResults!.Count; i++)
+ {
+ StringBuilder suffix = allBranchResults[i];
+ for (int existing = 0; existing < existingCount; existing++)
+ {
+ StringBuilder newSb = new StringBuilder().Append(results[existing]);
+ newSb.Append(suffix);
+ results.Add(newSb);
+ }
+ }
+
+ // Then append the first suffix to all of the existing strings.
+ for (int existing = 0; existing < existingCount; existing++)
+ {
+ results[existing].Append(allBranchResults[0]);
+ }
+ }
+
+ // We don't know that we fully processed every branch, so we can't iterate through what comes after this node.
+ // The results were successfully updated, but return false to indicate that nothing after this node should be examined.
+ return false;
+
+ // Something else we don't recognize, so stop iterating.
+ default:
+ return false;
+ }
+ }
+ }
+ }
+
/// Computes the leading substring in ; may be empty.
public static string FindPrefix(RegexNode node)
{
@@ -787,10 +1073,7 @@ public static (RegexNode LoopNode, (char Char, string? String, StringComparison
// Find the first concatenation. We traverse through atomic and capture nodes as they don't effect flow control. (We don't
// want to explore loops, even if they have a guaranteed iteration, because we may use information about the node to then
// skip the node's execution in the matching algorithm, and we would need to special-case only skipping the first iteration.)
- while (node.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture)
- {
- node = node.Child(0);
- }
+ node = SkipThroughAtomicAndCapture(node);
if (node.Kind != RegexNodeKind.Concatenate)
{
return null;
@@ -1014,6 +1297,16 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le
}
}
+ /// <summary>Descends from <paramref name="node"/> through consecutive atomic and capture nodes and returns the first node that is neither.</summary>
+ private static RegexNode SkipThroughAtomicAndCapture(RegexNode node)
+ {
+ RegexNode current = node;
+ for (; current.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture; current = current.Child(0))
+ {
+ // Nothing to do per step; the loop header moves to the wrapper's first child.
+ }
+ return current;
+
/// Percent occurrences in source text (100 * char count / total count).
private static ReadOnlySpan Frequency =>
[
diff --git a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexPrefixAnalyzerTests.cs b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexPrefixAnalyzerTests.cs
index 9c592d7c57f60..acc77b5a49c0d 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexPrefixAnalyzerTests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexPrefixAnalyzerTests.cs
@@ -70,6 +70,69 @@ public void FindFirstCharClass_StressDeep()
FindFirstCharClass(string.Concat(Enumerable.Repeat($"(a?", nesting).Concat(Enumerable.Repeat(")*", nesting))), 0, null);
}
+ [Theory]
+ // case-sensitive
+ [InlineData("abc", new[] { "abc" }, false)]
+ [InlineData("(abc+|bcd+)", new[] { "abc", "bcd" }, false)]
+ [InlineData("(ab+c|bcd+)", new[] { "ab", "bcd" }, false)]
+ [InlineData("(ab+c|bcd+)*", null, false)]
+ [InlineData("(ab+c|bcd+)+", new[] { "ab", "bcd" }, false)]
+ [InlineData("(ab+c|bcd+){3,5}", new[] { "ab", "bcd" }, false)]
+ [InlineData("abc|def", new[] { "abc", "def" }, false)]
+ [InlineData("ab{4}c|def{5}|g{2,4}h", new[] { "abbbbc", "defffff", "gg" }, false)]
+ [InlineData("abc|def|(ghi|jklm)", new[] { "abc", "def", "ghi", "jklm" }, false)]
+ [InlineData("abc[def]ghi", new[] { "abcdghi", "abceghi", "abcfghi" }, false)]
+ [InlineData("abc[def]ghi|[jkl]m", new[] { "abcdghi", "abceghi", "abcfghi", "jm", "km", "lm" }, false)]
+ [InlineData("agggtaaa|tttaccct", new[] { "agggtaaa", "tttaccct" }, false)]
+ [InlineData("[cgt]gggtaaa|tttaccc[acg]", new[] { "cgggtaaa", "ggggtaaa", "tgggtaaa", "tttaccca", "tttacccc", "tttacccg" }, false)]
+ [InlineData("a[act]ggtaaa|tttacc[agt]t", new[] { "aaggtaaa", "acggtaaa", "atggtaaa", "tttaccat", "tttaccgt", "tttacctt" }, false)]
+ [InlineData("ag[act]gtaaa|tttac[agt]ct", new[] { "agagtaaa", "agcgtaaa", "agtgtaaa", "tttacact", "tttacgct", "tttactct" }, false)]
+ [InlineData("agg[act]taaa|ttta[agt]cct", new[] { "aggataaa", "aggctaaa", "aggttaaa", "tttaacct", "tttagcct", "tttatcct" }, false)]
+ [InlineData(@"\b(abc|def)\b", new[] { "abc", "def" }, false)]
+ [InlineData("^(abc|def)$", new[] { "abc", "def" }, false)]
+ [InlineData("abcdefg|h", null, false)]
+ [InlineData("abc[def]ghi|[jkl]", null, false)]
+ [InlineData("[12][45][789]", new[] { "147", "148", "149", "157", "158", "159", "247", "248", "249", "257", "258", "259" }, false)]
+ [InlineData("[12]a[45]b[789]c", new[] { "1a4b7c", "1a4b8c", "1a4b9c", "1a5b7c", "1a5b8c", "1a5b9c", "2a4b7c", "2a4b8c", "2a4b9c", "2a5b7c", "2a5b8c", "2a5b9c" }, false)]
+ // case-insensitive
+ [InlineData("[Aa][Bb][Cc]", new[] { "abc" }, true)]
+ [InlineData("[Aa][Bbc][Cc]", null, true)]
+ [InlineData(":[Aa]![Bb]@", new[] { ":a!b@" }, true)]
+ [InlineData("(?i)abc", new[] { "abc" }, true)]
+ [InlineData("(?i)(abc+|bcd+)", new[] { "abc", "bcd" }, true)]
+ [InlineData("(?i)(ab+c|bcd+)", new[] { "ab", "bcd" }, true)]
+ [InlineData("(?i)(ab+c|bcd+)*", null, true)]
+ [InlineData("(?i)(ab+c|bcd+)+", new[] { "ab", "bcd" }, true)]
+ [InlineData("(?i)(ab+c|bcd+){3,5}", new[] { "ab", "bcd" }, true)]
+ [InlineData("(?i)abc|def", new[] { "abc", "def" }, true)]
+ [InlineData("(?i)ab{4}c|def{5}|g{2,4}h", new[] { "abbbbc", "defffff", "gg" }, true)]
+ [InlineData("(?i)(((?>abc)|(?>def)))", new[] { "abc", "def" }, true)]
+ [InlineData("(?i)(abc|def|(ghi|jklm))", null, true)]
+ [InlineData("(?i)(abc|def|(ghi|jlmn))", new[] { "abc", "def", "ghi", "jlmn" }, true)]
+ [InlineData("abc", null, true)]
+ [InlineData("abc|def", null, true)]
+ [InlineData("abc|def|(ghi|jklm)", null, true)]
+ [InlineData("://[Aa][Bb]|[Cc]@!", new[] { "://ab", "c@!" }, true)]
+ public void FindPrefixes(string pattern, string[] expectedSet, bool ignoreCase)
+ {
+ // Parse with no options and the invariant culture, then run the analyzer over the tree's root.
+ RegexTree parsed = RegexParser.Parse(pattern, RegexOptions.None, CultureInfo.InvariantCulture);
+ string[] prefixes = RegexPrefixAnalyzer.FindPrefixes(parsed.Root, ignoreCase);
+
+ // A null expectation means the analyzer is expected to return null for this pattern.
+ if (expectedSet is null)
+ {
+ Assert.Null(prefixes);
+ return;
+ }
+
+ // Order is not compared: both arrays are sorted ordinally before the equality check.
+ Assert.NotNull(prefixes);
+ Array.Sort(prefixes, StringComparer.Ordinal);
+ Array.Sort(expectedSet, StringComparer.Ordinal);
+ Assert.Equal(expectedSet, prefixes);
+ }
+
private static string FormatSet(string set)
{
if (set is null)