Improve alternation switch optimization in regex source generator (dotnet#98723)

The regex source generator has an optimization that tries to emit a switch statement to handle an alternation. If it can prove that an alternation is atomic, either because of a surrounding construct (like an atomic group) or because nothing in the alternation itself might backtrack (as, for example, a loop in one of the branches could), and if it can prove that none of the branches overlap on the first character they must match (because every branch always begins with a different character from the others), then it can emit a switch over the first character required by each branch. Today, the analysis that leads to this optimization being used only considers branches that start with a specific character (RegexNodeKind.One), a set (RegexNodeKind.Set), a string (RegexNodeKind.Multi), or a concatenation that begins with one of those. Anything else, and it gets knocked off the optimized switch path. With this PR, this is evolved to instead allow those One/Set/Multi constructs to be the first non-zero-width construct matched in the branch, but not necessarily the first node, e.g. the branch could be a capture around one of these nodes, or a loop of one of these with a minimum iteration count of at least 1. This PR also adds support for not just individual chars or sets, but loops of them (normal, lazy, or atomic), again as long as they have a minimum iteration count of at least 1... this in particular helps with duplicate characters in a row, as earlier optimizations will have likely condensed them into repeaters represented as loops with equal min and max counts. This PR also makes one more tweak, which is that the sets supported may now be larger. Previously the code was allowing for a set to expand to at most 5 characters, an arbitrary limit set primarily to support ignore-case (which would typically result in sets of 2 or 3 characters). But this ignores the fact that previous optimizations may combine sets for a variety of reasons, e.g. an alternation where one branch contains 's' and the next contains 't' would be combined into a single branch for [st]. This limit has now been increased significantly, with little downside; the main limitation is stack consumption, and the new limit is well within typical stackallocs we use ourselves.
jkotas · Mar 4, 2024 · 061d4df · 061d4df
1 parent ab88861
commit 061d4df
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 54 deletions.
diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
@@ -1665,7 +1665,7 @@ void EmitAlternation(RegexNode node)
                 }
 
                 // Detect whether every branch begins with one or more unique characters.
-                const int SetCharsSize = 5; // arbitrary limit (for IgnoreCase, we want this to be at least 3 to handle the vast majority of values)
+                const int SetCharsSize = 64; // arbitrary limit; we want it to be large enough to handle ignore-case of common sets, like hex, the latin alphabet, etc.
                 Span<char> setChars = stackalloc char[SetCharsSize];
                 if (useSwitchedBranches)
                 {
@@ -1675,18 +1675,20 @@ void EmitAlternation(RegexNode node)
                     var seenChars = new HashSet<char>();
                     for (int i = 0; i < childCount && useSwitchedBranches; i++)
                     {
-                        // If it's not a One, Multi, or Set, we can't apply this optimization.
-                        if (node.Child(i).FindBranchOneMultiOrSetStart() is not RegexNode oneMultiOrSet)
+                        // Look for the guaranteed starting node that's a one, multi, set,
+                        // or loop of one of those with at least one minimum iteration. We need to exclude notones.
+                        if (node.Child(i).FindStartingLiteralNode(allowZeroWidth: false) is not RegexNode startingLiteralNode ||
+                            startingLiteralNode.IsNotoneFamily)
                         {
                             useSwitchedBranches = false;
                             break;
                         }
 
                         // If it's a One or a Multi, get the first character and add it to the set.
                         // If it was already in the set, we can't apply this optimization.
-                        if (oneMultiOrSet.Kind is RegexNodeKind.One or RegexNodeKind.Multi)
+                        if (startingLiteralNode.IsOneFamily || startingLiteralNode.Kind is RegexNodeKind.Multi)
                         {
-                            if (!seenChars.Add(oneMultiOrSet.FirstCharOfOneOrMulti()))
+                            if (!seenChars.Add(startingLiteralNode.FirstCharOfOneOrMulti()))
                             {
                                 useSwitchedBranches = false;
                                 break;
@@ -1696,10 +1698,10 @@ void EmitAlternation(RegexNode node)
                         {
                             // The branch begins with a set.  Make sure it's a set of only a few characters
                             // and get them.  If we can't, we can't apply this optimization.
-                            Debug.Assert(oneMultiOrSet.Kind is RegexNodeKind.Set);
+                            Debug.Assert(startingLiteralNode.IsSetFamily);
                             int numChars;
-                            if (RegexCharClass.IsNegated(oneMultiOrSet.Str!) ||
-                                (numChars = RegexCharClass.GetSetChars(oneMultiOrSet.Str!, setChars)) == 0)
+                            if (RegexCharClass.IsNegated(startingLiteralNode.Str!) ||
+                                (numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars)) == 0)
                             {
                                 useSwitchedBranches = false;
                                 break;
@@ -1741,7 +1743,7 @@ void EmitSwitchedBranches()
                     writer.WriteLine();
 
                     // Emit a switch statement on the first char of each branch.
-                    using (EmitBlock(writer, $"switch ({sliceSpan}[{sliceStaticPos++}])"))
+                    using (EmitBlock(writer, $"switch ({sliceSpan}[{sliceStaticPos}])"))
                     {
                         Span<char> setChars = stackalloc char[SetCharsSize]; // needs to be same size as detection check in caller
                         int startingSliceStaticPos = sliceStaticPos;
@@ -1751,56 +1753,80 @@ void EmitSwitchedBranches()
                         {
                             sliceStaticPos = startingSliceStaticPos;
 
-                            RegexNode child = node.Child(i);
-                            Debug.Assert(child.Kind is RegexNodeKind.One or RegexNodeKind.Multi or RegexNodeKind.Set or RegexNodeKind.Concatenate, DescribeNode(child, rm));
-                            Debug.Assert(child.Kind is not RegexNodeKind.Concatenate || (child.ChildCount() >= 2 && child.Child(0).Kind is RegexNodeKind.One or RegexNodeKind.Multi or RegexNodeKind.Set));
+                            // We know we're only in this code if every branch has a valid starting literal node. Get it.
+                            // We also get the immediate child. Ideally they're the same, in which case we might be able to
+                            // use the switch as the processing of that node, e.g. if the node is a One, then by matching the
+                            // literal via the switch, we've fully processed it. But there may be other cases in which it's not
+                            // sufficient, e.g. if that one was wrapped in a Capture, we still want to emit the capture code,
+                            // and for simplicity, we still end up emitting the re-evaluation of that character. It's still much
+                            // cheaper to do this than to emit the full alternation code.
 
-                            RegexNode? childStart = child.FindBranchOneMultiOrSetStart();
-                            Debug.Assert(childStart is not null, "Unexpectedly couldn't find the branch starting node.");
+                            RegexNode child = node.Child(i);
+                            RegexNode? startingLiteralNode = child.FindStartingLiteralNode(allowZeroWidth: false);
+                            Debug.Assert(startingLiteralNode is not null, "Unexpectedly couldn't find the branch starting node.");
 
-                            if (childStart.Kind is RegexNodeKind.Set)
+                            // Emit the case for this branch to match on the first character.
+                            if (startingLiteralNode.IsSetFamily)
                             {
-                                int numChars = RegexCharClass.GetSetChars(childStart.Str!, setChars);
+                                int numChars = RegexCharClass.GetSetChars(startingLiteralNode.Str!, setChars);
                                 Debug.Assert(numChars != 0);
                                 writer.WriteLine($"case {string.Join(" or ", setChars.Slice(0, numChars).ToArray().Select(Literal))}:");
                             }
                             else
                             {
-                                writer.WriteLine($"case {Literal(childStart.FirstCharOfOneOrMulti())}:");
+                                writer.WriteLine($"case {Literal(startingLiteralNode.FirstCharOfOneOrMulti())}:");
                             }
                             writer.Indent++;
 
                             // Emit the code for the branch, without the first character that was already matched in the switch.
+                            RegexNode? remainder = null;
+                            HandleChild:
                             switch (child.Kind)
                             {
-                                case RegexNodeKind.Multi:
-                                    EmitNode(CloneMultiWithoutFirstChar(child));
+                                case RegexNodeKind.One:
+                                case RegexNodeKind.Set:
+                                    // The character was handled entirely by the switch. No additional matching is needed.
+                                    sliceStaticPos++;
+                                    break;
+
+                                case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic:
+                                case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic:
+                                    // First character of the loop was handled by the switch. Emit matching code for the remainder of the loop.
+                                    Debug.Assert(child == startingLiteralNode);
+                                    Debug.Assert(child.M > 0);
+                                    sliceStaticPos++;
+                                    EmitNode(child.CloneCharLoopWithOneLessIteration());
                                     writer.WriteLine();
                                     break;
 
-                                case RegexNodeKind.Concatenate:
-                                    var newConcat = new RegexNode(RegexNodeKind.Concatenate, child.Options);
-                                    if (childStart.Kind == RegexNodeKind.Multi)
-                                    {
-                                        newConcat.AddChild(CloneMultiWithoutFirstChar(childStart));
-                                    }
-                                    int concatChildCount = child.ChildCount();
-                                    for (int j = 1; j < concatChildCount; j++)
-                                    {
-                                        newConcat.AddChild(child.Child(j));
-                                    }
-                                    EmitNode(newConcat.Reduce());
+                                case RegexNodeKind.Multi:
+                                    // First character was handled by the switch. Emit matching code for the remainder of the multi string.
+                                    sliceStaticPos++;
+                                    EmitNode(child.Str!.Length == 2 ?
+                                        new RegexNode(RegexNodeKind.One, child.Options, child.Str![1]) :
+                                        new RegexNode(RegexNodeKind.Multi, child.Options, child.Str!.Substring(1)));
                                     writer.WriteLine();
                                     break;
 
-                                    static RegexNode CloneMultiWithoutFirstChar(RegexNode node)
-                                    {
-                                        Debug.Assert(node.Kind is RegexNodeKind.Multi);
-                                        Debug.Assert(node.Str!.Length >= 2);
-                                        return node.Str!.Length == 2 ?
-                                            new RegexNode(RegexNodeKind.One, node.Options, node.Str![1]) :
-                                            new RegexNode(RegexNodeKind.Multi, node.Options, node.Str!.Substring(1));
-                                    }
+                                case RegexNodeKind.Concatenate when child.Child(0) == startingLiteralNode && (startingLiteralNode.IsOneFamily || startingLiteralNode.IsSetFamily || startingLiteralNode.Kind is RegexNodeKind.Multi):
+                                    // This is a concatenation where its first node is the starting literal we found. This is a common
+                                    // enough case that we want to special-case it to avoid duplicating the processing for that character
+                                    // unnecessarily. So, we'll shave off that first node from the concatenation and then handle the remainder.
+                                    remainder = child;
+                                    child = child.Child(0);
+                                    remainder.ReplaceChild(0, new RegexNode(RegexNodeKind.Empty, remainder.Options));
+                                    goto HandleChild; // reprocess just the first node that was saved; the remainder will then be processed below
+
+                                default:
+                                    remainder = child;
+                                    break;
+                            }
+
+                            if (remainder is not null)
+                            {
+                                // Emit a full match for whatever part of the child we haven't yet handled.
+                                EmitNode(remainder);
+                                writer.WriteLine();
                             }
 
                             // This is only ever used for atomic alternations, so we can simply reset the doneLabel

diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -83,6 +83,24 @@ public RegexNode(RegexNodeKind kind, RegexOptions options, int m, int n)
             N = n;
         }
 
+        /// <summary>Creates a new node from an existing one/notone/set loop (normal, lazy, or atomic) with one less iteration.</summary>
+        public RegexNode CloneCharLoopWithOneLessIteration()
+        {
+            Debug.Assert(Kind is RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or
+                                 RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or
+                                 RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic);
+            Debug.Assert(M > 0); // there must be at least one required iteration to remove
+
+            RegexNode newNode = IsSetFamily ?
+                new RegexNode(Kind, Options, Str!) : // set-family nodes are rebuilt from Str
+                new RegexNode(Kind, Options, Ch); // every other accepted kind is rebuilt from Ch
+
+            newNode.M = M - 1; // the minimum iteration count drops by one
+            newNode.N = N == int.MaxValue ? int.MaxValue : N - 1; // an int.MaxValue maximum is kept as-is (presumably the "unbounded" sentinel; confirm)
+
+            return newNode;
+        }
+
         /// <summary>Creates a RegexNode representing a single character.</summary>
         /// <param name="ch">The character.</param>
         /// <param name="options">The node's options.</param>
@@ -1361,27 +1379,16 @@ static void ProcessOneOrMulti(RegexNode node, ReadOnlySpan<char> startingSpan)
             return branch.Kind is RegexNodeKind.One or RegexNodeKind.Multi ? branch : null;
         }
 
-        /// <summary>Same as <see cref="FindBranchOneOrMultiStart"/> but also for Sets.</summary>
-        public RegexNode? FindBranchOneMultiOrSetStart()
-        {
-            RegexNode branch = Kind == RegexNodeKind.Concatenate ? Child(0) : this;
-            return branch.Kind is RegexNodeKind.One or RegexNodeKind.Multi or RegexNodeKind.Set ? branch : null;
-        }
-
         /// <summary>Gets the character that begins a One or Multi.</summary>
         public char FirstCharOfOneOrMulti()
         {
-            Debug.Assert(Kind is RegexNodeKind.One or RegexNodeKind.Multi);
+            Debug.Assert(Kind is RegexNodeKind.One or RegexNodeKind.Multi || (IsOneFamily && M > 0));
             Debug.Assert((Options & RegexOptions.RightToLeft) == 0);
-            return Kind == RegexNodeKind.One ? Ch : Str![0];
+            return IsOneFamily ? Ch : Str![0];
         }
 
         /// <summary>Finds the guaranteed beginning literal(s) of the node, or null if none exists.</summary>
-        /// <returns>
-        /// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
-        /// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
-        /// </returns>
-        public RegexNode? FindStartingLiteralNode()
+        public RegexNode? FindStartingLiteralNode(bool allowZeroWidth = true)
         {
             RegexNode? node = this;
             while (true)
@@ -1404,7 +1411,7 @@ public char FirstCharOfOneOrMulti()
                         case RegexNodeKind.Capture:
                         case RegexNodeKind.Group:
                         case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M > 0:
-                        case RegexNodeKind.PositiveLookaround:
+                        case RegexNodeKind.PositiveLookaround when allowZeroWidth:
                             node = node.Child(0);
                             continue;
                     }