From 854795b98619b0bb2d9f4cc26673a918e3cd7b6c Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 18 Nov 2021 11:30:02 -0500 Subject: [PATCH] Expand One/Notone/Setlazy support to Lazy The logic added in a recent PR to support simplified code generation for a Regex with {One/Notone/Set}lazy nodes is almost sufficient to support Lazy as well. This fills the gap. It also deletes an erroneous optimization added in .NET 5 that removed top-level Atomic nodes; the idea behind it was that such nodes are meaningless as, at the top-level, nothing can backtrack in anyway, but it then means that any node which is paying attention to whether its parent is Atomic may no longer find it (and that's needed by Lazy). --- .../gen/RegexGenerator.Emitter.cs | 58 +++++++++-------- .../Text/RegularExpressions/RegexCompiler.cs | 45 +++++++------ .../Text/RegularExpressions/RegexNode.cs | 64 +++++++++---------- 3 files changed, 88 insertions(+), 79 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 87a1e7cf21213..7c588f0c3dbd2 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -750,7 +750,7 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, int labelCounter = 0; string DefineLabel(string prefix = "L") => $"{prefix}{labelCounter++}"; - void MarkLabel(string label) => writer.WriteLine($"{label}:"); + void MarkLabel(string label, bool addEmptyStatement = false) => writer.WriteLine($"{label}:{(addEmptyStatement ? " ;" : "")}"); void Goto(string label) => writer.WriteLine($"goto {label};"); string doneLabel = "NoMatch"; string originalDoneLabel = doneLabel; @@ -772,17 +772,11 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, // Emit failure writer.WriteLine("// No match"); - MarkLabel(originalDoneLabel); + MarkLabel(originalDoneLabel, !expressionHasCaptures); if (expressionHasCaptures) { EmitUncaptureUntil("0"); } - else - { - // We can't have a label at the end of the method, so explicitly - // add a "return;" if the End label would otherwise be an issue. - writer.WriteLine("return;"); - } return; static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0; @@ -1174,15 +1168,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitAtomicNodeLoop(node); break; + case RegexNode.Onelazy: + case RegexNode.Notonelazy: + case RegexNode.Setlazy: case RegexNode.Lazyloop: - // An atomic lazy loop amounts to doing the minimum amount of work possible. - // That means iterating as little as is required, which means a repeater - // for the min, and if min is 0, doing nothing. - Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic)); - if (node.M > 0) - { - EmitNodeRepeater(node); - } + EmitLazy(node, emitLengthChecksIfRequired); break; case RegexNode.Alternate: @@ -1195,12 +1185,6 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); break; - case RegexNode.Onelazy: - case RegexNode.Notonelazy: - case RegexNode.Setlazy: - EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired); - break; - case RegexNode.Concatenate: EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; @@ -1682,17 +1666,27 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL // It's left pointing to the backtracking label for everything subsequent in the expression. } - void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true) { + bool isSingleChar = node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily; + // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking, // as the lazy itself failed to match. if (node.M > 0) { - EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + if (isSingleChar) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + } + else + { + EmitNodeRepeater(node); + } } - // If the whole thing was actually that repeater, we're done. - if (node.M == node.N) + // If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic + // lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum. + if (node.M == node.N || node.Next is { Type: RegexNode.Atomic }) { return; } @@ -1762,7 +1756,15 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL // for the next time we backtrack. writer.WriteLine($"runtextpos = {nextPos};"); LoadTextSpanLocal(writer); - EmitSingleChar(node); + if (isSingleChar) + { + EmitSingleChar(node); + } + else + { + writer.WriteLine(); + EmitNode(node.Child(0)); + } TransferTextSpanPosToRunTextPos(); writer.WriteLine($"{nextPos} = runtextpos;"); @@ -1772,7 +1774,7 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes writer.WriteLine(); - MarkLabel(endLoopLabel); + MarkLabel(endLoopLabel, addEmptyStatement: true); // We explicitly do not reset doneLabel back to originalDoneLabel. // It's left pointing to the backtracking label for everything subsequent in the expression. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 3a57f1840516c..054df1c5c33d9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2094,15 +2094,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitAtomicNodeLoop(node); break; + case RegexNode.Onelazy: + case RegexNode.Notonelazy: + case RegexNode.Setlazy: case RegexNode.Lazyloop: - // An atomic lazy loop amounts to doing the minimum amount of work possible. - // That means iterating as little as is required, which means a repeater - // for the min, and if min is 0, doing nothing. - Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic)); - if (node.M > 0) - { - EmitNodeRepeater(node); - } + EmitLazy(node, emitLengthChecksIfRequired); break; case RegexNode.Atomic: @@ -2119,12 +2115,6 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); break; - case RegexNode.Onelazy: - case RegexNode.Notonelazy: - case RegexNode.Setlazy: - EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired); - break; - case RegexNode.Concatenate: EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; @@ -2558,17 +2548,27 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL MarkLabel(endLoop); } - void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true) { + bool isSingleChar = node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily; + // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking, // as the lazy itself failed to match. if (node.M > 0) { - EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + if (isSingleChar) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + } + else + { + EmitNodeRepeater(node); + } } - // If the whole thing was actually that repeater, we're done. - if (node.M == node.N) + // If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic + // lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum. + if (node.M == node.N || node.Next is { Type: RegexNode.Atomic }) { return; } @@ -2657,7 +2657,14 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL Ldloc(nextPos); Stloc(runtextposLocal); LoadTextSpanLocal(); - EmitSingleChar(node); + if (isSingleChar) + { + EmitSingleChar(node); + } + else + { + EmitNode(node.Child(0)); + } TransferTextSpanPosToRunTextPos(); Ldloc(runtextposLocal); Stloc(nextPos); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 88a2800f8f96b..d53ddaf082c2e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -407,14 +407,6 @@ internal RegexNode FinalOptimize() } } - // Optimization: Unnecessary root atomic. - // If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing - // to backtrack into it, so we can remove it. - while (rootNode.Child(0).Type == Atomic) - { - rootNode.ReplaceChild(0, rootNode.Child(0).Child(0)); - } - // Done optimizing. Return the final tree. #if DEBUG rootNode.ValidateFinalTreeInvariants(); @@ -2250,42 +2242,30 @@ internal bool SupportsSimplifiedCodeGenerationImplementation() case Setlazy: Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type."); supported = M == N || AncestorsAllowBacktracking(Next); - static bool AncestorsAllowBacktracking(RegexNode? node) - { - while (node is not null) - { - switch (node.Type) - { - case Concatenate: - case Capture: - case Atomic: - node = node.Next; - break; - - default: - return false; - } - } - - return true; - } break; - // {Lazy}Loop repeaters are the same, except their child also needs to be supported. + // Loop repeaters are the same, except their child also needs to be supported. // We also support such loops being atomic. case Loop: - case Lazyloop: supported = (M == N || (Next != null && Next.Type == Atomic)) && Child(0).SupportsSimplifiedCodeGenerationImplementation(); break; - // We can handle atomic as long as we can handle making its child atomic, or - // its child doesn't have that concept. - case Atomic: + // Similarly, as long as the wrapped node supports simplified code gen, + // Lazy is supported if it's a repeater or atomic, but also if it's in + // a place where backtracking is allowed (e.g. it's top-level). + case Lazyloop: + supported = + (M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) && + Child(0).SupportsSimplifiedCodeGenerationImplementation(); + break; + + // We can handle atomic as long as its child is supported. // Lookahead assertions also only require that the child node be supported. // The RightToLeft check earlier is important to differentiate lookbehind, // which is not supported. + case Atomic: case Require: case Prevent: supported = Child(0).SupportsSimplifiedCodeGenerationImplementation(); @@ -2370,6 +2350,26 @@ static bool AncestorsAllowBacktracking(RegexNode? node) } #endif return supported; + + static bool AncestorsAllowBacktracking(RegexNode? node) + { + while (node is not null) + { + switch (node.Type) + { + case Concatenate: + case Capture: + case Atomic: + node = node.Next; + break; + + default: + return false; + } + } + + return true; + } } /// Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.