From 61b5c92c1509c9911eb3b3dd3fb684e48e717bc5 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Thu, 9 Dec 2021 21:16:40 -0500 Subject: [PATCH] A little more regex source generator tweaking (#62605) * Remove now unecessary "Variable declared but never used" warning suppression * Avoid declaring inputSpan in FindFirstChar if we don't have to * Add RegexNode.Ref/Bol/Eol to non-backtracking list * Move some additionalDeclarations back to where they're used I was overaggressive in moving these to the beginning. Some are fine where they're needed. * Delete dead code * Emit IsEmpty for more slice.Length checks * Apply suggestions from code review --- .../gen/RegexGenerator.Emitter.cs | 95 ++++++++----------- .../src/System/Threading/StackHelper.cs | 23 ----- 2 files changed, 41 insertions(+), 77 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 97183cfc415d6..2caadf631fd8a 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -36,7 +36,6 @@ public partial class RegexGenerator "#nullable enable", "#pragma warning disable CS0162 // Unreachable code", "#pragma warning disable CS0164 // Unreferenced label", - "#pragma warning disable CS0168 // Variable declared but never used", "#pragma warning disable CS0219 // Variable assigned but never used", "", }; @@ -274,13 +273,11 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, bool hasTextInfo = false; // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later. - // To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either - // delete it if no additional declarations are required, or we replace it with the list of additional declarations - // built up while generating code. + // To handle that, we build up a collection of all the declarations to include, track where they should be inserted, + // and then insert them at that position once everything else has been output. var additionalDeclarations = new HashSet(); // Emit locals initialization - writer.WriteLine("global::System.ReadOnlySpan inputSpan = base.runtext;"); writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;"); writer.Flush(); int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length; @@ -315,7 +312,8 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, { case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive: Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix)); - EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix); + additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); + EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix); break; case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive: @@ -323,7 +321,8 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive: case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive: Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 }); - EmitFixedSet_LeftToRight(); + additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); + EmitFixedSet(); break; default: @@ -338,7 +337,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm, } writer.WriteLine(); - writer.WriteLine("// No match"); + writer.WriteLine("// No starting position found"); writer.WriteLine("ReturnFalse:"); writer.WriteLine("base.runtextpos = end;"); writer.WriteLine("return false;"); @@ -368,8 +367,7 @@ bool EmitAnchors() case RegexPrefixAnalyzer.Start: writer.WriteLine("// Start \\G anchor"); - additionalDeclarations.Add("int start = base.runtextstart;"); - using (EmitBlock(writer, "if (pos > start)")) + using (EmitBlock(writer, "if (pos > base.runtextstart)")) { writer.WriteLine("goto ReturnFalse;"); } @@ -400,6 +398,7 @@ bool EmitAnchors() // the other anchors, which all skip all subsequent processing if found, with BOL we just use it // to boost our position to the next line, and then continue normally with any searches. writer.WriteLine("// Beginning-of-line anchor"); + additionalDeclarations.Add("global::System.ReadOnlySpan inputSpan = base.runtext;"); additionalDeclarations.Add("int beginning = base.runtextbeg;"); using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')")) { @@ -418,8 +417,8 @@ bool EmitAnchors() return false; } - // Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern. - void EmitIndexOf_LeftToRight(string prefix) + // Emits a case-sensitive prefix search for a string at the beginning of the pattern. + void EmitIndexOf(string prefix) { writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(inputSpan.Slice(pos, end - pos), {Literal(prefix)});"); writer.WriteLine("if (i >= 0)"); @@ -429,9 +428,9 @@ void EmitIndexOf_LeftToRight(string prefix) writer.WriteLine("}"); } - // Emits a left-to-right search for a set at a fixed position from the start of the pattern, + // Emits a search for a set at a fixed position from the start of the pattern, // and potentially other sets at other fixed positions in the pattern. - void EmitFixedSet_LeftToRight() + void EmitFixedSet() { List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = code.FindOptimizations.FixedDistanceSets; (char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0]; @@ -600,7 +599,6 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id) RegexOptions options = (RegexOptions)rm.Options; RegexCode code = rm.Code; - bool hasTimeout = false; // Helper to define names. Names start unadorned, but as soon as there's repetition, // they begin to have a numbered suffix. @@ -608,7 +606,7 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id) // Every RegexTree is rooted in the implicit Capture for the whole expression. // Skip the Capture node. We handle the implicit root capture specially. - RegexNode node = rm.Code.Tree.Root; + RegexNode node = code.Tree.Root; Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node"); Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child"); node = node.Child(0); @@ -635,9 +633,8 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id) } // In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later. - // To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either - // delete it if no additional declarations are required, or we replace it with the list of additional declarations - // built up while generating code. + // To handle that, we build up a collection of all the declarations to include, track where they should be inserted, + // and then insert them at that position once everything else has been output. var additionalDeclarations = new HashSet(); var additionalLocalFunctions = new Dictionary(); @@ -646,14 +643,11 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id) writer.WriteLine("global::System.ReadOnlySpan inputSpan = base.runtext;"); writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;"); writer.WriteLine($"int original_pos = pos;"); - hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); + bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); + bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm); writer.Flush(); int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length; int additionalDeclarationsIndent = writer.Indent; - writer.WriteLine(); - - // TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo; // only if the whole expression or any subportion is ignoring case, and we're not using invariant - bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm); // The implementation tries to use const indexes into the span wherever possible, which we can do // for all fixed-length constructs. In such cases (e.g. single chars, repeaters, strings, etc.) @@ -936,8 +930,7 @@ void EmitAllBranches() // Save off pos. We'll need to reset this each time a branch fails. string startingPos = ReserveName("alternation_starting_pos"); - additionalDeclarations.Add($"int {startingPos} = 0;"); - writer.WriteLine($"{startingPos} = pos;"); + writer.WriteLine($"int {startingPos} = pos;"); int startingSliceStaticPos = sliceStaticPos; // We need to be able to undo captures in two situations: @@ -964,8 +957,7 @@ void EmitAllBranches() if (expressionHasCaptures && ((node.Options & RegexNode.HasCapturesFlag) != 0 || !isAtomic)) { startingCapturePos = ReserveName("alternation_starting_capturepos"); - additionalDeclarations.Add($"int {startingCapturePos} = 0;"); - writer.WriteLine($"{startingCapturePos} = base.Crawlpos();"); + writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();"); } writer.WriteLine(); @@ -1211,7 +1203,7 @@ void EmitBackreferenceConditional(RegexNode node) // to backtrack to. So, we expose a single Backtrack label and track which branch was // followed in this resumeAt local. string resumeAt = ReserveName("conditionalbackreference_branch"); - additionalDeclarations.Add($"int {resumeAt} = 0;"); + writer.WriteLine($"int {resumeAt} = 0;"); // While it would be nicely readable to use an if/else block, if the branches contain // anything that triggers backtracking, labels will end up being defined, and if they're @@ -1340,7 +1332,12 @@ void EmitExpressionConditional(RegexNode node) { startingCapturePos = ReserveName("conditionalexpression_starting_capturepos"); writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();"); - writer.WriteLine(); + } + + string resumeAt = ReserveName("conditionalexpression_resumeAt"); + if (!isAtomic) + { + writer.WriteLine($"int {resumeAt} = 0;"); } // Emit the conditional expression. We need to reroute any match failures to either the "no" branch @@ -1353,13 +1350,7 @@ void EmitExpressionConditional(RegexNode node) { doneLabel = originalDoneLabel; } - string postConditionalDoneLabel = doneLabel; - string resumeAt = ReserveName("conditionalexpression_resumeAt"); - if (!isAtomic) - { - additionalDeclarations.Add($"int {resumeAt} = 0;"); - } // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch. // Since the "yes" branch may have a different execution path than the "no" branch or the lack of @@ -1370,10 +1361,6 @@ void EmitExpressionConditional(RegexNode node) writer.WriteLine(); TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0 string postYesDoneLabel = doneLabel; - if (!isAtomic && postYesDoneLabel != originalDoneLabel) - { - writer.WriteLine($"{resumeAt} = 0;"); - } if (postYesDoneLabel != originalDoneLabel || noBranch is not null) { writer.WriteLine($"goto {end};"); @@ -1467,8 +1454,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) TransferSliceStaticPosToPos(); string startingPos = ReserveName("capture_starting_pos"); - additionalDeclarations.Add($"int {startingPos} = 0;"); - writer.WriteLine($"{startingPos} = pos;"); + writer.WriteLine($"int {startingPos} = pos;"); writer.WriteLine(); RegexNode child = node.Child(0); @@ -1604,7 +1590,8 @@ node.Type is RegexNode.Atomic or // atomic nodes by definition don't give up any RegexNode.Oneloopatomic or RegexNode.Notoneloopatomic or RegexNode.Setloopatomic or // same for atomic loops RegexNode.One or RegexNode.Notone or RegexNode.Set or // individual characters don't backtrack RegexNode.Multi or // multiple characters don't backtrack - RegexNode.Beginning or RegexNode.Start or RegexNode.End or RegexNode.EndZ or RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary or // anchors don't backtrack + RegexNode.Ref or // backreferences don't backtrack + RegexNode.Beginning or RegexNode.Bol or RegexNode.Start or RegexNode.End or RegexNode.EndZ or RegexNode.Eol or RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary or // anchors don't backtrack RegexNode.Nothing or RegexNode.Empty or RegexNode.UpdateBumpalong // empty/nothing don't do anything // Fixed-size repeaters of single characters or atomic don't backtrack || node.Type is RegexNode.Oneloop or RegexNode.Notoneloop or RegexNode.Setloop or RegexNode.Onelazy or RegexNode.Notonelazy or RegexNode.Setlazy && node.M == node.N @@ -1965,14 +1952,14 @@ void EmitAnchors(RegexNode node) break; case RegexNode.End: - using (EmitBlock(writer, $"if ({sliceSpan}.Length > {sliceStaticPos})")) + using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()})")) { writer.WriteLine($"goto {doneLabel};"); } break; case RegexNode.EndZ: - writer.WriteLine($"if ({sliceSpan}.Length - 1 > {sliceStaticPos} || ({sliceSpan}.Length > {sliceStaticPos} && {sliceSpan}[{sliceStaticPos}] != '\\n'))"); + writer.WriteLine($"if ({sliceSpan}.Length - 1 > {sliceStaticPos} || ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n'))"); using (EmitBlock(writer, null)) { writer.WriteLine($"goto {doneLabel};"); @@ -1980,11 +1967,15 @@ void EmitAnchors(RegexNode node) break; case RegexNode.Eol: - using (EmitBlock(writer, $"if ({sliceSpan}.Length > {sliceStaticPos} && {sliceSpan}[{sliceStaticPos}] != '\\n')")) + using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n')")) { writer.WriteLine($"goto {doneLabel};"); } break; + + string IsSliceLengthGreaterThanSliceStaticPos() => + sliceStaticPos == 0 ? $"!{sliceSpan}.IsEmpty" : + $"{sliceSpan}.Length > {sliceStaticPos}"; } } @@ -2222,8 +2213,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true) maxIterations = $"{node.N - node.M}"; iterationCount = ReserveName("lazyloop_iteration"); - additionalDeclarations.Add($"int {iterationCount} = 0;"); - writer.WriteLine($"{iterationCount} = 0;"); + writer.WriteLine($"int {iterationCount} = 0;"); } // Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point. @@ -2366,11 +2356,7 @@ void EmitLazy(RegexNode node) string body = ReserveName("LazyLoopBody"); string endLoop = ReserveName("LazyLoopEnd"); - additionalDeclarations.Add($"int {iterationCount} = 0, {startingPos} = 0, {sawEmpty} = 0;"); - writer.WriteLine($"{iterationCount} = 0;"); - writer.WriteLine($"{startingPos} = pos;"); - writer.WriteLine($"{sawEmpty} = 0;"); - writer.WriteLine(); + writer.WriteLine($"int {iterationCount} = 0, {startingPos} = pos, {sawEmpty} = 0;"); // If the min count is 0, start out by jumping right to what's after the loop. Backtracking // will then bring us back in to do further iterations. @@ -2378,6 +2364,7 @@ void EmitLazy(RegexNode node) { writer.WriteLine($"goto {endLoop};"); } + writer.WriteLine(); // Iteration body MarkLabel(body, emitSemicolon: false); @@ -3279,7 +3266,7 @@ private static void ReplaceAdditionalDeclarations(IndentedTextWriter writer, Has { if (declarations.Count != 0) { - StringBuilder tmp = new StringBuilder().AppendLine(); + var tmp = new StringBuilder(); foreach (string decl in declarations.OrderBy(s => s)) { for (int i = 0; i < indent; i++) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs index e15d49c78ff4c..18265762dda5d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Threading/StackHelper.cs @@ -31,13 +31,6 @@ public static bool TryEnsureSufficientExecutionStack() // It does so in a way that prevents task inlining (which would defeat the purpose) but that // also plays nicely with the thread pool's sync-over-async aggressive thread injection policies. - /// Calls the provided action on the stack of a different thread pool thread. - /// The action to invoke. - public static void CallOnEmptyStack(Action action) => - Task.Run(() => action()) - .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) - .GetAwaiter().GetResult(); - /// Calls the provided action on the stack of a different thread pool thread. /// The type of the first argument to pass to the function. /// The action to invoke. @@ -114,21 +107,5 @@ public static TResult CallOnEmptyStack(Func func(arg1, arg2, arg3)) .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) .GetAwaiter().GetResult(); - - /// Calls the provided function on the stack of a different thread pool thread. - /// The type of the first argument to pass to the function. - /// The type of the second argument to pass to the function. - /// The type of the third argument to pass to the function. - /// The type of the fourth argument to pass to the function. - /// The return type of the function. - /// The function to invoke. - /// The first argument to pass to the function. - /// The second argument to pass to the function. - /// The third argument to pass to the function. - /// The fourth argument to pass to the function. - public static TResult CallOnEmptyStack(Func func, TArg1 arg1, TArg2 arg2, TArg3 arg3, TArg4 arg4) => - Task.Run(() => func(arg1, arg2, arg3, arg4)) - .ContinueWith(t => t.GetAwaiter().GetResult(), CancellationToken.None, TaskContinuationOptions.ExecuteSynchronously, TaskScheduler.Default) - .GetAwaiter().GetResult(); } }