diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index e758cac88d68e..16bf09065ae84 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -1,19 +1,16 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Buffers.Binary; using System.CodeDom.Compiler; using System.Collections; using System.Collections.Generic; using System.Collections.Immutable; using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; using System.Globalization; using System.IO; using System.Linq; -using System.Net.Cache; -using System.Runtime.InteropServices; using System.Threading; -using System.Web; using Microsoft.CodeAnalysis; using Microsoft.CodeAnalysis.CSharp; @@ -2891,33 +2888,19 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL // We're backtracking. Check the timeout. EmitTimeoutCheckIfNeeded(writer, rm); - if (!rtl && subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal) + if (!rtl && + node.N > 1 && // no point in using IndexOf for small loops, in particular optionals + subsequent?.FindStartingLiteralNode() is RegexNode literalNode && + TryEmitIndexOf(literalNode, useLast: true, negate: false, out int literalLength, out string indexOfExpr)) { writer.WriteLine($"if ({startingPos} >= {endingPos} ||"); - (string lastIndexOfName, string lastIndexOfAnyName) = !literal.Negated ? 
- ("LastIndexOf", "LastIndexOfAny") : - ("LastIndexOfAnyExcept", "LastIndexOfAnyExcept"); string setEndingPosCondition = $" ({endingPos} = inputSpan.Slice({startingPos}, "; - if (literal.String is not null) - { - setEndingPosCondition += $"Math.Min(inputSpan.Length, {endingPos} + {literal.String.Length - 1}) - {startingPos}).{lastIndexOfName}({Literal(literal.String)}"; - } - else - { - setEndingPosCondition += $"{endingPos} - {startingPos})."; - setEndingPosCondition += literal.SetChars is not null ? literal.SetChars.Length switch - { - 2 => $"{lastIndexOfAnyName}({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}", - 3 => $"{lastIndexOfAnyName}({Literal(literal.SetChars[0])}, {Literal(literal.SetChars[1])}, {Literal(literal.SetChars[2])}", - _ => $"{lastIndexOfAnyName}({Literal(literal.SetChars)}", - } : - literal.Range.LowInclusive == literal.Range.HighInclusive ? $"{lastIndexOfName}({Literal(literal.Range.LowInclusive)}" : - $"{lastIndexOfAnyName}InRange({Literal(literal.Range.LowInclusive)}, {Literal(literal.Range.HighInclusive)}"; - } - setEndingPosCondition += ")) < 0)"; + setEndingPosCondition = literalLength > 1 ? + $"{setEndingPosCondition}Math.Min(inputSpan.Length, {endingPos} + {literalLength - 1}) - {startingPos})" : + $"{setEndingPosCondition}{endingPos} - {startingPos})"; - using (EmitBlock(writer, setEndingPosCondition)) + using (EmitBlock(writer, $"{setEndingPosCondition}.{indexOfExpr}) < 0)")) { Goto(doneLabel); } @@ -3098,7 +3081,7 @@ literal.SetChars is not null || (false, _) => $"{startingPos} = {sliceSpan}.IndexOfAny({Literal($"{node.Ch}{literal.SetChars}")});", }); } - else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char + else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One { overlap = literal.Range.LowInclusive == node.Ch; writer.WriteLine(overlap ? 
@@ -3131,26 +3114,13 @@ literal.SetChars is not null || else if (iterationCount is null && node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && - subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal2) + subsequent?.FindStartingLiteralNode() is RegexNode literal2 && + TryEmitIndexOf(literal2, useLast: false, negate: false, out _, out string? indexOfExpr)) { // e.g. ".*?string" with RegexOptions.Singleline // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal // isn't found, the loop fails. We can implement it to just search for that literal. - (string indexOfName, string indexOfAnyName) = !literal2.Negated ? - ("IndexOf", "IndexOfAny") : - ("IndexOfAnyExcept", "IndexOfAnyExcept"); - writer.WriteLine($"{startingPos} = {sliceSpan}."); - writer.WriteLine( - literal2.String is not null ? $"{indexOfName}({Literal(literal2.String)});" : - literal2.SetChars is not null ? literal2.SetChars.Length switch - { - 2 => $"{indexOfAnyName}({Literal(literal2.SetChars[0])}, {Literal(literal2.SetChars[1])});", - 3 => $"{indexOfAnyName}({Literal(literal2.SetChars[0])}, {Literal(literal2.SetChars[1])}, {Literal(literal2.SetChars[2])});", - _ => $"{indexOfAnyName}({Literal(literal2.SetChars)});", - } : - literal2.Range.LowInclusive == literal2.Range.HighInclusive ? $"{indexOfName}({Literal(literal2.Range.LowInclusive)});" : - $"{indexOfAnyName}InRange({Literal(literal2.Range.LowInclusive)}, {Literal(literal2.Range.HighInclusive)});"); - + writer.WriteLine($"{startingPos} = {sliceSpan}.{indexOfExpr};"); using (EmitBlock(writer, $"if ({startingPos} < 0)")) { Goto(doneLabel); @@ -3543,6 +3513,15 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) EmitSingleChar(node); } } + else if (node.IsSetFamily && node.Str == RegexCharClass.AnyClass) + { + // This is a repeater for anything, which means we only care about length and can jump past that length. 
+ if (emitLengthCheck) + { + EmitSpanLengthCheck(iterations); + } + sliceStaticPos += iterations; + } else if (iterations <= MaxUnrollSize) { // if ((uint)(sliceStaticPos + iterations - 1) >= (uint)slice.Length || @@ -3577,20 +3556,37 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthCheck = true) if (emitLengthCheck) { EmitSpanLengthCheck(iterations); + writer.WriteLine(); } - string repeaterSpan = "repeaterSlice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything - writer.WriteLine($"ReadOnlySpan {repeaterSpan} = {sliceSpan}.Slice({sliceStaticPos}, {iterations});"); - using (EmitBlock(writer, $"for (int i = 0; i < {repeaterSpan}.Length; i++)")) + // If we're able to vectorize the search, do so. Otherwise, fall back to a loop. + // For the loop, we're validating that each char matches the target node. + // For IndexOf, we're looking for the first thing that _doesn't_ match the target node, + // and thus similarly validating that everything does. + if (TryEmitIndexOf(node, useLast: false, negate: true, out _, out string? 
indexOfExpr)) { - string tmpTextSpanLocal = sliceSpan; // we want EmitSingleChar to refer to this temporary - int tmpSliceStaticPos = sliceStaticPos; - sliceSpan = repeaterSpan; - sliceStaticPos = 0; - EmitSingleChar(node, emitLengthCheck: false, offset: "i"); - sliceSpan = tmpTextSpanLocal; - sliceStaticPos = tmpSliceStaticPos; + using (EmitBlock(writer, $"if ({sliceSpan}.Slice({sliceStaticPos}, {iterations}).{indexOfExpr} >= 0)")) + { + Goto(doneLabel); + } } + else + { + string repeaterSpan = "repeaterSlice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything + writer.WriteLine($"ReadOnlySpan {repeaterSpan} = {sliceSpan}.Slice({sliceStaticPos}, {iterations});"); + + using (EmitBlock(writer, $"for (int i = 0; i < {repeaterSpan}.Length; i++)")) + { + string tmpTextSpanLocal = sliceSpan; // we want EmitSingleChar to refer to this temporary + int tmpSliceStaticPos = sliceStaticPos; + sliceSpan = repeaterSpan; + sliceStaticPos = 0; + EmitSingleChar(node, emitLengthCheck: false, offset: "i"); + sliceSpan = tmpTextSpanLocal; + sliceStaticPos = tmpSliceStaticPos; + } + } + sliceStaticPos += iterations; } } @@ -3618,9 +3614,6 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = int minIterations = node.M; int maxIterations = node.N; bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; - - Span setChars = stackalloc char[5]; // 5 is max optimized by IndexOfAny today - int numSetChars = 0; string iterationLocal = ReserveName("iteration"); if (rtl) @@ -3655,61 +3648,6 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = writer.WriteLine(); } } - else if ((node.IsOneFamily || node.IsNotoneFamily) && maxIterations == int.MaxValue) - { - // For One or Notone, we're looking for a specific character, as everything until we find - // it (or its negation in the case of One) is consumed by the loop. 
If we're unbounded, such as with ".*" and if we're case-sensitive, - // we can use the vectorized IndexOf{AnyExcept} to do the search, rather than open-coding it. The unbounded - // restriction is purely for simplicity; it could be removed in the future with additional code to - // handle the unbounded case. - - writer.Write($"int {iterationLocal} = {sliceSpan}"); - if (sliceStaticPos > 0) - { - writer.Write($".Slice({sliceStaticPos})"); - } - string op = node.IsNotoneFamily ? "IndexOf" : "IndexOfAnyExcept"; - writer.WriteLine($".{op}({Literal(node.Ch)});"); - - using (EmitBlock(writer, $"if ({iterationLocal} < 0)")) - { - writer.WriteLine(sliceStaticPos > 0 ? - $"{iterationLocal} = {sliceSpan}.Length - {sliceStaticPos};" : - $"{iterationLocal} = {sliceSpan}.Length;"); - } - writer.WriteLine(); - } - else if (node.IsSetFamily && - maxIterations == int.MaxValue && - (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) - { - // If the set contains only a few characters (if it contained 1 and was negated, it should - // have been reduced to a Notone), we can use an IndexOfAny{Except} to find any of the target characters. - // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. 
- Debug.Assert(numSetChars > 1); - - writer.Write($"int {iterationLocal} = {sliceSpan}"); - if (sliceStaticPos != 0) - { - writer.Write($".Slice({sliceStaticPos})"); - } - writer.WriteLine((numSetChars, RegexCharClass.IsNegated(node.Str!)) switch - { - (2, true) => $".IndexOfAny({Literal(setChars[0])}, {Literal(setChars[1])});", - (3, true) => $".IndexOfAny({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});", - (_, true) => $".IndexOfAny({Literal(setChars.Slice(0, numSetChars).ToString())});", - (2, false) => $".IndexOfAnyExcept({Literal(setChars[0])}, {Literal(setChars[1])});", - (3, false) => $".IndexOfAnyExcept({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])});", - (_, false) => $".IndexOfAnyExcept({Literal(setChars.Slice(0, numSetChars).ToString())});", - }); - using (EmitBlock(writer, $"if ({iterationLocal} < 0)")) - { - writer.WriteLine(sliceStaticPos > 0 ? - $"{iterationLocal} = {sliceSpan}.Length - {sliceStaticPos};" : - $"{iterationLocal} = {sliceSpan}.Length;"); - } - writer.WriteLine(); - } else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) { // .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end. @@ -3718,20 +3656,18 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = TransferSliceStaticPosToPos(); writer.WriteLine($"int {iterationLocal} = inputSpan.Length - pos;"); } - else if (node.IsSetFamily && - maxIterations == int.MaxValue && - RegexCharClass.TryGetSingleRange(node.Str!, out char rangeLowInclusive, out char rangeHighInclusive)) + else if (maxIterations == int.MaxValue && TryEmitIndexOf(node, useLast: false, negate: true, out _, out string indexOfExpr)) { - // If the set contains a single range, we can use an IndexOfAny{Except}InRange to find any of the target characters. - // As with the cases above, the unbounded constraint is purely for simplicity. 
- string indexOfMethod = RegexCharClass.IsNegated(node.Str!) ? "IndexOfAnyInRange" : "IndexOfAnyExceptInRange"; + // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is + // purely for simplicity; it could be removed in the future with additional code to handle that case. writer.Write($"int {iterationLocal} = {sliceSpan}"); if (sliceStaticPos != 0) { writer.Write($".Slice({sliceStaticPos})"); } - writer.WriteLine($".{indexOfMethod}({Literal(rangeLowInclusive)}, {Literal(rangeHighInclusive)});"); + writer.WriteLine($".{indexOfExpr};"); + using (EmitBlock(writer, $"if ({iterationLocal} < 0)")) { writer.WriteLine(sliceStaticPos > 0 ? @@ -3745,14 +3681,9 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = // For everything else, do a normal loop. string expr = $"{sliceSpan}[{iterationLocal}]"; - if (node.IsSetFamily) - { - expr = MatchCharacterClass(options, expr, node.Str!, negate: false, additionalDeclarations, requiredHelpers); - } - else - { - expr = $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; - } + expr = node.IsSetFamily ? + MatchCharacterClass(options, expr, node.Str!, negate: false, additionalDeclarations, requiredHelpers) : + $"{expr} {(node.IsOneFamily ? "==" : "!=")} {Literal(node.Ch)}"; if (minIterations != 0 || maxIterations != int.MaxValue) { @@ -4348,6 +4279,85 @@ private static void EmitTimeoutCheckIfNeeded(IndentedTextWriter writer, RegexMet } } + /// Tries to create an IndexOf expression for the node. + /// The RegexNode. If it's a loop, only the one/notone/set aspect of the node is factored in. + /// true to use LastIndexOf variants; false to use IndexOf variants. + /// true to search for the opposite of the node. + /// 0 if returns false. If it returns true, string.Length for a multi, otherwise 1. + /// The resulting expression if it returns true; otherwise, null. + /// true if an expression could be produced; otherwise, false. 
+ private static bool TryEmitIndexOf( + RegexNode node, + bool useLast, bool negate, + out int literalLength, [NotNullWhen(true)] out string? indexOfExpr) + { + string last = useLast ? "Last" : ""; + + if (node.Kind == RegexNodeKind.Multi) + { + Debug.Assert(!negate, "Negation isn't appropriate for a multi"); + indexOfExpr = $"{last}IndexOf({Literal(node.Str)})"; + literalLength = node.Str.Length; + return true; + } + + if (node.IsOneFamily) + { + indexOfExpr = negate ? $"{last}IndexOfAnyExcept({Literal(node.Ch)})" : $"{last}IndexOf({Literal(node.Ch)})"; + literalLength = 1; + return true; + } + + if (node.IsNotoneFamily) + { + indexOfExpr = negate ? $"{last}IndexOf({Literal(node.Ch)})" : $"{last}IndexOfAnyExcept({Literal(node.Ch)})"; + literalLength = 1; + return true; + } + + if (node.IsSetFamily) + { + bool negated = RegexCharClass.IsNegated(node.Str) ^ negate; + + Span setChars = stackalloc char[5]; // current max that's vectorized + int setCharsCount; + if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + { + (string indexOfName, string indexOfAnyName) = !negated ? + ("IndexOf", "IndexOfAny") : + ("IndexOfAnyExcept", "IndexOfAnyExcept"); + + setChars = setChars.Slice(0, setCharsCount); + indexOfExpr = setChars.Length switch + { + 1 => $"{last}{indexOfName}({Literal(setChars[0])})", + 2 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])})", + 3 => $"{last}{indexOfAnyName}({Literal(setChars[0])}, {Literal(setChars[1])}, {Literal(setChars[2])})", + _ => $"{last}{indexOfAnyName}({Literal(setChars.ToString())})", + }; + + literalLength = 1; + return true; + } + + if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + string indexOfAnyInRangeName = !negated ? 
+ "IndexOfAnyInRange" : + "IndexOfAnyExceptInRange"; + + indexOfExpr = $"{last}{indexOfAnyInRangeName}({Literal(lowInclusive)}, {Literal(highInclusive)})"; + + literalLength = 1; + return true; + } + } + + indexOfExpr = null; + literalLength = 0; + return false; + } + private static string MatchCharacterClass(RegexOptions options, string chExpr, string charClass, bool negate, HashSet additionalDeclarations, Dictionary requiredHelpers) { // We need to perform the equivalent of calling RegexRunner.CharInClass(ch, charClass), diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 2eee61f47e2ce..0ed046e282fdd 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2792,7 +2792,7 @@ void EmitSingleChar(RegexNode node, bool emitLengthCheck = true, LocalBuilder? o // if (loadedChar != ch) goto doneLabel; if (node.IsSetFamily) { - EmitMatchCharacterClass(node.Str!); + EmitMatchCharacterClass(node.Str); BrfalseFar(doneLabel); } else @@ -3176,7 +3176,10 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL BleFar(doneLabel); } - if (!rtl && subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal) + if (!rtl && + node.N > 1 && + subsequent?.FindStartingLiteralNode() is RegexNode literal && + CanEmitIndexOf(literal, out int literalLength)) { // endingPos = inputSpan.Slice(startingPos, Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos).LastIndexOf(literal); // if (endingPos < 0) @@ -3185,65 +3188,28 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? 
subsequent = null, bool emitL // } Ldloca(inputSpan); Ldloc(startingPos); - if (literal.String is not null) + if (literalLength > 1) { - Debug.Assert(!literal.Negated, "strings should not be negated"); + // Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos Ldloca(inputSpan); Call(s_spanGetLengthMethod); Ldloc(endingPos); - Ldc(literal.String.Length - 1); + Ldc(literalLength - 1); Add(); Call(s_mathMinIntInt); - Ldloc(startingPos); - Sub(); - Call(s_spanSliceIntIntMethod); - Ldstr(literal.String); - Call(s_stringAsSpanMethod); - Call(s_spanLastIndexOfSpan); } else { + // endingPos - startingPos Ldloc(endingPos); - Ldloc(startingPos); - Sub(); - Call(s_spanSliceIntIntMethod); - if (literal.SetChars is not null) - { - switch (literal.SetChars.Length) - { - case 2: - Ldc(literal.SetChars[0]); - Ldc(literal.SetChars[1]); - Call(literal.Negated ? s_spanLastIndexOfAnyExceptCharChar : s_spanLastIndexOfAnyCharChar); - break; - - case 3: - Ldc(literal.SetChars[0]); - Ldc(literal.SetChars[1]); - Ldc(literal.SetChars[2]); - Call(literal.Negated ? s_spanLastIndexOfAnyExceptCharCharChar : s_spanLastIndexOfAnyCharCharChar); - break; - - default: - Ldstr(literal.SetChars); - Call(s_stringAsSpanMethod); - Call(literal.Negated ? s_spanLastIndexOfAnyExceptSpan : s_spanLastIndexOfAnySpan); - break; - } - } - else if (literal.Range.LowInclusive == literal.Range.HighInclusive) - { - Ldc(literal.Range.LowInclusive); - Call(literal.Negated ? s_spanLastIndexOfAnyExceptChar : s_spanLastIndexOfChar); - } - else - { - Ldc(literal.Range.LowInclusive); - Ldc(literal.Range.HighInclusive); - Call(literal.Negated ? 
s_spanLastIndexOfAnyExceptInRange : s_spanLastIndexOfAnyInRange); - } } + Ldloc(startingPos); + Sub(); + Call(s_spanSliceIntIntMethod); + + EmitIndexOf(literal, useLast: true, negate: false); Stloc(endingPos); + Ldloc(endingPos); Ldc(0); BltFar(doneLabel); @@ -3487,7 +3453,7 @@ literal.SetChars is not null || break; } } - else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // char literal + else if (literal.Range.LowInclusive == literal.Range.HighInclusive) // single char from a RegexNode.One { overlap = literal.Range.LowInclusive == node.Ch; if (overlap) @@ -3557,7 +3523,8 @@ literal.SetChars is not null || iterationCount is null && node.Kind is RegexNodeKind.Setlazy && node.Str == RegexCharClass.AnyClass && - subsequent?.FindStartingLiteral() is RegexNode.StartingLiteralData literal2) + subsequent?.FindStartingLiteralNode() is RegexNode literal2 && + CanEmitIndexOf(literal2, out _)) { // e.g. ".*?string" with RegexOptions.Singleline // This lazy loop will consume all characters until the subsequent literal. If the subsequent literal @@ -3565,50 +3532,7 @@ node.Kind is RegexNodeKind.Setlazy && // startingPos = slice.IndexOf(literal); Ldloc(slice); - if (literal2.String is not null) - { - Debug.Assert(!literal2.Negated, "strings should not be negated"); - Ldstr(literal2.String); - Call(s_stringAsSpanMethod); - Call(s_spanIndexOfSpan); - } - else if (literal2.SetChars is not null) - { - switch (literal2.SetChars.Length) - { - case 2: - Ldc(literal2.SetChars[0]); - Ldc(literal2.SetChars[1]); - Call(literal2.Negated ? s_spanIndexOfAnyExceptCharChar : s_spanIndexOfAnyCharChar); - break; - - case 3: - Ldc(literal2.SetChars[0]); - Ldc(literal2.SetChars[1]); - Ldc(literal2.SetChars[2]); - Call(literal2.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar); - break; - - default: - Ldstr(literal2.SetChars); - Call(s_stringAsSpanMethod); - Call(literal2.Negated ? 
s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan); - break; - } - } - else - { - Ldc(literal2.Range.LowInclusive); - if (literal2.Range.LowInclusive == literal2.Range.HighInclusive) - { - Call(literal2.Negated ? s_spanIndexOfAnyExceptChar : s_spanIndexOfChar); - } - else - { - Ldc(literal2.Range.HighInclusive); - Call(literal2.Negated ? s_spanIndexOfAnyExceptInRange : s_spanIndexOfAnyInRange); - } - } + EmitIndexOf(literal2, useLast: false, negate: false); Stloc(startingPos); // if (startingPos < 0) goto doneLabel; @@ -4114,6 +4038,13 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthChecksIfRequired = tr EmitSpanLengthCheck(iterations); } + // If this is a repeater for anything, we only care about length and can jump past that length. + if (node.IsSetFamily && node.Str == RegexCharClass.AnyClass) + { + sliceStaticPos += iterations; + return; + } + // Arbitrary limit for unrolling vs creating a loop. We want to balance size in the generated // code with other costs, like the (small) overhead of slicing to create the temp span to iterate. const int MaxUnrollSize = 16; @@ -4132,48 +4063,61 @@ void EmitSingleCharRepeater(RegexNode node, bool emitLengthChecksIfRequired = tr else { // ReadOnlySpan tmp = slice.Slice(sliceStaticPos, iterations); - // for (int i = 0; i < tmp.Length; i++) - // { - // TimeoutCheck(); - // if (tmp[i] != ch) goto Done; - // } - // sliceStaticPos += iterations; - - Label conditionLabel = DefineLabel(); - Label bodyLabel = DefineLabel(); - - using RentedLocalBuilder spanLocal = RentReadOnlySpanCharLocal(); Ldloca(slice); Ldc(sliceStaticPos); Ldc(iterations); Call(s_spanSliceIntIntMethod); - Stloc(spanLocal); - using RentedLocalBuilder iterationLocal = RentInt32Local(); - Ldc(0); - Stloc(iterationLocal); - BrFar(conditionLabel); + // If we're able to vectorize the search, do so. Otherwise, fall back to a loop. + // For the loop, we're validating that each char matches the target node.
+ // For IndexOf, we're looking for the first thing that _doesn't_ match the target node, + // and thus similarly validating that everything does. + if (CanEmitIndexOf(node, out _)) + { + // if (tmp.IndexOf(...) >= 0) goto doneLabel; + EmitIndexOf(node, useLast: false, negate: true); + Ldc(0); + BgeFar(doneLabel); + } + else + { + using RentedLocalBuilder spanLocal = RentReadOnlySpanCharLocal(); + Stloc(spanLocal); - MarkLabel(bodyLabel); + // for (int i = 0; i < tmp.Length; i++) + // { + // if (tmp[i] != ch) goto Done; + // } - LocalBuilder tmpTextSpanLocal = slice; // we want EmitSingleChar to refer to this temporary - int tmpTextSpanPos = sliceStaticPos; - slice = spanLocal; - sliceStaticPos = 0; - EmitSingleChar(node, emitLengthCheck: false, offset: iterationLocal); - slice = tmpTextSpanLocal; - sliceStaticPos = tmpTextSpanPos; + Label conditionLabel = DefineLabel(); + Label bodyLabel = DefineLabel(); - Ldloc(iterationLocal); - Ldc(1); - Add(); - Stloc(iterationLocal); + using RentedLocalBuilder iterationLocal = RentInt32Local(); + Ldc(0); + Stloc(iterationLocal); + BrFar(conditionLabel); - MarkLabel(conditionLabel); - Ldloc(iterationLocal); - Ldloca(spanLocal); - Call(s_spanGetLengthMethod); - BltFar(bodyLabel); + MarkLabel(bodyLabel); + + LocalBuilder tmpTextSpanLocal = slice; // we want EmitSingleChar to refer to this temporary + int tmpTextSpanPos = sliceStaticPos; + slice = spanLocal; + sliceStaticPos = 0; + EmitSingleChar(node, emitLengthCheck: false, offset: iterationLocal); + slice = tmpTextSpanLocal; + sliceStaticPos = tmpTextSpanPos; + + Ldloc(iterationLocal); + Ldc(1); + Add(); + Stloc(iterationLocal); + + MarkLabel(conditionLabel); + Ldloc(iterationLocal); + Ldloca(spanLocal); + Call(s_spanGetLengthMethod); + BltFar(bodyLabel); + } sliceStaticPos += iterations; } @@ -4202,14 +4146,9 @@ void EmitSingleCharAtomicLoop(RegexNode node) int minIterations = node.M; int maxIterations = node.N; bool rtl = (node.Options & RegexOptions.RightToLeft) != 0; - 
using RentedLocalBuilder iterationLocal = RentInt32Local(); - Label atomicLoopDoneLabel = DefineLabel(); - Span setChars = stackalloc char[5]; // max optimized by IndexOfAny today - int numSetChars = 0; - if (rtl) { TransferSliceStaticPosToPos(); // we don't use static position for rtl @@ -4242,7 +4181,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) LdindU2(); if (node.IsSetFamily) { - EmitMatchCharacterClass(node.Str!); + EmitMatchCharacterClass(node.Str); BrfalseFar(atomicLoopDoneLabel); } else @@ -4277,103 +4216,6 @@ void EmitSingleCharAtomicLoop(RegexNode node) BrFar(bodyLabel); } } - else if ((node.IsOneFamily || node.IsNotoneFamily) && maxIterations == int.MaxValue) - { - // For One or Notone, we're looking for a specific character, as everything until we find - // it (or its negation in the case of One) is consumed by the loop. If we're unbounded, such as with ".*" and if we're case-sensitive, - // we can use the vectorized IndexOf{AnyExcept} to do the search, rather than open-coding it. The unbounded - // restriction is purely for simplicity; it could be removed in the future with additional code to - // handle the unbounded case. - - // int i = slice.Slice(sliceStaticPos).IndexOf(char); - if (sliceStaticPos > 0) - { - Ldloca(slice); - Ldc(sliceStaticPos); - Call(s_spanSliceIntMethod); - } - else - { - Ldloc(slice); - } - Ldc(node.Ch); - Call(node.IsNotoneFamily ? 
s_spanIndexOfChar : s_spanIndexOfAnyExceptChar); - Stloc(iterationLocal); - - // if (i >= 0) goto atomicLoopDoneLabel; - Ldloc(iterationLocal); - Ldc(0); - BgeFar(atomicLoopDoneLabel); - - // i = slice.Length - sliceStaticPos; - Ldloca(slice); - Call(s_spanGetLengthMethod); - if (sliceStaticPos > 0) - { - Ldc(sliceStaticPos); - Sub(); - } - Stloc(iterationLocal); - } - else if (node.IsSetFamily && - maxIterations == int.MaxValue && - (numSetChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0) - { - // If the set contains only a few characters (if it contained 1 and was negated, it should - // have been reduced to a Notone), we can use an IndexOfAny{Except} to find any of the target characters. - // As with the notoneloopatomic above, the unbounded constraint is purely for simplicity. - Debug.Assert(numSetChars > 1); - bool negated = RegexCharClass.IsNegated(node.Str!); - - // int i = slice.Slice(sliceStaticPos).IndexOfAny(ch1, ch2, ...); - if (sliceStaticPos > 0) - { - Ldloca(slice); - Ldc(sliceStaticPos); - Call(s_spanSliceIntMethod); - } - else - { - Ldloc(slice); - } - switch (numSetChars) - { - case 2: - Ldc(setChars[0]); - Ldc(setChars[1]); - Call(negated ? s_spanIndexOfAnyCharChar : s_spanIndexOfAnyExceptCharChar); - break; - - case 3: - Ldc(setChars[0]); - Ldc(setChars[1]); - Ldc(setChars[2]); - Call(negated ? s_spanIndexOfAnyCharCharChar : s_spanIndexOfAnyExceptCharCharChar); - break; - - default: - Ldstr(setChars.Slice(0, numSetChars).ToString()); - Call(s_stringAsSpanMethod); - Call(negated ? 
s_spanIndexOfAnySpan : s_spanIndexOfAnyExceptSpan); - break; - } - Stloc(iterationLocal); - - // if (i >= 0) goto atomicLoopDoneLabel; - Ldloc(iterationLocal); - Ldc(0); - BgeFar(atomicLoopDoneLabel); - - // i = slice.Length - sliceStaticPos; - Ldloca(slice); - Call(s_spanGetLengthMethod); - if (sliceStaticPos > 0) - { - Ldc(sliceStaticPos); - Sub(); - } - Stloc(iterationLocal); - } else if (node.IsSetFamily && maxIterations == int.MaxValue && node.Str == RegexCharClass.AnyClass) { // .* was used with RegexOptions.Singleline, which means it'll consume everything. Just jump to the end. @@ -4387,14 +4229,12 @@ void EmitSingleCharAtomicLoop(RegexNode node) Sub(); Stloc(iterationLocal); } - else if (node.IsSetFamily && - maxIterations == int.MaxValue && - RegexCharClass.TryGetSingleRange(node.Str!, out char rangeLowInclusive, out char rangeHighInclusive)) + else if (maxIterations == int.MaxValue && CanEmitIndexOf(node, out _)) { - // If the set contains a single range, we can use an IndexOfAny{Except}InRange to find any of the target characters. - // As with the cases above, the unbounded constraint is purely for simplicity. + // We're unbounded and we can use an IndexOf method to perform the search. The unbounded restriction is + // purely for simplicity; it could be removed in the future with additional code to handle that case. - // int i = slice.Slice(sliceStaticPos).IndexOfAny{Except}InRange(rangeLowInclusive, rangeHighInclusive); + // int i = slice.Slice(sliceStaticPos).IndexOf(...); if (sliceStaticPos > 0) { Ldloca(slice); @@ -4405,9 +4245,8 @@ void EmitSingleCharAtomicLoop(RegexNode node) { Ldloc(slice); } - Ldc(rangeLowInclusive); - Ldc(rangeHighInclusive); - Call(RegexCharClass.IsNegated(node.Str!) ? 
s_spanIndexOfAnyInRange : s_spanIndexOfAnyExceptInRange); + + EmitIndexOf(node, useLast: false, negate: true); Stloc(iterationLocal); // if (i >= 0) goto atomicLoopDoneLabel; @@ -4457,7 +4296,7 @@ void EmitSingleCharAtomicLoop(RegexNode node) LdindU2(); if (node.IsSetFamily) { - EmitMatchCharacterClass(node.Str!); + EmitMatchCharacterClass(node.Str); BrfalseFar(atomicLoopDoneLabel); } else @@ -4579,7 +4418,7 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) LdindU2(); if (node.IsSetFamily) { - EmitMatchCharacterClass(node.Str!); + EmitMatchCharacterClass(node.Str); BrfalseFar(skipUpdatesLabel); } else @@ -5013,6 +4852,175 @@ void EmitLoop(RegexNode node) } } + // Gets whether an IndexOf expression can be emitted for the node. + // The RegexNode. If it's a loop, only the one/notone/set aspect of the node is factored in. + // 0 if returns false. If it returns true, string.Length for a multi, otherwise 1. + // true if an IndexOf can be emitted; otherwise, false. + bool CanEmitIndexOf(RegexNode node, out int literalLength) + { + if (node.Kind == RegexNodeKind.Multi) + { + literalLength = node.Str!.Length; + return true; + } + + if (node.IsOneFamily || node.IsNotoneFamily) + { + literalLength = 1; + return true; + } + + if (node.IsSetFamily) + { + Span setChars = stackalloc char[5]; // current max that's vectorized + int setCharsCount; + if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0) + { + literalLength = 1; + return true; + } + + if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive)) + { + literalLength = 1; + return true; + } + } + + literalLength = 0; + return false; + } + + // Emits the code for IndexOf call based on the node. + // The RegexNode. If it's a loop, only the one/notone/set aspect of the node is factored in. + // true to use LastIndexOf variants; false to use IndexOf variants. + // true to search for the opposite of the node. 
+            // Issues the Ldstr/Ldc operand load(s) and the Call for the span search method that corresponds to
+            // the supplied node: an IndexOf* method when useLast is false, a LastIndexOf* method when it is true,
+            // and the *AnyExcept* variant when the effective negation (the negate argument combined with the
+            // node's own negation) is true. Per the Debug.Fail at the end, callers are expected to have checked
+            // CanEmitIndexOf first.
+            // NOTE(review): Ldstr, Ldc, and Call are defined outside this chunk; presumably they emit IL - confirm.
+            void EmitIndexOf(RegexNode node, bool useLast, bool negate)
+            {
+                if (node.Kind == RegexNodeKind.Multi)
+                {
+                    // IndexOf(span) / LastIndexOf(span) over the multi's string
+                    Debug.Assert(!negate, "Negation isn't appropriate for a multi");
+                    Ldstr(node.Str!);
+                    Call(s_stringAsSpanMethod);
+                    Call(useLast ? s_spanLastIndexOfSpan : s_spanIndexOfSpan);
+                    return;
+                }
+
+                if (node.IsOneFamily || node.IsNotoneFamily)
+                {
+                    // IndexOf{AnyExcept}(char)
+
+                    // Notone-family nodes are treated as negated (FindStartingLiteral likewise reports them
+                    // with negated: true), so they flip whatever negation the caller requested.
+                    if (node.IsNotoneFamily)
+                    {
+                        negate = !negate;
+                    }
+
+                    Ldc(node.Ch);
+                    Call((useLast, negate) switch
+                    {
+                        (false, false) => s_spanIndexOfChar,
+                        (false, true) => s_spanIndexOfAnyExceptChar,
+                        (true, false) => s_spanLastIndexOfChar,
+                        (true, true) => s_spanLastIndexOfAnyExceptChar,
+                    });
+                    return;
+                }
+
+                if (node.IsSetFamily)
+                {
+                    // Combine the set's own negation with the negation requested by the caller.
+                    bool negated = RegexCharClass.IsNegated(node.Str) ^ negate;
+
+                    // IndexOfAny{Except}(ch1, ...)
+                    Span<char> setChars = stackalloc char[5]; // current max that's vectorized
+                    int setCharsCount;
+                    if ((setCharsCount = RegexCharClass.GetSetChars(node.Str, setChars)) > 0)
+                    {
+                        // Pick the overload by the number of characters GetSetChars reported.
+                        setChars = setChars.Slice(0, setCharsCount);
+                        switch (setChars.Length)
+                        {
+                            case 1:
+                                Ldc(setChars[0]);
+                                Call((useLast, negated) switch
+                                {
+                                    (false, false) => s_spanIndexOfChar,
+                                    (false, true) => s_spanIndexOfAnyExceptChar,
+                                    (true, false) => s_spanLastIndexOfChar,
+                                    (true, true) => s_spanLastIndexOfAnyExceptChar,
+                                });
+                                return;
+
+                            case 2:
+                                Ldc(setChars[0]);
+                                Ldc(setChars[1]);
+                                Call((useLast, negated) switch
+                                {
+                                    (false, false) => s_spanIndexOfAnyCharChar,
+                                    (false, true) => s_spanIndexOfAnyExceptCharChar,
+                                    (true, false) => s_spanLastIndexOfAnyCharChar,
+                                    (true, true) => s_spanLastIndexOfAnyExceptCharChar,
+                                });
+                                return;
+
+                            case 3:
+                                Ldc(setChars[0]);
+                                Ldc(setChars[1]);
+                                Ldc(setChars[2]);
+                                Call((useLast, negated) switch
+                                {
+                                    (false, false) => s_spanIndexOfAnyCharCharChar,
+                                    (false, true) => s_spanIndexOfAnyExceptCharCharChar,
+                                    (true, false) => s_spanLastIndexOfAnyCharCharChar,
+                                    (true, true) => s_spanLastIndexOfAnyExceptCharCharChar,
+                                });
+                                return;
+
+                            default:
+                                // More than three characters: pass them as a span built from a string.
+                                Ldstr(setChars.ToString());
+                                Call(s_stringAsSpanMethod);
+                                Call((useLast, negated) switch
+                                {
+                                    (false, false) => s_spanIndexOfAnySpan,
+                                    (false, true) => s_spanIndexOfAnyExceptSpan,
+                                    (true, false) => s_spanLastIndexOfAnySpan,
+                                    (true, true) => s_spanLastIndexOfAnyExceptSpan,
+                                });
+                                return;
+                        }
+                    }
+
+                    // IndexOfAny{Except}InRange
+                    if (RegexCharClass.TryGetSingleRange(node.Str, out char lowInclusive, out char highInclusive))
+                    {
+                        // A range containing exactly one character uses the single-char overloads.
+                        if (lowInclusive == highInclusive)
+                        {
+                            Ldc(lowInclusive);
+                            Call((useLast, negated) switch
+                            {
+                                (false, false) => s_spanIndexOfChar,
+                                (false, true) => s_spanIndexOfAnyExceptChar,
+                                (true, false) => s_spanLastIndexOfChar,
+                                (true, true) => s_spanLastIndexOfAnyExceptChar,
+                            });
+                            return;
+                        }
+
+                        Ldc(lowInclusive);
+                        Ldc(highInclusive);
+                        Call((useLast, negated) switch
+                        {
+                            (false, false) => s_spanIndexOfAnyInRange,
+                            (false, true) => s_spanIndexOfAnyExceptInRange,
+                            (true, false) => s_spanLastIndexOfAnyInRange,
+                            (true, true) => s_spanLastIndexOfAnyExceptInRange,
+                        });
+                        return;
+                    }
+                }
+
+                Debug.Fail("We should never get here. This method should only be called if CanEmitIndexOf returned true, and all of the same cases should be covered.");
+            }
+
             // <summary>
             // If the expression contains captures, pops a crawl position from the stack and uncaptures
             // until that position is reached.
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
index d2eef1c622f6d..80ec75bda8809 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs
@@ -1381,10 +1381,8 @@ public char FirstCharOfOneOrMulti()
         /// A tuple of data about the literal: only one of the Char/String/SetChars fields is relevant.
         /// The Negated value indicates whether the Char/SetChars should be considered exclusionary.
         /// </returns>
-        public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
+        public RegexNode? FindStartingLiteralNode()
         {
-            Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
-
             RegexNode? node = this;
             while (true)
             {
@@ -1394,31 +1392,12 @@ public char FirstCharOfOneOrMulti()
                     {
                         case RegexNodeKind.One:
                         case RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy when node.M > 0:
-                            return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false);
-
                         case RegexNodeKind.Notone:
                         case RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy when node.M > 0:
-                            return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true);
-
-                        case RegexNodeKind.Multi:
-                            return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false);
-
                         case RegexNodeKind.Set:
                         case RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy when node.M > 0:
-                            Span<char> setChars = stackalloc char[maxSetCharacters];
-                            int numChars;
-                            if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
-                            {
-                                setChars = setChars.Slice(0, numChars);
-                                return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!));
-                            }
-
-                            if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive))
-                            {
-                                Debug.Assert(lowInclusive < highInclusive);
-                                return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!));
-                            }
-                            break;
+                        case RegexNodeKind.Multi:
+                            return node;
 
                         case RegexNodeKind.Atomic:
                         case RegexNodeKind.Concatenate:
@@ -1435,6 +1414,49 @@ public char FirstCharOfOneOrMulti()
             }
         }
 
+        /// <summary>Finds the guaranteed beginning literal(s) of the node, or null if none exists.</summary>
+        /// <returns>
+        /// A tuple of data about the literal: only one of the Range/String/SetChars fields is relevant.
+        /// The Negated value indicates whether the Range/SetChars should be considered exclusionary.
+        /// </returns>
+        public StartingLiteralData? FindStartingLiteral(int maxSetCharacters = 5) // 5 is max optimized by IndexOfAny today
+        {
+            Debug.Assert(maxSetCharacters >= 0 && maxSetCharacters <= 128, $"{nameof(maxSetCharacters)} == {maxSetCharacters} should be small enough to be stack allocated.");
+
+            if (FindStartingLiteralNode() is RegexNode node)
+            {
+                switch (node.Kind)
+                {
+                    case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy:
+                        return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: false);
+
+                    case RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy:
+                        return new StartingLiteralData(range: (node.Ch, node.Ch), @string: null, setChars: null, negated: true);
+
+                    case RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy:
+                        Span<char> setChars = stackalloc char[maxSetCharacters];
+                        int numChars;
+                        if ((numChars = RegexCharClass.GetSetChars(node.Str!, setChars)) != 0)
+                        {
+                            setChars = setChars.Slice(0, numChars);
+                            return new StartingLiteralData(range: default, @string: null, setChars: setChars.ToString(), negated: RegexCharClass.IsNegated(node.Str!));
+                        }
+
+                        if (RegexCharClass.TryGetSingleRange(node.Str!, out char lowInclusive, out char highInclusive))
+                        {
+                            Debug.Assert(lowInclusive < highInclusive);
+                            return new StartingLiteralData(range: (lowInclusive, highInclusive), @string: null, setChars: null, negated: RegexCharClass.IsNegated(node.Str!));
+                        }
+                        break;
+
+                    case RegexNodeKind.Multi:
+                        return new StartingLiteralData(range: default, @string: node.Str, setChars: null, negated: false);
+                }
+            }
+
+            return null;
+        }
+
         /// <summary>Data about a starting literal as returned by <see cref="FindStartingLiteral"/>.</summary>
         public readonly struct StartingLiteralData
         {
@@ -2767,6 +2789,7 @@ static bool ExceedsMaxDepthAllowedDepth(RegexNode node, int allowedDepth)
         }
 
         /// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
+        [MemberNotNullWhen(true, nameof(Str))]
         public bool IsSetFamily => Kind is RegexNodeKind.Set or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic or RegexNodeKind.Setlazy;
 
         /// <summary>Gets whether the node is a One/Oneloop/Oneloopatomic/Onelazy node.</summary>