Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expand Regex One/Notone/Setlazy simple code gen support to Lazy #61784

Merged
merged 1 commit into from
Nov 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,7 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm,

int labelCounter = 0;
// Produces a new label name: the given prefix followed by the current value of
// labelCounter, which is incremented afterwards so successive calls yield distinct names.
string DefineLabel(string prefix = "L") => $"{prefix}{labelCounter++}";
void MarkLabel(string label) => writer.WriteLine($"{label}:");
// Writes "<label>:" to the writer. When addEmptyStatement is true, " ;" is appended so the
// label is immediately followed by an empty statement, for places where the label would
// otherwise have nothing after it.
void MarkLabel(string label, bool addEmptyStatement = false) => writer.WriteLine($"{label}:{(addEmptyStatement ? " ;" : "")}");
// Writes a "goto <label>;" statement to the writer.
void Goto(string label) => writer.WriteLine($"goto {label};");
string doneLabel = "NoMatch";
string originalDoneLabel = doneLabel;
Expand All @@ -772,17 +772,11 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm,

// Emit failure
writer.WriteLine("// No match");
MarkLabel(originalDoneLabel);
MarkLabel(originalDoneLabel, !expressionHasCaptures);
if (expressionHasCaptures)
{
EmitUncaptureUntil("0");
}
else
{
// We can't have a label at the end of the method, so explicitly
// add a "return;" if the End label would otherwise be an issue.
writer.WriteLine("return;");
}
return;

static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0;
Expand Down Expand Up @@ -1174,15 +1168,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitAtomicNodeLoop(node);
break;

case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
case RegexNode.Lazyloop:
// An atomic lazy loop amounts to doing the minimum amount of work possible.
// That means iterating as little as is required, which means a repeater
// for the min, and if min is 0, doing nothing.
Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic));
if (node.M > 0)
{
EmitNodeRepeater(node);
}
EmitLazy(node, emitLengthChecksIfRequired);
break;

case RegexNode.Alternate:
Expand All @@ -1195,12 +1185,6 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
EmitConcatenation(node, subsequent, emitLengthChecksIfRequired);
break;
Expand Down Expand Up @@ -1682,17 +1666,27 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
// It's left pointing to the backtracking label for everything subsequent in the expression.
}

void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
{
bool isSingleChar = node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily;

// Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
// as the lazy itself failed to match.
if (node.M > 0)
{
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
if (isSingleChar)
{
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
}
else
{
EmitNodeRepeater(node);
}
}

// If the whole thing was actually that repeater, we're done.
if (node.M == node.N)
// If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic
// lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum.
if (node.M == node.N || node.Next is { Type: RegexNode.Atomic })
{
return;
}
Expand Down Expand Up @@ -1762,7 +1756,15 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
// for the next time we backtrack.
writer.WriteLine($"runtextpos = {nextPos};");
LoadTextSpanLocal(writer);
EmitSingleChar(node);
if (isSingleChar)
{
EmitSingleChar(node);
}
else
{
writer.WriteLine();
EmitNode(node.Child(0));
}
TransferTextSpanPosToRunTextPos();
writer.WriteLine($"{nextPos} = runtextpos;");

Expand All @@ -1772,7 +1774,7 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes

writer.WriteLine();
MarkLabel(endLoopLabel);
MarkLabel(endLoopLabel, addEmptyStatement: true);

// We explicitly do not reset doneLabel back to originalDoneLabel.
// It's left pointing to the backtracking label for everything subsequent in the expression.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2094,15 +2094,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitAtomicNodeLoop(node);
break;

case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
case RegexNode.Lazyloop:
// An atomic lazy loop amounts to doing the minimum amount of work possible.
// That means iterating as little as is required, which means a repeater
// for the min, and if min is 0, doing nothing.
Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic));
if (node.M > 0)
{
EmitNodeRepeater(node);
}
EmitLazy(node, emitLengthChecksIfRequired);
break;

case RegexNode.Atomic:
Expand All @@ -2119,12 +2115,6 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
EmitConcatenation(node, subsequent, emitLengthChecksIfRequired);
break;
Expand Down Expand Up @@ -2558,17 +2548,27 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
MarkLabel(endLoop);
}

void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
{
bool isSingleChar = node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily;

// Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
// as the lazy itself failed to match.
if (node.M > 0)
{
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
if (isSingleChar)
{
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
}
else
{
EmitNodeRepeater(node);
}
}

// If the whole thing was actually that repeater, we're done.
if (node.M == node.N)
// If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic
// lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum.
if (node.M == node.N || node.Next is { Type: RegexNode.Atomic })
{
return;
}
Expand Down Expand Up @@ -2657,7 +2657,14 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
Ldloc(nextPos);
Stloc(runtextposLocal);
LoadTextSpanLocal();
EmitSingleChar(node);
if (isSingleChar)
{
EmitSingleChar(node);
}
else
{
EmitNode(node.Child(0));
}
TransferTextSpanPosToRunTextPos();
Ldloc(runtextposLocal);
Stloc(nextPos);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -407,14 +407,6 @@ internal RegexNode FinalOptimize()
}
}

// Optimization: Unnecessary root atomic.
// If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing
// to backtrack into it, so we can remove it.
while (rootNode.Child(0).Type == Atomic)
{
rootNode.ReplaceChild(0, rootNode.Child(0).Child(0));
}

// Done optimizing. Return the final tree.
#if DEBUG
rootNode.ValidateFinalTreeInvariants();
Expand Down Expand Up @@ -2250,42 +2242,30 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
case Setlazy:
Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type.");
supported = M == N || AncestorsAllowBacktracking(Next);
// Follows the Next links starting at the supplied node and returns true only if every
// node reached is a Concatenate, Capture, or Atomic; any other node type yields false.
// A null starting node returns true.
// NOTE(review): Next appears to be the parent link, per this function's name - confirm.
static bool AncestorsAllowBacktracking(RegexNode? node)
{
while (node is not null)
{
switch (node.Type)
{
case Concatenate:
case Capture:
case Atomic:
// This node type is acceptable; keep walking along the Next chain.
node = node.Next;
break;

default:
// Any other node type on the chain ends the walk with a negative answer.
return false;
}
}

// Reached the end of the chain without encountering a disallowed node type.
return true;
}
break;

// {Lazy}Loop repeaters are the same, except their child also needs to be supported.
// Loop repeaters are the same, except their child also needs to be supported.
// We also support such loops being atomic.
case Loop:
case Lazyloop:
supported =
(M == N || (Next != null && Next.Type == Atomic)) &&
Child(0).SupportsSimplifiedCodeGenerationImplementation();
break;

// We can handle atomic as long as we can handle making its child atomic, or
// its child doesn't have that concept.
case Atomic:
// Similarly, as long as the wrapped node supports simplified code gen,
// Lazy is supported if it's a repeater or atomic, but also if it's in
// a place where backtracking is allowed (e.g. it's top-level).
case Lazyloop:
supported =
(M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) &&
Child(0).SupportsSimplifiedCodeGenerationImplementation();
break;

// We can handle atomic as long as its child is supported.
// Lookahead assertions also only require that the child node be supported.
// The RightToLeft check earlier is important to differentiate lookbehind,
// which is not supported.
case Atomic:
case Require:
case Prevent:
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();
Expand Down Expand Up @@ -2370,6 +2350,26 @@ static bool AncestorsAllowBacktracking(RegexNode? node)
}
#endif
return supported;

// Follows the Next links starting at the supplied node and reports whether every node
// reached is a Concatenate, Capture, or Atomic. The first node of any other type makes
// the answer false; an empty chain (null start) makes it true.
// NOTE(review): Next appears to be the parent link, per this function's name - confirm.
static bool AncestorsAllowBacktracking(RegexNode? node)
{
    for (RegexNode? current = node; current is not null; current = current.Next)
    {
        // Only these three node types are acceptable along the chain.
        if (current.Type is not (Concatenate or Capture or Atomic))
        {
            return false;
        }
    }

    return true;
}
}

/// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
Expand Down