Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically anchor regexes beginning with .* #1706

Merged
merged 1 commit into from
Jan 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -174,16 +174,26 @@ private void MakeRep(int type, int min, int max)
}

/// <summary>Performs additional optimizations on an entire tree prior to being used.</summary>
/// <remarks>
/// Some optimizations are performed by the parser while parsing, and others are performed
/// as nodes are being added to the tree. The optimizations here expect the tree to be fully
/// formed, as they inspect relationships between nodes that may not have been in place as
/// individual nodes were being processed/added to the tree.
/// </remarks>
internal RegexNode FinalOptimize()
{
RegexNode rootNode = this;
Debug.Assert(rootNode.Type == Capture && rootNode.ChildCount() == 1);
Debug.Assert(rootNode.Type == Capture);
Debug.Assert(rootNode.Next is null);
Debug.Assert(rootNode.ChildCount() == 1);

if ((Options & RegexOptions.RightToLeft) == 0) // only apply optimization when LTR to avoid needing additional code for the rarer RTL case
{
// Optimization: backtracking removal at expression end.
// If we find a backtracking construct at the end of the regex, we can instead make it non-backtracking,
// since nothing would ever backtrack into it anyway. Doing this then makes the construct available
// to implementations that don't support backtracking.
if ((Options & RegexOptions.RightToLeft) == 0 && // only apply optimization when LTR to avoid needing additional code for the rarer RTL case
(Options & RegexOptions.Compiled) != 0) // only apply when we're compiling, as that's the only time it would make a meaningful difference
if ((Options & RegexOptions.Compiled) != 0) // only apply when we're compiling, as that's the only time it would make a meaningful difference
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
{
// Walk the tree, starting from the sole child of the root implicit capture.
RegexNode node = rootNode.Child(0);
Expand Down Expand Up @@ -231,6 +241,60 @@ internal RegexNode FinalOptimize()
}
}

// Optimization: implicit anchoring.
// If the expression begins with a .* loop, add an anchor to the beginning:
// - If Singleline is set such that '.' eats anything, the .* will zip to the end of the string and then backtrack through
// the whole thing looking for a match; since it will have examined everything, there's no benefit to examining it all
// again, and we can anchor to beginning.
// - If Singleline is not set, then '.' eats anything up until a '\n' and backtracks from there, so we can similarly avoid
// re-examining that content and anchor to the beginning of lines.
// We are currently very conservative here, only examining concat nodes. This could be loosened in the future, e.g. to
// explore captures (but think through any implications of there being a back ref to that capture), to explore loops and
// lazy loops with a positive minimum (but the anchor shouldn't be part of the loop), to explore alternations and support adding
// an anchor if all of them begin with appropriate star loops (though this could also be accomplished by factoring out the
// loops to be before the alternation), etc.
{
RegexNode node = rootNode.Child(0); // skip implicit root capture node
while (true)
{
bool singleline = (node.Options & RegexOptions.Singleline) != 0;
switch (node.Type)
{
case Concatenate:
node = node.Child(0);
continue;

case Setloop when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass:
case Setloopatomic when singleline && node.N == int.MaxValue && node.Str == RegexCharClass.AnyClass:
case Notoneloop when !singleline && node.N == int.MaxValue && node.Ch == '\n':
case Notoneloopatomic when !singleline && node.N == int.MaxValue && node.Ch == '\n':
RegexNode? parent = node.Next;
var anchor = new RegexNode(singleline ? Beginning : Bol, node.Options);
Debug.Assert(parent != null);
if (parent.Type == Concatenate)
{
Debug.Assert(parent.ChildCount() >= 2);
Debug.Assert(parent.Children is List<RegexNode>);
anchor.Next = parent;
((List<RegexNode>)parent.Children).Insert(0, anchor);
}
else
{
Debug.Assert(parent.Type == Capture && parent.Next is null, "Only valid capture is the implicit root capture");
var concat = new RegexNode(Concatenate, parent.Options);
concat.AddChild(anchor);
concat.AddChild(node);
parent.ReplaceChild(0, concat);
}
break;
}

break;
}
}
}

// Optimization: Unnecessary root atomic.
// If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing
// to backtrack into it, so we can remove it.
if (rootNode.Child(0).Type == Atomic)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,49 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}){2}", "acdfbcdfacefbcefbcefbcdfacdef", RegexOptions.None, new string[] { "acdfbcdfacefbcefbcefbcdf" } };
yield return new object[] { null, @"(?:(?:[ab]c[de]f){3}hello){2}", "aaaaaacdfbcdfacefhellobcefbcefbcdfhellooooo", RegexOptions.None, new string[] { "acdfbcdfacefhellobcefbcefbcdfhello" } };

// Anchoring loops beginning with .* / .+
yield return new object[] { null, @".*", "", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @".*", "\n\n\n\n", RegexOptions.Singleline, new string[] { "\n\n\n\n" } };
yield return new object[] { null, @".*[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "1" } };
yield return new object[] { null, @"(?s).*(?-s)[1a]", "1\n\n\n\n", RegexOptions.None, new string[] { "1" } };
yield return new object[] { null, @"(?s).*(?-s)[1a]", "\n\n\n\n1", RegexOptions.None, new string[] { "\n\n\n\n1" } };
yield return new object[] { null, @".*|.*|.*", "", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".*123|abc", "abc\n123", RegexOptions.Singleline, new string[] { "abc\n123" } };
yield return new object[] { null, @".*", "\n", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @".*\n", "\n", RegexOptions.None, new string[] { "\n" } };
yield return new object[] { null, @".*", "\n", RegexOptions.Singleline, new string[] { "\n" } };
yield return new object[] { null, @".*\n", "\n", RegexOptions.Singleline, new string[] { "\n" } };
yield return new object[] { null, @".*", "abc", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".*abc", "abc", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".*abc|ghi", "ghi", RegexOptions.None, new string[] { "ghi" } };
yield return new object[] { null, @".*abc|.*ghi", "abcghi", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".*abc|.*ghi", "bcghi", RegexOptions.None, new string[] { "bcghi" } };
yield return new object[] { null, @".*abc|.+c", " \n \n bc", RegexOptions.None, new string[] { " bc" } };
yield return new object[] { null, @".*abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } };
yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } };
yield return new object[] { null, @".*abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } };
yield return new object[] { null, @"(.*)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } };
yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.None, new string[] { "123\nabc" } };
yield return new object[] { null, @".*\nabc", "\n123\nabc", RegexOptions.Singleline, new string[] { "\n123\nabc" } };
yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc abc abc" } };
yield return new object[] { null, @".*abc", "abc abc abc \nabc", RegexOptions.Singleline, new string[] { "abc abc abc \nabc" } };
yield return new object[] { null, @".*?abc", "abc abc abc \nabc", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.None, new string[] { "123abc" } };
yield return new object[] { null, @"[^\n]*abc", "123abc\n456abc\n789abc", RegexOptions.Singleline, new string[] { "123abc" } };
yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.None, new string[] { "456abc" } };
yield return new object[] { null, @"[^\n]*abc", "123ab\n456abc\n789abc", RegexOptions.Singleline, new string[] { "456abc" } };
yield return new object[] { null, @".+", "a", RegexOptions.None, new string[] { "a" } };
yield return new object[] { null, @".+", "\nabc", RegexOptions.None, new string[] { "abc" } };
yield return new object[] { null, @".+", "\n", RegexOptions.Singleline, new string[] { "\n" } };
yield return new object[] { null, @".+", "\nabc", RegexOptions.Singleline, new string[] { "\nabc" } };
yield return new object[] { null, @".+abc", "aaaabc", RegexOptions.None, new string[] { "aaaabc" } };
yield return new object[] { null, @".+abc", "12345 abc", RegexOptions.None, new string[] { "12345 abc" } };
yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.None, new string[] { " abc" } };
yield return new object[] { null, @".+abc", "12345\n abc", RegexOptions.Singleline, new string[] { "12345\n abc" } };
yield return new object[] { null, @"(.+)abc\1", "\n12345abc12345", RegexOptions.Singleline, new string[] { "12345abc12345", "12345" } };

// Grouping Constructs Invalid Regular Expressions
yield return new object[] { null, @"()", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
yield return new object[] { null, @"(?<cat>)", "cat", RegexOptions.None, new string[] { string.Empty, string.Empty } };
Expand Down