-
Notifications
You must be signed in to change notification settings - Fork 4.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Changing the logic for how we deal with RegexOptions.IgnoreCase match…
…ing. (#67184) * Changing the logic for how we deal with RegexOptions.IgnoreCase matching. * Addressing first round of feedback * Addressing more feedback. * - Ensure that Backreferences use the same case behavior that the casing table does when using IgnoreCase. - Addressing more feedback. * Apply suggestions from code review Co-authored-by: Stephen Toub <stoub@microsoft.com> * Address more feedback * Fix allocation regression for patterns with a lot of ascii letters * Skip few tests in Browser and .NET Framework * Skip one more test that shouldn't be ran on wasm * Address more PR Feedback * More feedback * Skip tests that are failing in NLS-globalization queues Co-authored-by: Stephen Toub <stoub@microsoft.com>
- Loading branch information
1 parent
b4c76da
commit 90908d5
Showing
44 changed files
with
2,281 additions
and
1,800 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
282 changes: 94 additions & 188 deletions
282
src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
7 changes: 6 additions & 1 deletion
7
.../System.Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunner.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
8 changes: 6 additions & 2 deletions
8
....Text.RegularExpressions/src/System/Text/RegularExpressions/CompiledRegexRunnerFactory.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +1,28 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
|
||
using System.Globalization; | ||
using System.Reflection.Emit; | ||
|
||
namespace System.Text.RegularExpressions | ||
{ | ||
/// <summary>Factory that produces <see cref="CompiledRegexRunner"/> instances backed by a <see cref="DynamicMethod"/>.</summary> | ||
// NOTE(review): this listing appears to be a rendered diff; the one-parameter constructor signature and the first | ||
// CreateDelegate line look like the pre-change versions of the lines that immediately follow them - confirm against the real file. | ||
internal sealed class CompiledRegexRunnerFactory : RegexRunnerFactory | ||
{ | ||
// Dynamic method that is wrapped in a ScanDelegate the first time CreateInstance runs. | ||
private readonly DynamicMethod _scanMethod; | ||
/// <summary>This field will only be set if the pattern has backreferences and uses RegexOptions.IgnoreCase</summary> | ||
private readonly CultureInfo? _culture; | ||
|
||
// Delegate is lazily created to avoid forcing JIT'ing until the regex is actually executed. | ||
private CompiledRegexRunner.ScanDelegate? _scan; | ||
|
||
// Stores the scan method and the (possibly null) culture; both are handed to each runner created by CreateInstance. | ||
public CompiledRegexRunnerFactory(DynamicMethod scanMethod) | ||
public CompiledRegexRunnerFactory(DynamicMethod scanMethod, CultureInfo? culture) | ||
{ | ||
_scanMethod = scanMethod; | ||
_culture = culture; | ||
} | ||
|
||
// Creates a new runner. The delegate is built from _scanMethod only while _scan is still null (via ??=) and is reused afterwards. | ||
protected internal override RegexRunner CreateInstance() => | ||
new CompiledRegexRunner( | ||
_scan ??= _scanMethod.CreateDelegate<CompiledRegexRunner.ScanDelegate>()); | ||
_scan ??= _scanMethod.CreateDelegate<CompiledRegexRunner.ScanDelegate>(), _culture); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
39 changes: 39 additions & 0 deletions
39
...es/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCaseBehavior.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
|
||
using System.Globalization; | ||
|
||
namespace System.Text.RegularExpressions | ||
{ | ||
/// <summary> | ||
/// When a regular expression specifies the option <see cref="RegexOptions.IgnoreCase"/>, comparisons between the input and the | ||
/// pattern will be made case-insensitively. In order to support this, we need to define which case mappings shall be used for the comparisons. | ||
/// A case mapping exists whenever you have two characters 'A' and 'B', where either 'A' is the ToLower() representation of 'B' or both 'A' and 'B' lowercase to the | ||
/// same character. Note that we don't consider a mapping when the only relationship between 'A' and 'B' is that one is the ToUpper() representation of the other. This | ||
/// is for backwards compatibility since, in Regex, we have only ever considered ToLower() for case-insensitive comparisons. Given that the case mappings vary depending on the culture, | ||
/// Regex supports 3 main different behaviors or mappings: Invariant, NonTurkish, and Turkish. This is in order to match the current ToLower() behavior of all cultures | ||
/// supported by .NET. As a side note, there should be no cases where 'A'.ToLower() == 'B' but 'A'.ToLower() != 'B'.ToLower(). This aspect is important since | ||
/// for backreferences we use a.ToLower() == b.ToLower() for comparisons, so if there were such a case it would lead to inconsistencies between how we handle | ||
/// backreferences vs how we handle other case-insensitive comparisons. | ||
/// </summary> | ||
internal enum RegexCaseBehavior | ||
{ | ||
/// <summary> | ||
/// Invariant case-mappings are used. This includes all of the common mappings across cultures. This behavior is used when either the user | ||
/// specified <see cref="RegexOptions.CultureInvariant"/> or when the CurrentCulture is <see cref="CultureInfo.InvariantCulture"/>. | ||
/// </summary> | ||
Invariant, | ||
|
||
/// <summary> | ||
/// These are all the same mappings used by Invariant behavior, with an additional one: \u0130 => \u0069 | ||
/// This mode will be used when CurrentCulture is neither Invariant nor any of the tr/az cultures. | ||
/// </summary> | ||
NonTurkish, | ||
|
||
/// <summary> | ||
/// These are all the same mappings used by non-Turkish behavior, with the exception of \u0049 => \u0069, which does not exist | ||
/// in this behavior, and with the additional mapping \u0069 => \u0131. This mode will be used when CurrentCulture is any of the tr/az cultures. | ||
/// NOTE(review): in tr/az cultures it is \u0049 ('I') that lowercases to \u0131, so the additional mapping may have been meant as \u0049 => \u0131 - confirm. | ||
/// </summary> | ||
Turkish | ||
} | ||
} |
Oops, something went wrong.