From 6826ad8b1dd4df7600807ed2ec0d752061008914 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Fri, 14 Feb 2020 17:36:13 -0800 Subject: [PATCH 1/2] Build native case folding layer --- .../Interop.Casing.cs | 6 +- .../System.Globalization.Native/pal_casing.c | 85 +++++++++---------- .../System.Globalization.Native/pal_casing.h | 13 +-- .../pal_collation.c | 40 +++------ .../System.Globalization.Native/pal_icushim.h | 2 + .../src/System/Globalization/TextInfo.Unix.cs | 29 +++---- 6 files changed, 80 insertions(+), 95 deletions(-) diff --git a/src/libraries/Common/src/Interop/Unix/System.Globalization.Native/Interop.Casing.cs b/src/libraries/Common/src/Interop/Unix/System.Globalization.Native/Interop.Casing.cs index 503a864d693c6..06875dbfb55ec 100644 --- a/src/libraries/Common/src/Interop/Unix/System.Globalization.Native/Interop.Casing.cs +++ b/src/libraries/Common/src/Interop/Unix/System.Globalization.Native/Interop.Casing.cs @@ -14,10 +14,10 @@ internal static partial class Globalization [DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_ChangeCase")] internal static extern unsafe void ChangeCase(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity, bool bToUpper); - [DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_ChangeCaseInvariant")] - internal static extern unsafe void ChangeCaseInvariant(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity, bool bToUpper); - [DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_ChangeCaseTurkish")] internal static extern unsafe void ChangeCaseTurkish(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity, bool bToUpper); + + [DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_SimpleCaseFold")] + internal static extern unsafe void SimpleCaseFold(char* src, int srcLen, char* dstBuffer, int 
dstBufferCapacity); } } diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_casing.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_casing.c index 353542390ca7c..7e8fc790c821a 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_casing.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_casing.c @@ -13,11 +13,32 @@ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wsign-conversion" + +// Performs simple case folding of a code point, but forbids a non-ASCII code point +// from folding to an ASCII code point. If this occurs the API will return the original +// code point value. +UChar32 CaseFoldCodePoint(UChar32 codePoint) +{ + UChar32 codePointFolded = u_foldCase(codePoint, U_FOLD_CASE_DEFAULT); + + // Subtracting 0x80 from the code point value will cause ASCII code points to become negative + // and non-ASCII code points to become non-negative. Since these code paths are expected to + // be called when we have a mix of ASCII and non-ASCII chars, this allows the branch condition + // to almost always evaluate to false. + + if (((codePoint - 0x80) ^ (codePointFolded - 0x80)) < 0) + { + codePointFolded = codePoint; + } + + return codePointFolded; +} + /* Function: ChangeCase -Performs upper or lower casing of a string into a new buffer. +Performs simple case mapping (upper or lower) of a string into a new buffer. No special casing is performed beyond that provided by ICU. */ void GlobalizationNative_ChangeCase( @@ -61,13 +82,12 @@ void GlobalizationNative_ChangeCase( /* Function: -ChangeCaseInvariant +ChangeCaseTurkish -Performs upper or lower casing of a string into a new buffer. -Special casing is performed to ensure that invariant casing -matches that of Windows in certain situations, e.g. Turkish i's. +Performs upper or lower casing of a string into a new buffer, performing special +casing for Turkish.
*/ -void GlobalizationNative_ChangeCaseInvariant( +void GlobalizationNative_ChangeCaseTurkish( const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper) { // See algorithmic comment in ChangeCase. @@ -80,11 +100,10 @@ void GlobalizationNative_ChangeCaseInvariant( { while (srcIdx < cwSrcLength) { - // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131) - // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049). - // We special case it to match the Windows invariant behavior. + // In Turkish casing, LATIN SMALL LETTER I (U+0069) upper cases to LATIN + // CAPITAL LETTER I WITH DOT ABOVE (U+0130). U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); - dstCodepoint = ((srcCodepoint == (UChar32)0x0131) ? (UChar32)0x0131 : u_toupper(srcCodepoint)); + dstCodepoint = ((srcCodepoint == (UChar32)0x0069) ? (UChar32)0x0130 : u_toupper(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } @@ -93,11 +112,10 @@ void GlobalizationNative_ChangeCaseInvariant( { while (srcIdx < cwSrcLength) { - // On Windows with InvariantCulture, the LATIN CAPITAL LETTER I WITH DOT ABOVE (U+0130) - // lower cases to itself, whereas with ICU it lower cases to LATIN SMALL LETTER I (U+0069). - // We special case it to match the Windows invariant behavior. + // In Turkish casing, LATIN CAPITAL LETTER I (U+0049) lower cases to + // LATIN SMALL LETTER DOTLESS I (U+0131). U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); - dstCodepoint = ((srcCodepoint == (UChar32)0x0130) ? (UChar32)0x0130 : u_tolower(srcCodepoint)); + dstCodepoint = ((srcCodepoint == (UChar32)0x0049) ?
(UChar32)0x0131 : u_tolower(srcCodepoint)); U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); assert(isError == FALSE && srcIdx == dstIdx); } @@ -106,43 +124,24 @@ void GlobalizationNative_ChangeCaseInvariant( /* Function: -ChangeCaseTurkish +SimpleCaseFold -Performs upper or lower casing of a string into a new buffer, performing special -casing for Turkish. +Performs simple case folding of a string into a new buffer. +Non-ASCII code points are not mapped to ASCII code points. */ -void GlobalizationNative_ChangeCaseTurkish( - const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper) +void GlobalizationNative_SimpleCaseFold( + const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength) { - // See algorithmic comment in ChangeCase. - UBool isError = FALSE; int32_t srcIdx = 0, dstIdx = 0; UChar32 srcCodepoint, dstCodepoint; - if (bToUpper) - { - while (srcIdx < cwSrcLength) - { - // In turkish casing, LATIN SMALL LETTER I (U+0069) upper cases to LATIN - // CAPITAL LETTER I WITH DOT ABOVE (U+0130). - U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); - dstCodepoint = ((srcCodepoint == (UChar32)0x0069) ? (UChar32)0x0130 : u_toupper(srcCodepoint)); - U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); - assert(isError == FALSE && srcIdx == dstIdx); - } - } - else + while (srcIdx < cwSrcLength) { - while (srcIdx < cwSrcLength) - { - // In turkish casing, LATIN CAPITAL LETTER I (U+0049) lower cases to - // LATIN SMALL LETTER DOTLESS I (U+0131). - U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); - dstCodepoint = ((srcCodepoint == (UChar32)0x0049) ?
(UChar32)0x0131 : u_tolower(srcCodepoint)); - U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); - assert(isError == FALSE && srcIdx == dstIdx); - } + U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint); + dstCodepoint = CaseFoldCodePoint(srcCodepoint); + U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); + assert(isError == FALSE && srcIdx == dstIdx); } } diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_casing.h b/src/libraries/Native/Unix/System.Globalization.Native/pal_casing.h index 3ea29dd73b4e7..dcf34675b6fe2 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_casing.h +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_casing.h @@ -5,20 +5,21 @@ #include "pal_compiler.h" #include "pal_locale.h" +UChar32 CaseFoldCodePoint(UChar32 codePoint); + DLLEXPORT void GlobalizationNative_ChangeCase(const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper); -DLLEXPORT void GlobalizationNative_ChangeCaseInvariant(const UChar* lpSrc, - int32_t cwSrcLength, - UChar* lpDst, - int32_t cwDstLength, - int32_t bToUpper); - DLLEXPORT void GlobalizationNative_ChangeCaseTurkish(const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper); + +DLLEXPORT void GlobalizationNative_SimpleCaseFold(const UChar* lpSrc, + int32_t cwSrcLength, + UChar* lpDst, + int32_t cwDstLength); diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index 4b07e752dce8b..ff184adc95834 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -10,6 +10,7 @@ #include #include +#include "pal_casing.h" #include "pal_collation.h" c_static_assert_msg(UCOL_EQUAL == 0, "managed side requires 0 for equal strings"); @@ -527,22 +528,9 @@ AreEqualOrdinalIgnoreCase */
static int AreEqualOrdinalIgnoreCase(UChar32 one, UChar32 two) { - // Return whether the two characters are identical or would be identical if they were upper-cased. + // Return whether the two characters are identical or would be identical if they were case-folded. - if (one == two) - { - return TRUE; - } - - if (one == 0x0131 || two == 0x0131) - { - // On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131) - // capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049). - // We special case it to match the Windows invariant behavior. - return FALSE; - } - - return u_toupper(one) == u_toupper(two); + return (one == two) || (CaseFoldCodePoint(one) == CaseFoldCodePoint(two)); } /* @@ -857,21 +845,21 @@ int32_t GlobalizationNative_CompareStringOrdinalIgnoreCase( U16_NEXT(lpStr2, str2Idx, cwStr2Length, str2Codepoint); #pragma clang diagnostic pop - if (str1Codepoint != str2Codepoint && u_toupper(str1Codepoint) != u_toupper(str2Codepoint)) + if (str1Codepoint == str2Codepoint) { - return str1Codepoint < str2Codepoint ? 
-1 : 1; + continue; // exact code point match } - } - if (cwStr1Length < cwStr2Length) - { - return -1; - } + str1Codepoint = CaseFoldCodePoint(str1Codepoint); + str2Codepoint = CaseFoldCodePoint(str2Codepoint); - if (cwStr2Length < cwStr1Length) - { - return 1; + if (str1Codepoint == str2Codepoint) + { + continue; // case folded code point match + } + + return str1Codepoint - str2Codepoint; // mismatch } - return 0; + return cwStr1Length - cwStr2Length; } diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim.h b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim.h index fff2364026c49..68b54fee46664 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim.h +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_icushim.h @@ -37,6 +37,7 @@ // List of all functions from the ICU libraries that are used in the System.Globalization.Native.so #define FOR_ALL_UNCONDITIONAL_ICU_FUNCTIONS \ PER_FUNCTION_BLOCK(u_charsToUChars, libicuuc) \ + PER_FUNCTION_BLOCK(u_foldCase, libicuuc) \ PER_FUNCTION_BLOCK(u_getVersion, libicuuc) \ PER_FUNCTION_BLOCK(u_strlen, libicuuc) \ PER_FUNCTION_BLOCK(u_strncpy, libicuuc) \ @@ -145,6 +146,7 @@ FOR_ALL_ICU_FUNCTIONS // Redefine all calls to ICU functions as calls through pointers that are set // to the functions of the selected version of ICU in the initialization. #define u_charsToUChars(...) u_charsToUChars_ptr(__VA_ARGS__) +#define u_foldCase(...) u_foldCase_ptr(__VA_ARGS__) #define u_getVersion(...) u_getVersion_ptr(__VA_ARGS__) #define u_strlen(...) u_strlen_ptr(__VA_ARGS__) #define u_strncpy(...)
u_strncpy_ptr(__VA_ARGS__) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Unix.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Unix.cs index 02e7c5f50b969..4d5b28684fe3f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Unix.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Unix.cs @@ -23,33 +23,28 @@ private static bool NeedsTurkishCasing(string localeName) { Debug.Assert(localeName != null); - return CultureInfo.GetCultureInfo(localeName).CompareInfo.Compare("\u0131", "I", CompareOptions.IgnoreCase) == 0; + return localeName.Length > 0 /* invariant has an empty string culture name */ + && CultureInfo.GetCultureInfo(localeName).CompareInfo.Compare("\u0131", "I", CompareOptions.IgnoreCase) == 0; } - private bool IsInvariant { get { return _cultureName.Length == 0; } } - internal unsafe void ChangeCase(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity, bool bToUpper) { Debug.Assert(!GlobalizationMode.Invariant); - if (IsInvariant) + TryAgain: + + if (_needsTurkishCasing == Tristate.False) // most common path + { + Interop.Globalization.ChangeCase(src, srcLen, dstBuffer, dstBufferCapacity, bToUpper); + } + else if (_needsTurkishCasing == Tristate.True) { - Interop.Globalization.ChangeCaseInvariant(src, srcLen, dstBuffer, dstBufferCapacity, bToUpper); + Interop.Globalization.ChangeCaseTurkish(src, srcLen, dstBuffer, dstBufferCapacity, bToUpper); } else { - if (_needsTurkishCasing == Tristate.NotInitialized) - { - _needsTurkishCasing = NeedsTurkishCasing(_textInfoName) ? Tristate.True : Tristate.False; - } - if (_needsTurkishCasing == Tristate.True) - { - Interop.Globalization.ChangeCaseTurkish(src, srcLen, dstBuffer, dstBufferCapacity, bToUpper); - } - else - { - Interop.Globalization.ChangeCase(src, srcLen, dstBuffer, dstBufferCapacity, bToUpper); - } + _needsTurkishCasing = NeedsTurkishCasing(_textInfoName) ?
Tristate.True : Tristate.False; + goto TryAgain; } } From 16fde36318c5b073ac1cd986225001344f540ee2 Mon Sep 17 00:00:00 2001 From: Levi Broderick Date: Fri, 14 Feb 2020 18:30:00 -0800 Subject: [PATCH 2/2] Hook StringComparer.OrdinalIgnoreCase through new system --- .../System.Private.CoreLib.Shared.projitems | 2 + .../src/System/Globalization/CompareInfo.cs | 41 +++++++++---- .../src/System/Globalization/TextInfo.Unix.cs | 9 +++ .../System/Globalization/TextInfo.Windows.cs | 7 +++ .../src/System/Globalization/TextInfo.cs | 61 +++++++++++++++++-- .../System/Marvin.OrdinalIgnoreCase.Unix.cs | 33 ++++++++++ .../Marvin.OrdinalIgnoreCase.Windows.cs | 35 +++++++++++ .../src/System/Marvin.OrdinalIgnoreCase.cs | 10 +-- 8 files changed, 175 insertions(+), 23 deletions(-) create mode 100644 src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.Unix.cs create mode 100644 src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.Windows.cs diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index 2118574c4d786..1562e50521c39 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -1434,6 +1434,7 @@ + @@ -1682,6 +1683,7 @@ + diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs index 3971e9caa7896..496c72f2d87ec 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.cs @@ -7,6 +7,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Serialization; +using System.Text; using System.Text.Unicode; using 
Internal.Runtime.CompilerServices; @@ -485,15 +486,17 @@ internal static int CompareOrdinalIgnoreCase(ref char strA, int lengthA, ref cha ref char charA = ref strA; ref char charB = ref strB; + uint currentA, currentB; + // in InvariantMode we support all range and not only the ascii characters. char maxChar = (GlobalizationMode.Invariant ? (char)0xFFFF : (char)0x7F); - while (length != 0 && charA <= maxChar && charB <= maxChar) + while (length != 0 && (currentA = charA) <= maxChar && (currentB = charB) <= maxChar) { // Ordinal equals or lowercase equals if the result ends up in the a-z range - if (charA == charB || - ((charA | 0x20) == (charB | 0x20) && - (uint)((charA | 0x20) - 'a') <= (uint)('z' - 'a'))) + if (currentA == currentB || + ((currentA | 0x20) == (currentB | 0x20) && + ((currentA | 0x20) - 'a') <= (uint)('z' - 'a'))) { length--; charA = ref Unsafe.Add(ref charA, 1); @@ -501,21 +504,35 @@ internal static int CompareOrdinalIgnoreCase(ref char strA, int lengthA, ref cha } else { - int currentA = charA; - int currentB = charB; + // Simple case map both chars if needed - // Uppercase both chars if needed - if ((uint)(charA - 'a') <= 'z' - 'a') +#pragma warning disable CS0162 // Unreachable code detected: one of the two blocks below isn't relevant depending on target platform + if (TextInfo.CaseFoldToUpper) { - currentA -= 0x20; + if (UnicodeUtility.IsInRangeInclusive(currentA, 'a', 'z')) + { + currentA -= 0x20; + } + if (UnicodeUtility.IsInRangeInclusive(currentB, 'a', 'z')) + { + currentB -= 0x20; + } } - if ((uint)(charB - 'a') <= 'z' - 'a') + else { - currentB -= 0x20; + if (UnicodeUtility.IsInRangeInclusive(currentA, 'A', 'Z')) + { + currentA += 0x20; + } + if (UnicodeUtility.IsInRangeInclusive(currentB, 'A', 'Z')) + { + currentB += 0x20; + } } +#pragma warning restore CS0162 // Unreachable code detected // Return the (case-insensitive) difference between them. 
- return currentA - currentB; + return (int)(currentA - currentB); } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Unix.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Unix.cs index 4d5b28684fe3f..22865d00c7bd7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Unix.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Unix.cs @@ -11,6 +11,8 @@ namespace System.Globalization { public partial class TextInfo { + internal const bool CaseFoldToUpper = false; // ICU folds ASCII characters to lowercase + private Tristate _needsTurkishCasing = Tristate.NotInitialized; private void FinishInitialization() { } @@ -48,5 +50,12 @@ internal unsafe void ChangeCase(char* src, int srcLen, char* dstBuffer, int dstB } } + private unsafe void CaseFold(char* pSource, int pSourceLen, char* pResult, int pResultLen) + { + Debug.Assert(!GlobalizationMode.Invariant); + + Interop.Globalization.SimpleCaseFold(pSource, pSourceLen, pResult, pResultLen); + } + } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Windows.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Windows.cs index 948644769da09..a3d5fd041b4bb 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Windows.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.Windows.cs @@ -8,6 +8,8 @@ namespace System.Globalization { public partial class TextInfo { + internal const bool CaseFoldToUpper = true; // NLS doesn't perform case folding; it normalizes to uppercase + private unsafe void FinishInitialization() { _sortHandle = CompareInfo.GetSortHandle(_textInfoName); @@ -42,6 +44,11 @@ private unsafe void ChangeCase(char* pSource, int pSourceLen, char* pResult, int Debug.Assert(ret == pSourceLen, "Expected getting the same length of the original string"); } + private unsafe void CaseFold(char* 
pSource, int pSourceLen, char* pResult, int pResultLen) + { + throw new PlatformNotSupportedException(); // this code path shouldn't get hit on Windows + } + // PAL Ends here private IntPtr _sortHandle; diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.cs index f8e2e0c451345..f3ddcedcd28ab 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/TextInfo.cs @@ -203,8 +203,12 @@ private void ChangeCaseCommon(ReadOnlySpan source, Span private unsafe void ChangeCaseCommon(ref char source, ref char destination, int charCount) where TConversion : struct { - Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion)); - bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds + Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion) || typeof(TConversion) == typeof(ToCaseFoldConversion)); + + // JIT treats the below as constants + + bool toCaseFold = typeof(TConversion) == typeof(ToCaseFoldConversion); + bool toUpper = typeof(TConversion) == typeof(ToUpperConversion) || (CaseFoldToUpper && typeof(TConversion) == typeof(ToCaseFoldConversion)); Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(charCount >= 0); @@ -301,7 +305,14 @@ private unsafe void ChangeCaseCommon(ref char source, ref char dest // has a case conversion that's different from the invariant culture, even for ASCII data (e.g., tr-TR converts // 'i' (U+0069) to Latin Capital Letter I With Dot Above (U+0130)). 
- ChangeCase(pSource + currIdx, charCount, pDestination + currIdx, charCount, toUpper); + if (toCaseFold) + { + CaseFold(pSource + currIdx, charCount, pDestination + currIdx, charCount); + } + else + { + ChangeCase(pSource + currIdx, charCount, pDestination + currIdx, charCount, toUpper); + } } Return: @@ -310,8 +321,12 @@ private unsafe void ChangeCaseCommon(ref char source, ref char dest private unsafe string ChangeCaseCommon(string source) where TConversion : struct { - Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion)); - bool toUpper = typeof(TConversion) == typeof(ToUpperConversion); // JIT will treat this as a constant in release builds + Debug.Assert(typeof(TConversion) == typeof(ToUpperConversion) || typeof(TConversion) == typeof(ToLowerConversion) || typeof(TConversion) == typeof(ToCaseFoldConversion)); + + // JIT treats the below as constants + + bool toCaseFold = typeof(TConversion) == typeof(ToCaseFoldConversion); + bool toUpper = typeof(TConversion) == typeof(ToUpperConversion) || (CaseFoldToUpper && typeof(TConversion) == typeof(ToCaseFoldConversion)); Debug.Assert(!GlobalizationMode.Invariant); Debug.Assert(source != null); @@ -406,7 +421,14 @@ private unsafe string ChangeCaseCommon(string source) where TConver // and run the culture-aware logic over the remainder of the data fixed (char* pResult = result) { - ChangeCase(pSource + currIdx, source.Length - (int)currIdx, pResult + currIdx, result.Length - (int)currIdx, toUpper); + if (toCaseFold) + { + CaseFold(pSource + currIdx, source.Length - (int)currIdx, pResult + currIdx, result.Length - (int)currIdx); + } + else + { + ChangeCase(pSource + currIdx, source.Length - (int)currIdx, pResult + currIdx, result.Length - (int)currIdx, toUpper); + } } return result; } @@ -572,6 +594,30 @@ internal static char ToUpperAsciiInvariant(char c) return c; } + // For internal use only. Performs simple case folding of the source into the destination. 
+ internal void ToCaseFold(ReadOnlySpan source, Span destination) + { + Debug.Assert(source.Length <= destination.Length); + + if (GlobalizationMode.Invariant) + { +#pragma warning disable CS0162 // Unreachable code detected: one of the two blocks below isn't relevant depending on target platform + if (CaseFoldToUpper) + { + ToUpperAsciiInvariant(source, destination); + } + else + { + ToLowerAsciiInvariant(source, destination); + } +#pragma warning restore CS0162 // Unreachable code detected + } + else + { + ChangeCaseCommon(source, destination); + } + } + private static bool IsAscii(char c) => c < 0x80; private bool IsAsciiCasingSameAsInvariant @@ -870,5 +916,8 @@ private readonly struct ToUpperConversion { } // A dummy struct that is used for 'ToLower' in generic parameters private readonly struct ToLowerConversion { } + + // A dummy struct that is used for 'ToCaseFold' in generic parameters + private readonly struct ToCaseFoldConversion { } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.Unix.cs b/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.Unix.cs new file mode 100644 index 0000000000000..bbf2caffd19e3 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.Unix.cs @@ -0,0 +1,33 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Globalization; +using System.Runtime.CompilerServices; +using System.Text.Unicode; + +namespace System +{ + internal static partial class Marvin + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint CaseFoldTwoAsciiChars(uint value) + { + Debug.Assert(Utf16Utility.AllCharsInUInt32AreAscii(value)); + + // ICU's case folding is a lowercase conversion in the ASCII range. 
+ + return Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int CaseFoldBuffer(ReadOnlySpan source, Span destination) + { + Debug.Assert(destination.Length >= source.Length); + + TextInfo.Invariant.ToCaseFold(source, destination); + return source.Length; // case folding doesn't change the UTF-16 code unit count + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.Windows.cs b/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.Windows.cs new file mode 100644 index 0000000000000..a9b78a0509a57 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.Windows.cs @@ -0,0 +1,35 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Text.Unicode; + +namespace System +{ + internal static partial class Marvin + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint CaseFoldTwoAsciiChars(uint value) + { + Debug.Assert(Utf16Utility.AllCharsInUInt32AreAscii(value)); + + // NLS doesn't have a concept of case folding. Instead, "removing case information" + // means that data is normalized to uppercase. + + return Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int CaseFoldBuffer(ReadOnlySpan source, Span destination) + { + Debug.Assert(destination.Length >= source.Length); + + // NLS doesn't have a concept of case folding. Instead, "removing case information" + // means that data is normalized to uppercase. 
+ + return source.ToUpperInvariant(destination); + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs b/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs index c8ff3fb7eaefe..8e0eea6efa8aa 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Marvin.OrdinalIgnoreCase.cs @@ -38,7 +38,7 @@ public static int ComputeHash32OrdinalIgnoreCase(ref char data, int count, uint { goto NotAscii; } - p0 += Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue); + p0 += CaseFoldTwoAsciiChars(tempValue); Block(ref p0, ref p1); byteOffset += 4; @@ -57,7 +57,7 @@ public static int ComputeHash32OrdinalIgnoreCase(ref char data, int count, uint } // addition is written with -0x80u to allow fall-through to next statement rather than jmp past it - p0 += Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(tempValue) + (0x800000u - 0x80u); + p0 += CaseFoldTwoAsciiChars(tempValue) + (0x800000u - 0x80u); } p0 += 0x80u; @@ -78,10 +78,10 @@ private static int ComputeHash32OrdinalIgnoreCaseSlow(ref char data, int count, char[]? borrowedArr = null; Span scratch = (uint)count <= 64 ? stackalloc char[64] : (borrowedArr = ArrayPool.Shared.Rent(count)); - int charsWritten = new ReadOnlySpan(ref data, count).ToUpperInvariant(scratch); - Debug.Assert(charsWritten == count); // invariant case conversion should involve simple folding; preserve code unit count + int charsWritten = CaseFoldBuffer(new ReadOnlySpan(ref data, count), scratch); + Debug.Assert(charsWritten == count); // simple case folding should preserve code unit count - // Slice the array to the size returned by ToUpperInvariant. + // Slice the array to the size returned by the case folding operation. // Multiplication below will not overflow since going from positive Int32 to UInt32. 
int hash = ComputeHash32(ref Unsafe.As(ref MemoryMarshal.GetReference(scratch)), (uint)charsWritten * 2, p0, p1);