Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU comparison routines should use case folding, not case mapping #8

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ internal static partial class Globalization
[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_ChangeCase")]
internal static extern unsafe void ChangeCase(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity, bool bToUpper);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_ChangeCaseInvariant")]
internal static extern unsafe void ChangeCaseInvariant(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity, bool bToUpper);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_ChangeCaseTurkish")]
internal static extern unsafe void ChangeCaseTurkish(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity, bool bToUpper);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_SimpleCaseFold")]
internal static extern unsafe void SimpleCaseFold(char* src, int srcLen, char* dstBuffer, int dstBufferCapacity);
}
}
85 changes: 42 additions & 43 deletions src/libraries/Native/Unix/System.Globalization.Native/pal_casing.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,32 @@
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wsign-conversion"


// Performs simple case folding of a single code point using ICU's default folding
// (u_foldCase with U_FOLD_CASE_DEFAULT), but refuses any fold that would cross the
// ASCII / non-ASCII boundary (for example a non-ASCII code point folding to an ASCII
// one). When such a crossing would occur, the original code point is returned unchanged.
//
// codePoint: the code point to fold.
// Returns:   the folded code point, or codePoint itself if the fold was rejected.
static UChar32 CaseFoldCodePoint(UChar32 codePoint)
{
    UChar32 codePointFolded = u_foldCase(codePoint, U_FOLD_CASE_DEFAULT);

    // Subtracting 0x80 from the code point value will cause ASCII code points to become negative
    // and non-ASCII code points to become non-negative. XOR-ing the two differences therefore
    // yields a negative value exactly when one side is ASCII and the other is not. Since these
    // code paths are expected to be called when we have a mix of ASCII and non-ASCII chars, this
    // allows the branch condition to almost always evaluate to false.
    //
    // The XOR must be parenthesized: in C, '<' binds tighter than '^', so without the extra
    // parentheses the expression would be evaluated as a ^ (b < 0), which is non-zero for
    // nearly every input and would discard the folded value.
    if (((codePoint - 0x80) ^ (codePointFolded - 0x80)) < 0)
    {
        codePointFolded = codePoint;
    }

    return codePointFolded;
}

/*
Function:
ChangeCase

Performs upper or lower casing of a string into a new buffer.
Performs simple case mapping (upper or lower) of a string into a new buffer.
No special casing is performed beyond that provided by ICU.
*/
void GlobalizationNative_ChangeCase(
Expand Down Expand Up @@ -61,13 +82,12 @@ void GlobalizationNative_ChangeCase(

/*
Function:
ChangeCaseInvariant
ChangeCaseTurkish

Performs upper or lower casing of a string into a new buffer.
Special casing is performed to ensure that invariant casing
matches that of Windows in certain situations, e.g. Turkish i's.
Performs upper or lower casing of a string into a new buffer, performing special
casing for Turkish.
*/
void GlobalizationNative_ChangeCaseInvariant(
void GlobalizationNative_ChangeCaseTurkish(
const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper)
{
// See algorithmic comment in ChangeCase.
Expand All @@ -80,11 +100,10 @@ void GlobalizationNative_ChangeCaseInvariant(
{
while (srcIdx < cwSrcLength)
{
// On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131)
// capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049).
// We special case it to match the Windows invariant behavior.
// In turkish casing, LATIN SMALL LETTER I (U+0069) upper cases to LATIN
// CAPITAL LETTER I WITH DOT ABOVE (U+0130).
U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
dstCodepoint = ((srcCodepoint == (UChar32)0x0131) ? (UChar32)0x0131 : u_toupper(srcCodepoint));
dstCodepoint = ((srcCodepoint == (UChar32)0x0069) ? (UChar32)0x0130 : u_toupper(srcCodepoint));
U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
assert(isError == FALSE && srcIdx == dstIdx);
}
Expand All @@ -93,11 +112,10 @@ void GlobalizationNative_ChangeCaseInvariant(
{
while (srcIdx < cwSrcLength)
{
// On Windows with InvariantCulture, the LATIN CAPITAL LETTER I WITH DOT ABOVE (U+0130)
// lower cases to itself, whereas with ICU it lower cases to LATIN SMALL LETTER I (U+0069).
// We special case it to match the Windows invariant behavior.
// In turkish casing, LATIN CAPITAL LETTER I (U+0049) lower cases to
// LATIN SMALL LETTER DOTLESS I (U+0131).
U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
dstCodepoint = ((srcCodepoint == (UChar32)0x0130) ? (UChar32)0x0130 : u_tolower(srcCodepoint));
dstCodepoint = ((srcCodepoint == (UChar32)0x0049) ? (UChar32)0x0131 : u_tolower(srcCodepoint));
U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
assert(isError == FALSE && srcIdx == dstIdx);
}
Expand All @@ -106,43 +124,24 @@ void GlobalizationNative_ChangeCaseInvariant(

/*
Function:
ChangeCaseTurkish
SimpleCaseFold

Performs upper or lower casing of a string into a new buffer, performing special
casing for Turkish.
Performs simple case folding of a string into a new buffer.
Non-ASCII code points are not mapped to ASCII code points.
*/
void GlobalizationNative_ChangeCaseTurkish(
const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength, int32_t bToUpper)
void GlobalizationNative_SimpleCaseFold(
const UChar* lpSrc, int32_t cwSrcLength, UChar* lpDst, int32_t cwDstLength)
{
// See algorithmic comment in ChangeCase.

UBool isError = FALSE;
int32_t srcIdx = 0, dstIdx = 0;
UChar32 srcCodepoint, dstCodepoint;

if (bToUpper)
{
while (srcIdx < cwSrcLength)
{
// In turkish casing, LATIN SMALL LETTER I (U+0069) upper cases to LATIN
// CAPITAL LETTER I WITH DOT ABOVE (U+0130).
U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
dstCodepoint = ((srcCodepoint == (UChar32)0x0069) ? (UChar32)0x0130 : u_toupper(srcCodepoint));
U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
assert(isError == FALSE && srcIdx == dstIdx);
}
}
else
while (srcIdx < cwSrcLength)
{
while (srcIdx < cwSrcLength)
{
// In turkish casing, LATIN CAPITAL LETTER I (U+0049) lower cases to
// LATIN SMALL LETTER DOTLESS I (U+0131).
U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
dstCodepoint = ((srcCodepoint == (UChar32)0x0049) ? (UChar32)0x0131 : u_tolower(srcCodepoint));
U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
assert(isError == FALSE && srcIdx == dstIdx);
}
U16_NEXT(lpSrc, srcIdx, cwSrcLength, srcCodepoint);
dstCodepoint = CaseFoldCodePoint(srcCodepoint);
U16_APPEND(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
assert(isError == FALSE && srcIdx == dstIdx);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,21 @@
#include "pal_compiler.h"
#include "pal_locale.h"

// Forward declaration of the simple case-folding helper for a single code point.
// NOTE(review): this prototype is `static`, which gives the function internal linkage in
// every translation unit that includes this header. A source file that includes this header
// and calls CaseFoldCodePoint without providing its own definition cannot bind to a
// definition located in a different .c file - confirm whether the function should instead
// be declared (and defined) without `static`.
static UChar32 CaseFoldCodePoint(UChar32 codePoint);

DLLEXPORT void GlobalizationNative_ChangeCase(const UChar* lpSrc,
int32_t cwSrcLength,
UChar* lpDst,
int32_t cwDstLength,
int32_t bToUpper);

DLLEXPORT void GlobalizationNative_ChangeCaseInvariant(const UChar* lpSrc,
int32_t cwSrcLength,
UChar* lpDst,
int32_t cwDstLength,
int32_t bToUpper);

DLLEXPORT void GlobalizationNative_ChangeCaseTurkish(const UChar* lpSrc,
int32_t cwSrcLength,
UChar* lpDst,
int32_t cwDstLength,
int32_t bToUpper);

DLLEXPORT void GlobalizationNative_SimpleCaseFold(const UChar* lpSrc,
int32_t cwSrcLength,
UChar* lpDst,
int32_t cwDstLength);
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <search.h>
#include <string.h>

#include "pal_casing.h"
#include "pal_collation.h"

c_static_assert_msg(UCOL_EQUAL == 0, "managed side requires 0 for equal strings");
Expand Down Expand Up @@ -527,22 +528,9 @@ AreEqualOrdinalIgnoreCase
*/
static int AreEqualOrdinalIgnoreCase(UChar32 one, UChar32 two)
{
// Return whether the two characters are identical or would be identical if they were upper-cased.
// Return whether the two characters are identical or would be identical if they were case-folded.

if (one == two)
{
return TRUE;
}

if (one == 0x0131 || two == 0x0131)
{
// On Windows with InvariantCulture, the LATIN SMALL LETTER DOTLESS I (U+0131)
// capitalizes to itself, whereas with ICU it capitalizes to LATIN CAPITAL LETTER I (U+0049).
// We special case it to match the Windows invariant behavior.
return FALSE;
}

return u_toupper(one) == u_toupper(two);
return (one == two) || (CaseFoldCodePoint(one) == CaseFoldCodePoint(two));
}

/*
Expand Down Expand Up @@ -857,21 +845,21 @@ int32_t GlobalizationNative_CompareStringOrdinalIgnoreCase(
U16_NEXT(lpStr2, str2Idx, cwStr2Length, str2Codepoint);
#pragma clang diagnostic pop

if (str1Codepoint != str2Codepoint && u_toupper(str1Codepoint) != u_toupper(str2Codepoint))
if (str1Codepoint == str2Codepoint)
{
return str1Codepoint < str2Codepoint ? -1 : 1;
continue; // exact code point match
}
}

if (cwStr1Length < cwStr2Length)
{
return -1;
}
str1Codepoint = CaseFoldCodePoint(str1Codepoint);
str2Codepoint = CaseFoldCodePoint(str2Codepoint);

if (cwStr2Length < cwStr1Length)
{
return 1;
if (str1Codepoint == str2Codepoint)
{
continue; // case folded code point match
}

return str1Codepoint - str2Codepoint; // mismatch
}

return 0;
return cwStr1Length - cwStr2Length;
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
// List of all functions from the ICU libraries that are used in the System.Globalization.Native.so
#define FOR_ALL_UNCONDITIONAL_ICU_FUNCTIONS \
PER_FUNCTION_BLOCK(u_charsToUChars, libicuuc) \
PER_FUNCTION_BLOCK(u_foldCase, libicuuc) \
PER_FUNCTION_BLOCK(u_getVersion, libicuuc) \
PER_FUNCTION_BLOCK(u_strlen, libicuuc) \
PER_FUNCTION_BLOCK(u_strncpy, libicuuc) \
Expand Down Expand Up @@ -145,6 +146,7 @@ FOR_ALL_ICU_FUNCTIONS
// Redefine all calls to ICU functions as calls through pointers that are set
// to the functions of the selected version of ICU in the initialization.
// Each macro name must match the ICU function name exactly (macro names are
// case-sensitive); a misspelled name leaves calls to that function unredirected.
#define u_charsToUChars(...) u_charsToUChars_ptr(__VA_ARGS__)
#define u_foldCase(...) u_foldCase_ptr(__VA_ARGS__)
#define u_getVersion(...) u_getVersion_ptr(__VA_ARGS__)
#define u_strlen(...) u_strlen_ptr(__VA_ARGS__)
#define u_strncpy(...) u_strncpy_ptr(__VA_ARGS__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1434,6 +1434,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\IO\PathHelper.Windows.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\IO\PathInternal.Windows.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\IO\DisableMediaInsertionPrompt.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Marvin.OrdinalIgnoreCase.Windows.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\PasteArguments.Windows.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Loader\LibraryNameVariation.Windows.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\MemoryFailPoint.Windows.cs" />
Expand Down Expand Up @@ -1682,6 +1683,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\IO\PathInternal.Unix.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\IO\PersistedFiles.Unix.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\IO\PersistedFiles.Names.Unix.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Marvin.OrdinalIgnoreCase.Unix.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\PasteArguments.Unix.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\Loader\LibraryNameVariation.Unix.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Runtime\MemoryFailPoint.Unix.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Serialization;
using System.Text;
using System.Text.Unicode;
using Internal.Runtime.CompilerServices;

Expand Down Expand Up @@ -485,37 +486,53 @@ internal static int CompareOrdinalIgnoreCase(ref char strA, int lengthA, ref cha
ref char charA = ref strA;
ref char charB = ref strB;

uint currentA, currentB;

// in InvariantMode we support all range and not only the ascii characters.
char maxChar = (GlobalizationMode.Invariant ? (char)0xFFFF : (char)0x7F);

while (length != 0 && charA <= maxChar && charB <= maxChar)
while (length != 0 && (currentA = charA) <= maxChar && (currentB = charB) <= maxChar)
{
// Ordinal equals or lowercase equals if the result ends up in the a-z range
if (charA == charB ||
((charA | 0x20) == (charB | 0x20) &&
(uint)((charA | 0x20) - 'a') <= (uint)('z' - 'a')))
if (currentA == currentB ||
((currentA | 0x20) == (currentB | 0x20) &&
((currentA | 0x20) - 'a') <= (uint)('z' - 'a')))
{
length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
}
else
{
int currentA = charA;
int currentB = charB;
// Simple case map both chars if needed

// Uppercase both chars if needed
if ((uint)(charA - 'a') <= 'z' - 'a')
#pragma warning disable CS0162 // Unreachable code detected: one of the two blocks below isn't relevant depending on target platform
if (TextInfo.CaseFoldToUpper)
{
currentA -= 0x20;
if (UnicodeUtility.IsInRangeInclusive(currentA, 'a', 'z'))
{
currentA -= 0x20;
}
if (UnicodeUtility.IsInRangeInclusive(currentB, 'a', 'z'))
{
currentB -= 0x20;
}
}
if ((uint)(charB - 'a') <= 'z' - 'a')
else
{
currentB -= 0x20;
if (UnicodeUtility.IsInRangeInclusive(currentA, 'A', 'Z'))
{
currentA += 0x20;
}
if (UnicodeUtility.IsInRangeInclusive(currentB, 'A', 'Z'))
{
currentB += 0x20;
}
}
#pragma warning restore CS0162 // Unreachable code detected

// Return the (case-insensitive) difference between them.
return currentA - currentB;
return (int)(currentA - currentB);
}
}

Expand Down
Loading