Skip to content

Commit

Permalink
Add numerical ordering option for string comparison operations (#109861)
Browse files Browse the repository at this point in the history
Add numerical ordering option for string comparison operations
  • Loading branch information
PranavSenthilnathan authored Nov 25, 2024
1 parent 8f68b59 commit 45bd118
Show file tree
Hide file tree
Showing 20 changed files with 381 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,9 @@ public static string GetDistroVersionString()
// Negation of IsHybridGlobalization.
public static bool IsNotHybridGlobalization => !IsHybridGlobalization;
// Negation of IsHybridGlobalizationOnApplePlatform.
public static bool IsNotHybridGlobalizationOnApplePlatform => !IsHybridGlobalizationOnApplePlatform;

// Numeric comparison is reported as supported everywhere except when hybrid globalization
// is active on an Apple platform.
// This can be removed once numeric comparisons are supported on Apple platforms
public static bool IsNumericComparisonSupported => !IsHybridGlobalizationOnApplePlatform;

// HG (hybrid globalization) on apple platforms implies ICU.
// ICU globalization requires that invariant globalization is off and that either HG on an Apple
// platform is active or ICUVersion compares greater than 0.0.0.0.
public static bool IsIcuGlobalization => !IsInvariantGlobalization && (IsHybridGlobalizationOnApplePlatform || ICUVersion > new Version(0, 0, 0, 0));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,13 @@ private unsafe SortKey IcuCreateSortKey(string source, CompareOptions options)
throw new ArgumentException(SR.Argument_InvalidFlag, nameof(options));
}

#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
AssertComparisonSupported(options);
}
#endif

byte[] keyData;
fixed (char* pSource = source)
{
Expand Down Expand Up @@ -776,6 +783,11 @@ private unsafe int IcuGetSortKey(ReadOnlySpan<char> source, Span<byte> destinati
throw new PlatformNotSupportedException(GetPNSEWithReason("GetSortKey", "non-invariant culture"));
return InvariantGetSortKey(source, destination, options);
}
#elif TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
AssertComparisonSupported(options);
}
#endif

// It's ok to pass nullptr (for empty buffers) to ICU's sort key routines.
Expand Down Expand Up @@ -827,6 +839,11 @@ private unsafe int IcuGetSortKeyLength(ReadOnlySpan<char> source, CompareOptions
throw new PlatformNotSupportedException(GetPNSEWithReason("GetSortKeyLength", "non-invariant culture"));
return InvariantGetSortKeyLength(source, options);
}
#elif TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
AssertComparisonSupported(options);
}
#endif

// It's ok to pass nullptr (for empty buffers) to ICU's sort key routines.
Expand Down Expand Up @@ -889,6 +906,11 @@ private unsafe int IcuGetHashCodeOfString(ReadOnlySpan<char> source, CompareOpti
ReadOnlySpan<char> sanitizedSource = SanitizeForInvariantHash(source, options);
return InvariantGetHashCode(sanitizedSource, options);
}
#elif TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
AssertComparisonSupported(options);
}
#endif

// according to ICU User Guide the performance of ucol_getSortKey is worse when it is called with null output buffer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,7 @@ private static unsafe bool NlsIsSortable(ReadOnlySpan<char> text)
private const int NORM_IGNOREWIDTH = 0x00020000; // Does not differentiate between a single-byte character and the same character as a double-byte character.
private const int NORM_LINGUISTIC_CASING = 0x08000000; // use linguistic rules for casing
private const int SORT_STRINGSORT = 0x00001000; // Treats punctuation the same as symbols.
private const int SORT_DIGITSASNUMBERS = 0x00000008; // Treat digits as numbers during sorting, for example, sort "2" before "10".

private static int GetNativeCompareFlags(CompareOptions options)
{
Expand All @@ -595,6 +596,7 @@ private static int GetNativeCompareFlags(CompareOptions options)
if ((options & CompareOptions.IgnoreSymbols) != 0) { nativeCompareFlags |= NORM_IGNORESYMBOLS; }
if ((options & CompareOptions.IgnoreWidth) != 0) { nativeCompareFlags |= NORM_IGNOREWIDTH; }
if ((options & CompareOptions.StringSort) != 0) { nativeCompareFlags |= SORT_STRINGSORT; }
if ((options & CompareOptions.NumericOrdering) != 0) { nativeCompareFlags |= SORT_DIGITSASNUMBERS; }

// TODO: Can we arrange for GetNativeCompareFlags to never
// take Ordinal or OrdinalIgnoreCase? This value is not part of Win32; we just handle it specially.
Expand All @@ -607,6 +609,7 @@ private static int GetNativeCompareFlags(CompareOptions options)
CompareOptions.IgnoreNonSpace |
CompareOptions.IgnoreSymbols |
CompareOptions.IgnoreWidth |
CompareOptions.NumericOrdering |
CompareOptions.StringSort)) == 0) ||
(options == CompareOptions.Ordinal), "[CompareInfo.GetNativeCompareFlags]Expected all flags to be handled");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,19 +175,19 @@ private ReadOnlySpan<char> SanitizeForInvariantHash(ReadOnlySpan<char> source, C
}

// Returns true when the options contain a flag that is rejected for indexing operations:
// IgnoreSymbols or NumericOrdering. Any other flags are ignored by this check.
private static bool IndexingOptionsNotSupported(CompareOptions options) =>
    (options & (CompareOptions.IgnoreSymbols | CompareOptions.NumericOrdering)) != 0;

// Returns true for option combinations that are rejected by this check:
//   - IgnoreWidth in any combination, or
//   - IgnoreNonSpace when IgnoreKanaType is not also set.
private static bool CompareOptionsNotSupported(CompareOptions options) =>
    (options & CompareOptions.IgnoreWidth) == CompareOptions.IgnoreWidth ||
    ((options & CompareOptions.IgnoreNonSpace) == CompareOptions.IgnoreNonSpace && (options & CompareOptions.IgnoreKanaType) == 0);

// Builds the message text for the case where hybrid globalization does not support
// the given compare options; the options value is substituted into the resource string.
private static string GetPNSE(CompareOptions options)
{
    return SR.Format(SR.PlatformNotSupported_HybridGlobalizationWithCompareOptions, options);
}

// Returns true when the options/culture pairing is rejected by this check:
//   - IgnoreKanaType (optionally with NumericOrdering) on a culture whose primary language tag is not "ja"
//     (a null or empty culture name counts as not "ja"), or
//   - None (optionally with NumericOrdering) on a culture whose primary language tag is "ja".
// Every other option value returns false regardless of culture.
private static bool CompareOptionsNotSupportedForCulture(CompareOptions options, string cultureName)
{
    // NumericOrdering does not take part in the culture decision, so mask it out first.
    CompareOptions effective = options & ~CompareOptions.NumericOrdering;
    if (effective != CompareOptions.IgnoreKanaType && effective != CompareOptions.None)
    {
        return false;
    }

    // Guard against a null/empty name before splitting; the primary language tag is the text before the first '-'.
    bool isJapanese = !string.IsNullOrEmpty(cultureName) && cultureName.Split('-')[0] == "ja";

    return effective == CompareOptions.IgnoreKanaType ? !isJapanese : isJapanese;
}

private static string GetPNSEForCulture(CompareOptions options, string cultureName) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ public sealed partial class CompareInfo : IDeserializationCallback
// Mask used to check if Compare() / GetHashCode(string) / GetSortKey has the right flags.
// It is the complement of the combinable flags listed below, so a non-zero result of
// (options & ValidCompareMaskOffFlags) means some bit outside that list is set.
private const CompareOptions ValidCompareMaskOffFlags =
    ~(CompareOptions.IgnoreCase | CompareOptions.IgnoreSymbols | CompareOptions.IgnoreNonSpace |
      CompareOptions.IgnoreWidth | CompareOptions.IgnoreKanaType | CompareOptions.StringSort |
      CompareOptions.NumericOrdering);

// Cache the invariant CompareInfo
internal static readonly CompareInfo Invariant = CultureInfo.InvariantCulture.CompareInfo;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,75 @@

namespace System.Globalization
{
    /// <summary>
    /// Specifies the options that control how <see cref="CompareInfo"/> compares strings.
    /// </summary>
    [Flags]
    public enum CompareOptions
    {
        /// <summary>
        /// The default option settings for string comparisons.
        /// </summary>
        None = 0,

        /// <summary>
        /// The comparison ignores case.
        /// </summary>
        IgnoreCase = 1 << 0,

        /// <summary>
        /// The comparison ignores nonspacing combining characters, such as diacritics.
        /// The <see href="https://go.microsoft.com/fwlink/?linkid=37123">Unicode Standard</see> defines combining characters as
        /// characters that are combined with base characters to produce a new character. Nonspacing combining characters
        /// do not occupy a spacing position by themselves when rendered.
        /// </summary>
        IgnoreNonSpace = 1 << 1,

        /// <summary>
        /// The comparison ignores symbols, such as white-space characters, punctuation, currency symbols,
        /// the percent sign, mathematical symbols, the ampersand, and so on.
        /// </summary>
        IgnoreSymbols = 1 << 2,

        /// <summary>
        /// The comparison ignores the Kana type. Kana type refers to Japanese hiragana and katakana characters,
        /// which represent phonetic sounds in the Japanese language. Hiragana is used for native Japanese expressions
        /// and words, while katakana is used for words borrowed from other languages, such as "computer" or "Internet".
        /// A phonetic sound can be expressed in both hiragana and katakana. When this value is selected, the hiragana
        /// character for one sound is considered equal to the katakana character for the same sound.
        /// </summary>
        IgnoreKanaType = 1 << 3,

        /// <summary>
        /// The comparison ignores the character width. For example, Japanese katakana characters can be written as
        /// full-width or half-width. When this value is selected, the katakana characters written as full-width are
        /// considered equal to the same characters written as half-width.
        /// </summary>
        IgnoreWidth = 1 << 4,

        /// <summary>
        /// The comparison sorts sequences of digits (Unicode general category "Nd") by their numeric value.
        /// For example, "2" comes before "10". Non-digit characters such as decimal points, minus or plus signs, etc.
        /// are not considered part of the sequence and terminate it. This flag is not valid for indexing
        /// (such as <see cref="CompareInfo.IndexOf(string, string, CompareOptions)"/>,
        /// <see cref="CompareInfo.IsPrefix(string, string, CompareOptions)"/>, etc.).
        /// </summary>
        NumericOrdering = 1 << 5,

        /// <summary>
        /// The comparison ignores case and then performs an ordinal comparison. This technique is equivalent to
        /// converting the string to uppercase using the invariant culture and then performing an ordinal comparison
        /// on the result. This value cannot be combined with other <see cref="CompareOptions" /> values and must be used alone.
        /// </summary>
        OrdinalIgnoreCase = 1 << 28, // This flag can not be used with other flags.

        /// <summary>
        /// The comparison uses the string sort algorithm. In a string sort, the hyphen and the apostrophe,
        /// as well as other nonalphanumeric symbols, come before alphanumeric characters.
        /// </summary>
        StringSort = 1 << 29,

        /// <summary>
        /// The comparison uses successive Unicode UTF-16 encoded values of the string (code unit by code unit comparison),
        /// leading to a fast comparison but one that is culture-insensitive. A string starting with a code unit XXXX16
        /// comes before a string starting with YYYY16, if XXXX16 is less than YYYY16. This value cannot be combined with
        /// other <see cref="CompareOptions" /> values and must be used alone.
        /// </summary>
        Ordinal = 1 << 30, // This flag can not be used with other flags.
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,9 @@ public sealed class CultureAwareComparer : StringComparer, IAlternateEqualityCom
internal static readonly CultureAwareComparer InvariantCaseSensitiveInstance = new CultureAwareComparer(CompareInfo.Invariant, CompareOptions.None);
internal static readonly CultureAwareComparer InvariantIgnoreCaseInstance = new CultureAwareComparer(CompareInfo.Invariant, CompareOptions.IgnoreCase);

// Complement of the combinable CompareOptions flags listed below; any bit that survives
// (options & ValidCompareMaskOffFlags) is not one of those flags.
private const CompareOptions ValidCompareMaskOffFlags =
    ~(CompareOptions.IgnoreCase | CompareOptions.IgnoreSymbols | CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreKanaType |
      CompareOptions.IgnoreWidth | CompareOptions.NumericOrdering | CompareOptions.StringSort);

private readonly CompareInfo _compareInfo; // Do not rename (binary serialization)
private readonly CompareOptions _options;
Expand Down
1 change: 1 addition & 0 deletions src/libraries/System.Runtime/ref/System.Runtime.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9215,6 +9215,7 @@ public enum CompareOptions
IgnoreSymbols = 4,
IgnoreKanaType = 8,
IgnoreWidth = 16,
NumericOrdering = 32,
OrdinalIgnoreCase = 268435456,
StringSort = 536870912,
Ordinal = 1073741824,
Expand Down
Loading

0 comments on commit 45bd118

Please sign in to comment.