Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Char.GetUnicodeCategory to return correct results #41200

Merged
merged 2 commits into from
Aug 24, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 30 additions & 35 deletions src/libraries/System.Private.CoreLib/src/System/Char.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ namespace System
// standard.
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
private static ReadOnlySpan<byte> Latin1CharInfo => new byte[]
{
// 0 1 2 3 4 5 6 7 8 9 A B C D E F
0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x8E, 0x8E, 0x8E, 0x8E, 0x0E, 0x0E, // U+0000..U+000F
0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0010..U+001F
0x8B, 0x18, 0x18, 0x18, 0x1A, 0x18, 0x18, 0x18, 0x14, 0x15, 0x18, 0x19, 0x18, 0x13, 0x18, 0x18, // U+0020..U+002F
Expand All @@ -64,25 +65,19 @@ namespace System
0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x14, 0x19, 0x15, 0x19, 0x0E, // U+0070..U+007F
0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x8E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0080..U+008F
0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, // U+0090..U+009F
0x8B, 0x18, 0x1A, 0x1A, 0x1A, 0x1A, 0x1C, 0x1C, 0x1B, 0x1C, 0x21, 0x16, 0x19, 0x13, 0x1C, 0x1B, // U+00A0..U+00AF
0x1C, 0x19, 0x0A, 0x0A, 0x1B, 0x21, 0x1C, 0x18, 0x1B, 0x0A, 0x21, 0x17, 0x0A, 0x0A, 0x0A, 0x18, // U+00B0..U+00BF
0x8B, 0x18, 0x1A, 0x1A, 0x1A, 0x1A, 0x1C, 0x14, 0x1B, 0x1C, 0x04, 0x16, 0x19, 0x0F, 0x1C, 0x1B, // U+00A0..U+00AF
0x1C, 0x19, 0x0A, 0x0A, 0x1B, 0x21, 0x18, 0x18, 0x1B, 0x0A, 0x04, 0x17, 0x0A, 0x0A, 0x0A, 0x18, // U+00B0..U+00BF
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, // U+00C0..U+00CF
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x19, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x21, // U+00D0..U+00DF
0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // U+00E0..U+00EF
0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x19, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, 0x21, // U+00F0..U+00FF
};

// Return true for all characters below or equal to U+00ff, which is ASCII + Latin-1 Supplement.
private static bool IsLatin1(char ch)
{
return (uint)ch < (uint)Latin1CharInfo.Length;
}
private static bool IsLatin1(char ch) => (uint)ch < (uint)Latin1CharInfo.Length;

// Return true for all characters below or equal to U+007f, which is ASCII.
private static bool IsAscii(char ch)
{
return (uint)ch <= '\x007f';
}
private static bool IsAscii(char ch) => (uint)ch <= '\x007f';

// Return the Unicode category for a Unicode character <= 0x00ff.
private static UnicodeCategory GetLatin1UnicodeCategory(char ch)
Expand All @@ -96,7 +91,7 @@ private static UnicodeCategory GetLatin1UnicodeCategory(char ch)
//

//
// Overriden Instance Methods
// Overridden Instance Methods
//

// Calculate a hashcode for a 2 byte Unicode character.
Expand Down Expand Up @@ -200,7 +195,7 @@ public static bool TryParse([NotNullWhen(true)] string? s, out char result)
//
// Static Methods
//
/*=================================ISDIGIT======================================
/*=================================IsDigit======================================
**A wrapper for char. Returns a boolean indicating whether **
**character c is considered to be a digit. **
==============================================================================*/
Expand All @@ -226,17 +221,17 @@ internal static bool CheckLetter(UnicodeCategory uc)
return IsInRange(uc, UnicodeCategory.UppercaseLetter, UnicodeCategory.OtherLetter);
}

/*=================================ISLETTER=====================================
/*=================================IsLetter=====================================
**A wrapper for char. Returns a boolean indicating whether **
**character c is considered to be a letter. **
==============================================================================*/
// Determines whether a character is a letter.
public static bool IsLetter(char c)
{
if (IsLatin1(c))
if (IsAscii(c))
{
// For the version of the Unicode standard the Char type is locked to, the
// Latin-1 range doesn't include letters in categories other than "upper" and "lower".
// ASCII range doesn't include letters in categories other than "upper" and "lower".
return (Latin1CharInfo[c] & (IsUpperCaseLetterFlag | IsLowerCaseLetterFlag)) != 0;
}
return CheckLetter(CharUnicodeInfo.GetUnicodeCategory(c));
Expand All @@ -248,7 +243,7 @@ private static bool IsWhiteSpaceLatin1(char c)
return (Latin1CharInfo[c] & IsWhiteSpaceFlag) != 0;
}

/*===============================ISWHITESPACE===================================
/*===============================IsWhiteSpace===================================
**A wrapper for char. Returns a boolean indicating whether **
**character c is considered to be a whitespace character. **
==============================================================================*/
Expand All @@ -264,7 +259,7 @@ public static bool IsWhiteSpace(char c)
}

/*===================================IsUpper====================================
**Arguments: c -- the characater to be checked.
**Arguments: c -- the character to be checked.
**Returns: True if c is an uppercase character.
==============================================================================*/
// Determines whether a character is upper-case.
Expand All @@ -278,7 +273,7 @@ public static bool IsUpper(char c)
}

/*===================================IsLower====================================
**Arguments: c -- the characater to be checked.
**Arguments: c -- the character to be checked.
**Returns: True if c is a lowercase character.
==============================================================================*/
// Determines whether a character is lower-case.
Expand All @@ -297,7 +292,7 @@ internal static bool CheckPunctuation(UnicodeCategory uc)
}

/*================================IsPunctuation=================================
**Arguments: c -- the characater to be checked.
**Arguments: c -- the character to be checked.
**Returns: True if c is a punctuation mark
==============================================================================*/
// Determines whether a character is a punctuation mark.
Expand Down Expand Up @@ -340,7 +335,7 @@ public static char ToUpper(char c, CultureInfo culture)
return culture.TextInfo.ToUpper(c);
}

/*=================================TOUPPER======================================
/*=================================ToUpper======================================
**A wrapper for char.ToUpperCase. Converts character c to its **
**uppercase equivalent. If c is already an uppercase character or is not an **
**alphabetic, nothing happens. **
Expand All @@ -367,7 +362,7 @@ public static char ToLower(char c, CultureInfo culture)
return culture.TextInfo.ToLower(c);
}

/*=================================TOLOWER======================================
/*=================================ToLower======================================
**A wrapper for char.ToLowerCase. Converts character c to its **
**lowercase equivalent. If c is already a lowercase character or is not an **
**alphabetic, nothing happens. **
Expand Down Expand Up @@ -498,7 +493,7 @@ public static bool IsDigit(string s, int index)
{
return IsInRange(c, '0', '9');
}
return CharUnicodeInfo.GetUnicodeCategory(s, index) == UnicodeCategory.DecimalDigitNumber;
return CharUnicodeInfo.GetUnicodeCategoryInternal(s, index) == UnicodeCategory.DecimalDigitNumber;
}

public static bool IsLetter(string s, int index)
Expand All @@ -510,12 +505,12 @@ public static bool IsLetter(string s, int index)
throw new ArgumentOutOfRangeException(nameof(index));
}
char c = s[index];
if (IsLatin1(c))
if (IsAscii(c))
GrabYourPitchforks marked this conversation as resolved.
Show resolved Hide resolved
{
// The Latin-1 range doesn't include letters in categories other than "upper" and "lower"
// The ASCII range doesn't include letters in categories other than "upper" and "lower"
return (Latin1CharInfo[c] & (IsUpperCaseLetterFlag | IsLowerCaseLetterFlag)) != 0;
}
return CheckLetter(CharUnicodeInfo.GetUnicodeCategory(s, index));
return CheckLetter(CharUnicodeInfo.GetUnicodeCategoryInternal(s, index));
}

public static bool IsLetterOrDigit(string s, int index)
Expand All @@ -530,8 +525,8 @@ public static bool IsLetterOrDigit(string s, int index)
if (IsLatin1(c))
{
return CheckLetterOrDigit(GetLatin1UnicodeCategory(c));
}
return CheckLetterOrDigit(CharUnicodeInfo.GetUnicodeCategory(s, index));
}
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
return CheckLetterOrDigit(CharUnicodeInfo.GetUnicodeCategoryInternal(s, index));
}

public static bool IsLower(string s, int index)
Expand All @@ -548,7 +543,7 @@ public static bool IsLower(string s, int index)
return (Latin1CharInfo[c] & IsLowerCaseLetterFlag) != 0;
}

return CharUnicodeInfo.GetUnicodeCategory(s, index) == UnicodeCategory.LowercaseLetter;
return CharUnicodeInfo.GetUnicodeCategoryInternal(s, index) == UnicodeCategory.LowercaseLetter;
}

/*=================================CheckNumber=====================================
Expand Down Expand Up @@ -590,7 +585,7 @@ public static bool IsNumber(string s, int index)
}
return CheckNumber(GetLatin1UnicodeCategory(c));
}
return CheckNumber(CharUnicodeInfo.GetUnicodeCategory(s, index));
return CheckNumber(CharUnicodeInfo.GetUnicodeCategoryInternal(s, index));
}

////////////////////////////////////////////////////////////////////////
Expand All @@ -614,11 +609,11 @@ public static bool IsPunctuation(string s, int index)
{
return CheckPunctuation(GetLatin1UnicodeCategory(c));
}
return CheckPunctuation(CharUnicodeInfo.GetUnicodeCategory(s, index));
return CheckPunctuation(CharUnicodeInfo.GetUnicodeCategoryInternal(s, index));
}

/*================================= CheckSeparator ============================
** Check if the specified UnicodeCategory belongs to the seprator categories.
** Check if the specified UnicodeCategory belongs to the separator categories.
==============================================================================*/

internal static bool CheckSeparator(UnicodeCategory uc)
Expand Down Expand Up @@ -655,7 +650,7 @@ public static bool IsSeparator(string s, int index)
{
return IsSeparatorLatin1(c);
}
return CheckSeparator(CharUnicodeInfo.GetUnicodeCategory(s, index));
return CheckSeparator(CharUnicodeInfo.GetUnicodeCategoryInternal(s, index));
}

public static bool IsSurrogate(char c)
Expand Down Expand Up @@ -707,7 +702,7 @@ public static bool IsSymbol(string s, int index)
{
return CheckSymbol(GetLatin1UnicodeCategory(c));
}
return CheckSymbol(CharUnicodeInfo.GetUnicodeCategory(s, index));
return CheckSymbol(CharUnicodeInfo.GetUnicodeCategoryInternal(s, index));
}

public static bool IsUpper(string s, int index)
Expand All @@ -724,7 +719,7 @@ public static bool IsUpper(string s, int index)
return (Latin1CharInfo[c] & IsUpperCaseLetterFlag) != 0;
}

return CharUnicodeInfo.GetUnicodeCategory(s, index) == UnicodeCategory.UppercaseLetter;
return CharUnicodeInfo.GetUnicodeCategoryInternal(s, index) == UnicodeCategory.UppercaseLetter;
}

public static bool IsWhiteSpace(string s, int index)
Expand Down Expand Up @@ -779,7 +774,7 @@ public static double GetNumericValue(string s, int index)
{
throw new ArgumentOutOfRangeException(nameof(index));
}
return CharUnicodeInfo.GetNumericValue(s, index);
return CharUnicodeInfo.GetNumericValueInternal(s, index);
}

/*================================= IsHighSurrogate ============================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using System.Text;
using System.Text.Unicode;
using Internal.Runtime.CompilerServices;
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
using System.Runtime.CompilerServices;

namespace System.Globalization
{
Expand Down Expand Up @@ -224,9 +225,12 @@ public static double GetNumericValue(string s, int index)
ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index);
}

return GetNumericValueNoBoundsCheck((uint)GetCodePointFromString(s, index));
return GetNumericValueInternal(s, index);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static double GetNumericValueInternal(string s, int index) => GetNumericValueNoBoundsCheck((uint)GetCodePointFromString(s, index));

private static double GetNumericValueNoBoundsCheck(uint codePoint)
{
nuint offset = GetNumericGraphemeTableOffsetNoBoundsChecks(codePoint);
Expand Down
Loading