Skip to content

Commit

Permalink
Normalization APIs using the spans (dotnet#110465)
Browse files Browse the repository at this point in the history
* Normalization APIs using the spans

* Address the feedback

* Update src/libraries/System.Private.CoreLib/src/System/Globalization/Normalization.Icu.cs

Co-authored-by: Günther Foidl <gue@korporal.at>

* Fix comment indent

---------

Co-authored-by: Günther Foidl <gue@korporal.at>
Co-authored-by: Eric StJohn <ericstj@microsoft.com>
  • Loading branch information
3 people authored Dec 9, 2024
1 parent 016d356 commit a5af0ab
Show file tree
Hide file tree
Showing 8 changed files with 461 additions and 89 deletions.
57 changes: 30 additions & 27 deletions src/libraries/System.Private.CoreLib/src/Resources/Strings.resx
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--
Microsoft ResX Schema
<!--
Microsoft ResX Schema
Version 2.0
The primary goal of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
The primary goal of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
Example:
... ado.net/XML headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
Expand All @@ -26,36 +26,36 @@
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
</data>
There are any number of "resheader" rows that contain simple
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name and a value. The row also contains a
type or mimetype. Type corresponds to a .NET class that supports
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
Each data row contains a name and a value. The row also contains a
type or mimetype. Type corresponds to a .NET class that supports
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/x-microsoft.net.object.binary.base64 is the format
that the ResXResourceWriter will generate, however the reader can
Note - application/x-microsoft.net.object.binary.base64 is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/x-microsoft.net.object.binary.base64
value : The object must be serialized with
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.soap.base64
value : The object must be serialized with
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.bytearray.base64
value : The object must be serialized into a byte array
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
-->
Expand Down Expand Up @@ -1315,6 +1315,9 @@
<data name="Argument_InvalidNormalizationForm" xml:space="preserve">
<value>Invalid or unsupported normalization form.</value>
</data>
<data name="Argument_UnsupportedNormalizationFormInBrowser" xml:space="preserve">
<value>`NormalizationForm.FormKC` and `NormalizationForm.FormKD` are not supported in browser environments or WebAssembly.</value>
</data>
<data name="Argument_InvalidNumberStyles" xml:space="preserve">
<value>An undefined NumberStyles value is being used.</value>
</data>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,33 @@ namespace System.Globalization
{
internal static partial class Normalization
{
private static unsafe bool IcuIsNormalized(string strInput, NormalizationForm normalizationForm)
private static unsafe bool IcuIsNormalized(ReadOnlySpan<char> source, NormalizationForm normalizationForm)
{
Debug.Assert(!GlobalizationMode.Invariant);
Debug.Assert(!GlobalizationMode.UseNls);
Debug.Assert(!source.IsEmpty);
Debug.Assert(normalizationForm is NormalizationForm.FormC or NormalizationForm.FormD or NormalizationForm.FormKC or NormalizationForm.FormKD);

ValidateArguments(strInput, normalizationForm);
ValidateArguments(source, normalizationForm, nameof(source));

int ret;
fixed (char* pInput = strInput)
fixed (char* pInput = source)
{
#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
ret = Interop.Globalization.IsNormalizedNative(normalizationForm, pInput, strInput.Length);
ret = Interop.Globalization.IsNormalizedNative(normalizationForm, pInput, source.Length);
}
else
#endif
{
ret = Interop.Globalization.IsNormalized(normalizationForm, pInput, strInput.Length);
ret = Interop.Globalization.IsNormalized(normalizationForm, pInput, source.Length);
}
}

if (ret == -1)
{
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source));
}

return ret == 1;
Expand All @@ -44,6 +46,7 @@ private static unsafe string IcuNormalize(string strInput, NormalizationForm nor
{
Debug.Assert(!GlobalizationMode.Invariant);
Debug.Assert(!GlobalizationMode.UseNls);
Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD);

ValidateArguments(strInput, normalizationForm);

Expand Down Expand Up @@ -114,25 +117,95 @@ private static unsafe string IcuNormalize(string strInput, NormalizationForm nor
}
}

private static void ValidateArguments(string strInput, NormalizationForm normalizationForm)
/// <summary>
/// Attempts to normalize <paramref name="source"/> into <paramref name="destination"/> by calling
/// Interop.Globalization.NormalizeString (or NormalizeStringNative when GlobalizationMode.Hybrid is
/// set on Mac Catalyst, iOS and tvOS builds).
/// </summary>
/// <param name="source">The characters to normalize; asserted to be non-empty.</param>
/// <param name="destination">The buffer passed to the native call as its output.</param>
/// <param name="charsWritten">Receives the length reported by the native call on success; otherwise 0.</param>
/// <param name="normalizationForm">The requested form; asserted to be FormC, FormD, FormKC or FormKD.</param>
/// <returns>
/// true when the reported length is non-negative and no larger than destination.Length; false when the
/// reported length exceeds destination.Length. The empty-destination check below also returns false.
/// </returns>
/// <exception cref="ArgumentException">The native call reported a negative length.</exception>
/// <remarks>
/// Exceptions thrown by ValidateArguments propagate to the caller.
/// NOTE(review): this text comes from a rendered diff without +/- markers, so a few pre-change lines
/// appear interleaved with the new body; they are flagged individually below and should be confirmed
/// against the actual file.
/// </remarks>
private static unsafe bool IcuTryNormalize(ReadOnlySpan<char> source, Span<char> destination, out int charsWritten, NormalizationForm normalizationForm = NormalizationForm.FormC)
{
// NOTE(review): `strInput` is not a parameter of this method; this looks like a pre-change line
// carried over by the diff rendering -- confirm.
Debug.Assert(strInput != null);
Debug.Assert(!GlobalizationMode.Invariant);
Debug.Assert(!GlobalizationMode.UseNls);
Debug.Assert(!source.IsEmpty);
Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD);

// NOTE(review): the next condition has no body of its own and is immediately followed by another
// `if`; it looks like the pre-change browser/WASI check shown next to its replacement -- confirm.
if ((OperatingSystem.IsBrowser() || OperatingSystem.IsWasi())&& (normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD))
// An empty destination is reported as failure before any validation or native call is made.
if (destination.IsEmpty)
{
// NOTE(review): the comment and `throw` below appear to belong to the pre-change browser/WASI
// check (the throw would make the two statements after it unreachable) -- confirm.
// Browser's ICU doesn't contain data needed for FormKC and FormKD
throw new PlatformNotSupportedException();
charsWritten = 0;
return false;
}

// Validate the requested form and the input, reporting failures against the `source` parameter name.
ValidateArguments(source, normalizationForm, nameof(source));

int realLen;
// Pin both spans so raw pointers can be handed to the native call.
fixed (char* pInput = source)
fixed (char* pDest = destination)
{
#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
realLen = Interop.Globalization.NormalizeStringNative(normalizationForm, pInput, source.Length, pDest, destination.Length);
}
else
#endif
{
realLen = Interop.Globalization.NormalizeString(normalizationForm, pInput, source.Length, pDest, destination.Length);
}
}

// A negative length from the native call is surfaced as an invalid character sequence in `source`.
if (realLen < 0)
{
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source));
}

// Success only when the reported length fits within the destination.
if (realLen <= destination.Length)
{
charsWritten = realLen;
return true;
}

// Reported length is larger than the destination: report failure with nothing written.
charsWritten = 0;
return false;
}

/// <summary>
/// Returns the length reported by Interop.Globalization.NormalizeString (or NormalizeStringNative when
/// GlobalizationMode.Hybrid is set on Mac Catalyst, iOS and tvOS builds) when it is called for
/// <paramref name="source"/> with a null destination of length 0.
/// </summary>
/// <param name="source">The characters to measure; asserted to be non-empty.</param>
/// <param name="normalizationForm">The requested form; asserted to be FormC, FormD, FormKC or FormKD.</param>
/// <returns>The non-negative length reported by the native call.</returns>
/// <exception cref="ArgumentException">The native call reported a negative length.</exception>
/// <remarks>
/// Exceptions thrown by ValidateArguments propagate to the caller.
/// NOTE(review): this text comes from a rendered diff without +/- markers; the condition just before
/// the final return looks like a pre-change line and should be confirmed against the actual file.
/// </remarks>
private static unsafe int IcuGetNormalizedLength(ReadOnlySpan<char> source, NormalizationForm normalizationForm)
{
Debug.Assert(!GlobalizationMode.Invariant);
Debug.Assert(!GlobalizationMode.UseNls);
Debug.Assert(!source.IsEmpty);
Debug.Assert(normalizationForm == NormalizationForm.FormC || normalizationForm == NormalizationForm.FormD || normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD);

// Validate the requested form and the input, reporting failures against the `source` parameter name.
ValidateArguments(source, normalizationForm, nameof(source));

int realLen;
// Pin the input so a raw pointer can be handed to the native call.
fixed (char* pInput = source)
{
#if TARGET_MACCATALYST || TARGET_IOS || TARGET_TVOS
if (GlobalizationMode.Hybrid)
{
// No output buffer is supplied (null, 0); presumably the native side then only reports the
// required length -- verify against the native implementation.
realLen = Interop.Globalization.NormalizeStringNative(normalizationForm, pInput, source.Length, null, 0);
}
else
#endif
{
realLen = Interop.Globalization.NormalizeString(normalizationForm, pInput, source.Length, null, 0);
}
}

// A negative length from the native call is surfaced as an invalid character sequence in `source`.
if (realLen < 0)
{
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(source));
}

// NOTE(review): the two-line condition below contradicts the Debug.Assert at the top of the method
// and would leave valid forms without a return; it looks like the pre-change form check from the
// old ValidateArguments body shown by the diff rendering -- confirm.
if (normalizationForm != NormalizationForm.FormC && normalizationForm != NormalizationForm.FormD &&
normalizationForm != NormalizationForm.FormKC && normalizationForm != NormalizationForm.FormKD)
return realLen;
}

/// <summary>
/// Rejects normalization requests that cannot be serviced: FormKC/FormKD when running on browser or
/// WASI, and input for which HasInvalidUnicodeSequence returns true.
/// </summary>
/// <param name="strInput">The characters that are about to be normalized.</param>
/// <param name="normalizationForm">The requested normalization form.</param>
/// <param name="paramName">The parameter name reported in the ArgumentException; defaults to "strInput".</param>
/// <exception cref="PlatformNotSupportedException">FormKC or FormKD was requested on browser or WASI.</exception>
/// <exception cref="ArgumentException">HasInvalidUnicodeSequence returned true for <paramref name="strInput"/>.</exception>
/// <remarks>
/// NOTE(review): this text comes from a rendered diff without +/- markers; each branch below shows two
/// consecutive `throw` statements, of which the first looks like the pre-change line -- confirm
/// against the actual file.
/// </remarks>
private static void ValidateArguments(ReadOnlySpan<char> strInput, NormalizationForm normalizationForm, string paramName = "strInput")
{
if ((OperatingSystem.IsBrowser() || OperatingSystem.IsWasi()) && (normalizationForm == NormalizationForm.FormKC || normalizationForm == NormalizationForm.FormKD))
{
// NOTE(review): presumably the pre-change throw (it would make the next throw unreachable) -- confirm.
throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm));
// Browser's ICU doesn't contain data needed for FormKC and FormKD
throw new PlatformNotSupportedException(SR.Argument_UnsupportedNormalizationFormInBrowser);
}

if (HasInvalidUnicodeSequence(strInput))
{
// NOTE(review): presumably the pre-change throw; the following line reports the caller-supplied
// paramName instead of the local parameter name -- confirm.
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, paramName);
}
}

Expand All @@ -143,7 +216,7 @@ private static void ValidateArguments(string strInput, NormalizationForm normali
/// We walk the string ourselves looking for these bad sequences so we can continue to throw
/// ArgumentException in these cases.
/// </summary>
private static bool HasInvalidUnicodeSequence(string s)
private static bool HasInvalidUnicodeSequence(ReadOnlySpan<char> s)
{
for (int i = 0; i < s.Length; i++)
{
Expand Down
Loading

0 comments on commit a5af0ab

Please sign in to comment.