Skip to content

Commit

Permalink
Reduce allocations in string.Normalize (#34774)
Browse files Browse the repository at this point in the history
* Reduce allocations in string.Normalize

* Move stackalloc out of the loop

* Use char* instead of ref char, pin manually

* Mark methods unsafe instead of using unsafe blocks

* Return the original string if unchanged

* Use const for StackallocThreshold
  • Loading branch information
MihaZupan authored Apr 10, 2020
1 parent 1b683ec commit cd185d9
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 93 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ internal static partial class Interop
internal static partial class Globalization
{
[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_IsNormalized")]
internal static extern int IsNormalized(NormalizationForm normalizationForm, string src, int srcLen);
internal static extern unsafe int IsNormalized(NormalizationForm normalizationForm, char* src, int srcLen);

[DllImport(Libraries.GlobalizationNative, CharSet = CharSet.Unicode, EntryPoint = "GlobalizationNative_NormalizeString")]
internal static extern int NormalizeString(NormalizationForm normalizationForm, string src, int srcLen, [Out] char[] dstBuffer, int dstBufferCapacity);
internal static extern unsafe int NormalizeString(NormalizationForm normalizationForm, char* src, int srcLen, char* dstBuffer, int dstBufferCapacity);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@
// See the LICENSE file in the project root for more information.

using System.Runtime.InteropServices;
using System.Text;

internal static partial class Interop
{
internal static partial class Normaliz
{
[DllImport("Normaliz.dll", CharSet = CharSet.Unicode, SetLastError = true)]
internal static extern bool IsNormalizedString(int normForm, string source, int length);
internal static extern unsafe BOOL IsNormalizedString(NormalizationForm normForm, char* source, int length);

[DllImport("Normaliz.dll", CharSet = CharSet.Unicode, SetLastError = true)]
internal static extern int NormalizeString(
int normForm,
string source,
internal static extern unsafe int NormalizeString(
NormalizationForm normForm,
char* source,
int sourceLength,
[System.Runtime.InteropServices.OutAttribute]
char[]? destination,
char* destination,
int destinationLength);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;

namespace System.Globalization
{
internal static partial class Normalization
{
internal static bool IsNormalized(string strInput, NormalizationForm normalizationForm)
internal static unsafe bool IsNormalized(string strInput, NormalizationForm normalizationForm)
{
if (GlobalizationMode.Invariant)
{
Expand All @@ -20,7 +22,11 @@ internal static bool IsNormalized(string strInput, NormalizationForm normalizati

ValidateArguments(strInput, normalizationForm);

int ret = Interop.Globalization.IsNormalized(normalizationForm, strInput, strInput.Length);
int ret;
fixed (char* pInput = strInput)
{
ret = Interop.Globalization.IsNormalized(normalizationForm, pInput, strInput.Length);
}

if (ret == -1)
{
Expand All @@ -30,7 +36,7 @@ internal static bool IsNormalized(string strInput, NormalizationForm normalizati
return ret == 1;
}

internal static string Normalize(string strInput, NormalizationForm normalizationForm)
internal static unsafe string Normalize(string strInput, NormalizationForm normalizationForm)
{
if (GlobalizationMode.Invariant)
{
Expand All @@ -41,26 +47,62 @@ internal static string Normalize(string strInput, NormalizationForm normalizatio

ValidateArguments(strInput, normalizationForm);

char[] buf = new char[strInput.Length];

for (int attempts = 2; attempts > 0; attempts--)
char[]? toReturn = null;
try
{
int realLen = Interop.Globalization.NormalizeString(normalizationForm, strInput, strInput.Length, buf, buf.Length);
const int StackallocThreshold = 512;

if (realLen == -1)
Span<char> buffer = strInput.Length <= StackallocThreshold
? stackalloc char[StackallocThreshold]
: (toReturn = ArrayPool<char>.Shared.Rent(strInput.Length));

for (int attempt = 0; attempt < 2; attempt++)
{
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
int realLen;
fixed (char* pInput = strInput)
fixed (char* pDest = &MemoryMarshal.GetReference(buffer))
{
realLen = Interop.Globalization.NormalizeString(normalizationForm, pInput, strInput.Length, pDest, buffer.Length);
}

if (realLen == -1)
{
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
}

if (realLen <= buffer.Length)
{
ReadOnlySpan<char> result = buffer.Slice(0, realLen);
return result.SequenceEqual(strInput)
? strInput
: new string(result);
}

Debug.Assert(realLen > StackallocThreshold);

if (attempt == 0)
{
if (toReturn != null)
{
// Clear toReturn first to ensure we don't return the same buffer twice
char[] temp = toReturn;
toReturn = null;
ArrayPool<char>.Shared.Return(temp);
}

buffer = toReturn = ArrayPool<char>.Shared.Rent(realLen);
}
}

if (realLen <= buf.Length)
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
}
finally
{
if (toReturn != null)
{
return new string(buf, 0, realLen);
ArrayPool<char>.Shared.Return(toReturn);
}

buf = new char[realLen];
}

throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
}

// -----------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Buffers;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;
Expand All @@ -10,7 +11,7 @@ namespace System.Globalization
{
internal static partial class Normalization
{
internal static bool IsNormalized(string strInput, NormalizationForm normalizationForm)
internal static unsafe bool IsNormalized(string strInput, NormalizationForm normalizationForm)
{
if (GlobalizationMode.Invariant)
{
Expand All @@ -24,7 +25,11 @@ internal static bool IsNormalized(string strInput, NormalizationForm normalizati
// The only way to know if IsNormalizedString failed is through checking the Win32 last error
// IsNormalizedString pinvoke has SetLastError attribute property which will set the last error
// to 0 (ERROR_SUCCESS) before executing the calls.
bool result = Interop.Normaliz.IsNormalizedString((int)normalizationForm, strInput, strInput.Length);
Interop.BOOL result;
fixed (char* pInput = strInput)
{
result = Interop.Normaliz.IsNormalizedString(normalizationForm, pInput, strInput.Length);
}

int lastError = Marshal.GetLastWin32Error();
switch (lastError)
Expand All @@ -51,10 +56,10 @@ internal static bool IsNormalized(string strInput, NormalizationForm normalizati
throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError));
}

return result;
return result != Interop.BOOL.FALSE;
}

internal static string Normalize(string strInput, NormalizationForm normalizationForm)
internal static unsafe string Normalize(string strInput, NormalizationForm normalizationForm)
{
if (GlobalizationMode.Invariant)
{
Expand All @@ -65,84 +70,85 @@ internal static string Normalize(string strInput, NormalizationForm normalizatio

Debug.Assert(strInput != null);

// we depend on Win32 last error when calling NormalizeString
// NormalizeString pinvoke has SetLastError attribute property which will set the last error
// to 0 (ERROR_SUCCESS) before executing the calls.

// Guess our buffer size first
int iLength = Interop.Normaliz.NormalizeString((int)normalizationForm, strInput, strInput.Length, null, 0);
if (strInput.Length == 0)
{
return string.Empty;
}

int lastError = Marshal.GetLastWin32Error();
// Could have an error (actually it'd be quite hard to have an error here)
if ((lastError != Interop.Errors.ERROR_SUCCESS) || iLength < 0)
char[]? toReturn = null;
try
{
if (lastError == Interop.Errors.ERROR_INVALID_PARAMETER)
const int StackallocThreshold = 512;

Span<char> buffer = strInput.Length <= StackallocThreshold
? stackalloc char[StackallocThreshold]
: (toReturn = ArrayPool<char>.Shared.Rent(strInput.Length));

while (true)
{
if (normalizationForm != NormalizationForm.FormC &&
normalizationForm != NormalizationForm.FormD &&
normalizationForm != NormalizationForm.FormKC &&
normalizationForm != NormalizationForm.FormKD)
// we depend on Win32 last error when calling NormalizeString
// NormalizeString pinvoke has SetLastError attribute property which will set the last error
// to 0 (ERROR_SUCCESS) before executing the calls.
int realLength;
fixed (char* pInput = strInput)
fixed (char* pDest = &MemoryMarshal.GetReference(buffer))
{
throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm));
realLength = Interop.Normaliz.NormalizeString(normalizationForm, pInput, strInput.Length, pDest, buffer.Length);
}
int lastError = Marshal.GetLastWin32Error();

throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));
switch (lastError)
{
case Interop.Errors.ERROR_SUCCESS:
ReadOnlySpan<char> result = buffer.Slice(0, realLength);
return result.SequenceEqual(strInput)
? strInput
: new string(result);

// Do appropriate stuff for the individual errors:
case Interop.Errors.ERROR_INSUFFICIENT_BUFFER:
realLength = Math.Abs(realLength);
Debug.Assert(realLength > buffer.Length, "Buffer overflow should have iLength > cBuffer.Length");
if (toReturn != null)
{
// Clear toReturn first to ensure we don't return the same buffer twice
char[] temp = toReturn;
toReturn = null;
ArrayPool<char>.Shared.Return(temp);
}
Debug.Assert(realLength > StackallocThreshold);
buffer = toReturn = ArrayPool<char>.Shared.Rent(realLength);
continue;

case Interop.Errors.ERROR_INVALID_PARAMETER:
case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION:
if (normalizationForm != NormalizationForm.FormC &&
normalizationForm != NormalizationForm.FormD &&
normalizationForm != NormalizationForm.FormKC &&
normalizationForm != NormalizationForm.FormKD)
{
throw new ArgumentException(SR.Argument_InvalidNormalizationForm, nameof(normalizationForm));
}

// Illegal code point or order found. Ie: FFFE or D800 D800, etc.
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));

case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY:
throw new OutOfMemoryException();

default:
// We shouldn't get here...
throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError));
}
}

// We shouldn't really be able to get here..., guessing length is
// a trivial math function...
// Can't really be Out of Memory, but just in case:
if (lastError == Interop.Errors.ERROR_NOT_ENOUGH_MEMORY)
throw new OutOfMemoryException();

// Who knows what happened? Not us!
throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError));
}

// Don't break for empty strings (only possible for D & KD and not really possible at that)
if (iLength == 0) return string.Empty;

// Someplace to stick our buffer
char[] cBuffer;

while (true)
finally
{
// (re)allocation buffer and normalize string
cBuffer = new char[iLength];

// NormalizeString pinvoke has SetLastError attribute property which will set the last error
// to 0 (ERROR_SUCCESS) before executing the calls.
iLength = Interop.Normaliz.NormalizeString((int)normalizationForm, strInput, strInput.Length, cBuffer, cBuffer.Length);
lastError = Marshal.GetLastWin32Error();

if (lastError == Interop.Errors.ERROR_SUCCESS)
break;

// Could have an error (actually it'd be quite hard to have an error here)
switch (lastError)
if (toReturn != null)
{
// Do appropriate stuff for the individual errors:
case Interop.Errors.ERROR_INSUFFICIENT_BUFFER:
iLength = Math.Abs(iLength);
Debug.Assert(iLength > cBuffer.Length, "Buffer overflow should have iLength > cBuffer.Length");
continue;

case Interop.Errors.ERROR_INVALID_PARAMETER:
case Interop.Errors.ERROR_NO_UNICODE_TRANSLATION:
// Illegal code point or order found. Ie: FFFE or D800 D800, etc.
throw new ArgumentException(SR.Argument_InvalidCharSequenceNoIndex, nameof(strInput));

case Interop.Errors.ERROR_NOT_ENOUGH_MEMORY:
throw new OutOfMemoryException();

default:
// We shouldn't get here...
throw new InvalidOperationException(SR.Format(SR.UnknownError_Num, lastError));
ArrayPool<char>.Shared.Return(toReturn);
}
}

// Copy our buffer into our new string, which will be the appropriate size
return new string(cBuffer, 0, iLength);
}
}
}

0 comments on commit cd185d9

Please sign in to comment.