From a7bb34a14fe4e5caed115d3ca8855bf25f767c2c Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Fri, 22 Mar 2024 12:02:30 -0700 Subject: [PATCH] Switch to ISimdVector and Align WidenAsciiToUtf16. --- .../src/System/Text/Ascii.Utility.cs | 152 ++++++++++-------- 1 file changed, 84 insertions(+), 68 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index ba07b985b67058..824c51e9a028be 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -2038,79 +2038,17 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) { - ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; - - if (Vector512.IsHardwareAccelerated && elementCount >= (uint)Vector512.Count) + if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512.Count) { - // Calculating the destination address outside the loop results in significant - // perf wins vs. relying on the JIT to fold memory addressing logic into the - // write instructions. 
See: https://github.com/dotnet/runtime/issues/33002 - nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector512.Count; - - do - { - Vector512 asciiVector = Vector512.Load(pAsciiBuffer + currentOffset); - - if (asciiVector.ExtractMostSignificantBits() != 0) - { - break; - } - - (Vector512 utf16LowVector, Vector512 utf16HighVector) = Vector512.Widen(asciiVector); - utf16LowVector.Store(pCurrentWriteAddress); - utf16HighVector.Store(pCurrentWriteAddress + Vector512.Count); - - currentOffset += (nuint)Vector512.Count; - pCurrentWriteAddress += (nuint)Vector512.Count; - } while (currentOffset <= finalOffsetWhereCanRunLoop); + WidenAsciiToUtf1_Vector, Vector512>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); } - else if (Vector256.IsHardwareAccelerated && elementCount >= (uint)Vector256.Count) + else if (Vector256.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector256.Count) { - // Calculating the destination address outside the loop results in significant - // perf wins vs. relying on the JIT to fold memory addressing logic into the - // write instructions. 
See: https://github.com/dotnet/runtime/issues/33002 - nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector256.Count; - - do - { - Vector256 asciiVector = Vector256.Load(pAsciiBuffer + currentOffset); - - if (asciiVector.ExtractMostSignificantBits() != 0) - { - break; - } - - (Vector256 utf16LowVector, Vector256 utf16HighVector) = Vector256.Widen(asciiVector); - utf16LowVector.Store(pCurrentWriteAddress); - utf16HighVector.Store(pCurrentWriteAddress + Vector256.Count); - - currentOffset += (nuint)Vector256.Count; - pCurrentWriteAddress += (nuint)Vector256.Count; - } while (currentOffset <= finalOffsetWhereCanRunLoop); + WidenAsciiToUtf1_Vector, Vector256>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); } - else + else if (Vector128.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector128.Count) { - // Calculating the destination address outside the loop results in significant - // perf wins vs. relying on the JIT to fold memory addressing logic into the - // write instructions. 
See: https://github.com/dotnet/runtime/issues/33002 - nuint finalOffsetWhereCanRunLoop = elementCount - (uint)Vector128.Count; - - do - { - Vector128 asciiVector = Vector128.Load(pAsciiBuffer + currentOffset); - - if (VectorContainsNonAsciiChar(asciiVector)) - { - break; - } - - (Vector128 utf16LowVector, Vector128 utf16HighVector) = Vector128.Widen(asciiVector); - utf16LowVector.Store(pCurrentWriteAddress); - utf16HighVector.Store(pCurrentWriteAddress + Vector128.Count); - - currentOffset += (nuint)Vector128.Count; - pCurrentWriteAddress += (nuint)Vector128.Count; - } while (currentOffset <= finalOffsetWhereCanRunLoop); + WidenAsciiToUtf1_Vector, Vector128>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); } } @@ -2212,6 +2150,84 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B goto Finish; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe void WidenAsciiToUtf1_Vector(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount) + where TVectorByte : unmanaged, ISimdVector + where TVectorUShort : unmanaged, ISimdVector + { + ushort* pCurrentWriteAddress = (ushort*)pUtf16Buffer; + // Calculating the destination address outside the loop results in significant + // perf wins vs. relying on the JIT to fold memory addressing logic into the + // write instructions. 
See: https://github.com/dotnet/runtime/issues/33002 + nuint finalOffsetWhereCanRunLoop = elementCount - (nuint)TVectorByte.Count; + TVectorByte asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset); + if (!HasMatch(asciiVector)) + { + (TVectorUShort utf16LowVector, TVectorUShort utf16HighVector) = Widen(asciiVector); + utf16LowVector.Store(pCurrentWriteAddress); + utf16HighVector.Store(pCurrentWriteAddress + TVectorUShort.Count); + pCurrentWriteAddress += (nuint)(TVectorUShort.Count * 2); + if (((int)pCurrentWriteAddress & 1) == 0) + { + // Bump write buffer up to the next aligned boundary + pCurrentWriteAddress = (ushort*)((nuint)pCurrentWriteAddress & ~(nuint)(TVectorUShort.Alignment - 1)); + nuint numBytesWritten = (nuint)pCurrentWriteAddress - (nuint)pUtf16Buffer; + currentOffset += (nuint)numBytesWritten / 2; + } + else + { + // If input isn't char aligned, we won't be able to align it to a Vector + currentOffset += (nuint)TVectorByte.Count; + } + while (currentOffset <= finalOffsetWhereCanRunLoop) + { + asciiVector = TVectorByte.Load(pAsciiBuffer + currentOffset); + if (HasMatch(asciiVector)) + { + break; + } + (utf16LowVector, utf16HighVector) = Widen(asciiVector); + utf16LowVector.StoreAligned(pCurrentWriteAddress); + utf16HighVector.StoreAligned(pCurrentWriteAddress + TVectorUShort.Count); + currentOffset += (nuint)TVectorByte.Count; + pCurrentWriteAddress += (nuint)(TVectorUShort.Count * 2); + } + } + return; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe bool HasMatch(TVectorByte vector) + where TVectorByte : unmanaged, ISimdVector + { + if (AdvSimd.IsSupported && typeof(TVectorByte) == typeof(Vector128)) + { + return VectorContainsNonAsciiChar((Vector128)(object)vector); + } + return ((vector & TVectorByte.Create((byte)0b1000_0000)) != TVectorByte.Zero); + } + + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe (TVectorUShort Lower, TVectorUShort Upper) Widen(TVectorByte vector) + 
where TVectorByte : unmanaged, ISimdVector + where TVectorUShort : unmanaged, ISimdVector + { + if (typeof(TVectorByte) == typeof(Vector256)) + { + (Vector256 Lower256, Vector256 Upper256) = Vector256.Widen((Vector256)(object)vector); + return ((TVectorUShort)(object)Lower256, (TVectorUShort)(object)Upper256); + } + else if (typeof(TVectorByte) == typeof(Vector512)) + { + (Vector512 Lower512, Vector512 Upper512) = Vector512.Widen((Vector512)(object)vector); + return ((TVectorUShort)(object)Lower512, (TVectorUShort)(object)Upper512); + } + (Vector128 Lower128, Vector128 Upper128) = Vector128.Widen((Vector128)(object)vector); + return ((TVectorUShort)(object)Lower128, (TVectorUShort)(object)Upper128); + } + + /// /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and /// writes them to the output buffer with machine endianness.