From fa1164c327052042120bf0d3b6fc81e86cfab69d Mon Sep 17 00:00:00 2001 From: SwapnilGaikwad Date: Sun, 14 Apr 2024 05:00:45 +0100 Subject: [PATCH] Use multi-reg load/store for DecodeFromUTF8 (#100589) * Use multi-reg load/store for DecodeFromUTF8 * Address review comments --- .../src/System/Buffers/Text/Base64Decoder.cs | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs index 1ad2cf9faa9f3..16ff227379d1a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Buffers/Text/Base64Decoder.cs @@ -90,6 +90,17 @@ private static unsafe OperationStatus DecodeFromUtf8(ReadOnlySpan utf8, Sp } } + end = srcMax - 66; + if (AdvSimd.Arm64.IsSupported && (end >= src)) + { + AdvSimdDecode(ref src, ref dest, end, maxSrcLength, destLength, srcBytes, destBytes); + + if (src == srcEnd) + { + goto DoneExit; + } + } + end = srcMax - 24; if ((Ssse3.IsSupported || AdvSimd.Arm64.IsSupported) && BitConverter.IsLittleEndian && (end >= src)) { @@ -844,6 +855,107 @@ private static Vector128 SimdShuffle(Vector128 left, Vector128 return Vector128.ShuffleUnsafe(left, right); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] + private static unsafe void AdvSimdDecode(ref byte* srcBytes, ref byte* destBytes, byte* srcEnd, int sourceLength, int destLength, byte* srcStart, byte* destStart) + { + // C# implementation of https://github.com/aklomp/base64/blob/3a5add8652076612a8407627a42c768736a4263f/lib/arch/neon64/dec_loop.c + // If we have AdvSimd support, pick off 64 bytes at a time for as long as we can, + // but make sure that we quit before seeing any == markers at the end of the + // string. 64 + 2 = 66 bytes. + + Vector128 decLutOne1 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); + Vector128 decLutOne2 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF).AsByte(); + Vector128 decLutOne3 = Vector128.Create(0xFFFFFFFF, 0xFFFFFFFF, 0x3EFFFFFF, 0x3FFFFFFF).AsByte(); + Vector128 decLutOne4 = Vector128.Create(0x37363534, 0x3B3A3938, 0xFFFF3D3C, 0xFFFFFFFF).AsByte(); + Vector128 decLutTwo1 = Vector128.Create(0x0100FF00, 0x05040302, 0x09080706, 0x0D0C0B0A).AsByte(); + Vector128 decLutTwo2 = Vector128.Create(0x11100F0E, 0x15141312, 0x19181716, 0xFFFFFFFF).AsByte(); + Vector128 decLutTwo3 = Vector128.Create(0x1B1AFFFF, 0x1F1E1D1C, 0x23222120, 0x27262524).AsByte(); + Vector128 decLutTwo4 = Vector128.Create(0x2B2A2928, 0x2F2E2D2C, 0x33323130, 0xFFFFFFFF).AsByte(); + + Vector128 decOne1; + Vector128 decOne2; + Vector128 decOne3; + Vector128 decOne4; + Vector128 decTwo1; + Vector128 decTwo2; + Vector128 decTwo3; + Vector128 decTwo4; + Vector128 str1; + Vector128 str2; + Vector128 str3; + Vector128 str4; + Vector128 res1; + Vector128 res2; + Vector128 res3; + + byte* src = srcBytes; + byte* dest = destBytes; + Vector128 offset = AdvSimd.DuplicateToVector128((byte)0x3F); + var decLutOne = (decLutOne1, decLutOne2, decLutOne3, decLutOne4); + var decLutTwo = (decLutTwo1, decLutTwo2, decLutTwo3, decLutTwo4); + + do + { + // Load 64 bytes and de-interleave. + AssertRead>(src, srcStart, sourceLength); + (str1, str2, str3, str4) = AdvSimd.Arm64.LoadVector128x4AndUnzip(src); + + // Get indices for second LUT: + decTwo1 = AdvSimd.SubtractSaturate(str1, offset); + decTwo2 = AdvSimd.SubtractSaturate(str2, offset); + decTwo3 = AdvSimd.SubtractSaturate(str3, offset); + decTwo4 = AdvSimd.SubtractSaturate(str4, offset); + + // Get values from first LUT. Out-of-range indices are set to 0. + decOne1 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str1); + decOne2 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str2); + decOne3 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str3); + decOne4 = AdvSimd.Arm64.VectorTableLookup(decLutOne, str4); + + // Get values from second LUT. Out-of-range indices are unchanged. + decTwo1 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo1, decLutTwo, decTwo1); + decTwo2 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo2, decLutTwo, decTwo2); + decTwo3 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo3, decLutTwo, decTwo3); + decTwo4 = AdvSimd.Arm64.VectorTableLookupExtension(decTwo4, decLutTwo, decTwo4); + + // Invalid values are set to 255 during above look-ups using 'decLutTwo' and 'decLutTwo'. + // Thus the intermediate results 'decOne' and 'decTwo' could be OR-ed to get final values. + str1 = decOne1 | decTwo1; + str2 = decOne2 | decTwo2; + str3 = decOne3 | decTwo3; + str4 = decOne4 | decTwo4; + + // Check for invalid input, any value larger than 63. + Vector128 classified = AdvSimd.CompareGreaterThan(str1, offset) + | AdvSimd.CompareGreaterThan(str2, offset) + | AdvSimd.CompareGreaterThan(str3, offset) + | AdvSimd.CompareGreaterThan(str4, offset); + + // Check that all bits are zero. + if (classified != Vector128.Zero) + { + break; + } + + // Compress four bytes into three. + res1 = AdvSimd.ShiftLeftLogical(str1, 2) | AdvSimd.ShiftRightLogical(str2, 4); + res2 = AdvSimd.ShiftLeftLogical(str2, 4) | AdvSimd.ShiftRightLogical(str3, 2); + res3 = AdvSimd.ShiftLeftLogical(str3, 6) | str4; + + // Interleave and store decoded result. + AssertWrite>(dest, destStart, destLength); + AdvSimd.Arm64.StoreVector128x3AndZip(dest, (res1, res2, res3)); + + src += 64; + dest += 48; + } + while (src <= srcEnd); + + srcBytes = src; + destBytes = dest; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] [CompExactlyDependsOn(typeof(Ssse3))]