From 7eda9e05bfd1b43f03467928a618593a4a0ebe70 Mon Sep 17 00:00:00 2001 From: Matous Kozak Date: Thu, 11 Jul 2024 09:05:41 +0200 Subject: [PATCH] Revert "Revert "Add Avx512 support to IndexOfAnyAsciiSearcher (#103710)" (#104688)" This reverts commit a48a39fbb29aa745713c4f1028bcc9df84a5da71. --- .../SearchValues/IndexOfAnyAsciiSearcher.cs | 567 +++++++++++++++++- .../Strings/Helpers/AhoCorasick.cs | 2 +- 2 files changed, 538 insertions(+), 31 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs index bf06e365f40c2..ec5cfef5610fd 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs @@ -17,18 +17,36 @@ internal static class IndexOfAnyAsciiSearcher { public struct AsciiState(Vector128 bitmap, BitVector256 lookup) { - public Vector256 Bitmap = Vector256.Create(bitmap); + public Vector512 Bitmap512 = Vector512.Create(bitmap); public BitVector256 Lookup = lookup; + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public readonly Vector128 Bitmap128() => Bitmap512._lower._lower; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public readonly Vector256 Bitmap256() => Bitmap512._lower; + public readonly AsciiState CreateInverse() => - new AsciiState(~Bitmap._lower, Lookup.CreateInverse()); + new AsciiState(~Bitmap128(), Lookup.CreateInverse()); } public struct AnyByteState(Vector128 bitmap0, Vector128 bitmap1, BitVector256 lookup) { - public Vector256 Bitmap0 = Vector256.Create(bitmap0); - public Vector256 Bitmap1 = Vector256.Create(bitmap1); + public Vector512 Bitmap0_512 = Vector512.Create(bitmap0); + public Vector512 Bitmap1_512 = Vector512.Create(bitmap1); public BitVector256 Lookup = lookup; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public readonly Vector128 Bitmap0_128() => Bitmap0_512._lower._lower; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public readonly Vector128 Bitmap1_128() => Bitmap1_512._lower._lower; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public readonly Vector256 Bitmap0_256() => Bitmap0_512._lower; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public readonly Vector256 Bitmap1_256() => Bitmap1_512._lower; } internal static bool IsVectorizationSupported => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported; @@ -142,11 +160,11 @@ private static unsafe bool TryIndexOfAny(ref short searchSpace, int se { AsciiState state = default; - if (TryComputeBitmap(asciiValues, (byte*)&state.Bitmap._lower, out bool needleContainsZero)) + if (TryComputeBitmap(asciiValues, (byte*)&state.Bitmap512._lower._lower, out bool needleContainsZero)) { // Only initializing the bitmap here is okay as we can only get here if the search space is long enough // and we support vectorization, so the IndexOfAnyVectorized implementation will never touch state.Lookup. - state.Bitmap = Vector256.Create(state.Bitmap.GetLower()); + state.Bitmap512 = Vector512.Create(state.Bitmap128()); index = (Ssse3.IsSupported || PackedSimd.IsSupported) && needleContainsZero ? IndexOfAny(ref searchSpace, searchSpaceLength, ref state) @@ -169,11 +187,11 @@ private static unsafe bool TryLastIndexOfAny(ref short searchSpace, in { AsciiState state = default; - if (TryComputeBitmap(asciiValues, (byte*)&state.Bitmap._lower, out bool needleContainsZero)) + if (TryComputeBitmap(asciiValues, (byte*)&state.Bitmap512._lower._lower, out bool needleContainsZero)) { // Only initializing the bitmap here is okay as we can only get here if the search space is long enough // and we support vectorization, so the LastIndexOfAnyVectorized implementation will never touch state.Lookup. - state.Bitmap = Vector256.Create(state.Bitmap.GetLower()); + state.Bitmap512 = Vector512.Create(state.Bitmap128()); index = (Ssse3.IsSupported || PackedSimd.IsSupported) && needleContainsZero ? LastIndexOfAny(ref searchSpace, searchSpaceLength, ref state) @@ -237,9 +255,68 @@ private static TResult IndexOfAnyCore 2 * Vector128.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { - Vector256 bitmap256 = state.Bitmap; +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false + if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > 2 * Vector256.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough + { + Vector512 bitmap512 = state.Bitmap512; + + if (searchSpaceLength > 2 * Vector512.Count) + { + // Process the input in chunks of 64 characters (2 * Vector512). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512. + // As packing two Vector512s into a Vector512 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector512.Count)); + + do + { + Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); + Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); + + Vector512 result = IndexOfAnyLookup(source0, source1, bitmap512); + if (result != Vector512.Zero) + { + return TResultMapper.FirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); + } + + // We have 1-64 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector512.Count, "We expect that the input is long enough for us to load a whole vector."); + { + ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector512.Count); + + ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) + ? ref oneVectorAwayFromEnd + : ref currentSearchSpace; + + Vector512 source0 = Vector512.LoadUnsafe(ref firstVector); + Vector512 source1 = Vector512.LoadUnsafe(ref oneVectorAwayFromEnd); + + Vector512 result = IndexOfAnyLookup(source0, source1, bitmap512); + if (result != Vector512.Zero) + { + return TResultMapper.FirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + } + } - if (searchSpaceLength > 2 * Vector256.Count) + return TResultMapper.NotFound; + } + + Vector256 bitmap256 = state.Bitmap256(); + +#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false + if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > 2 * Vector256.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough +#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing { // Process the input in chunks of 32 characters (2 * Vector256). // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. @@ -288,7 +365,7 @@ private static TResult IndexOfAnyCore bitmap = state.Bitmap._lower; + Vector128 bitmap = state.Bitmap128(); #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) @@ -368,9 +445,64 @@ public static int LastIndexOfAny(ref short searchSpace if (Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { - Vector256 bitmap256 = state.Bitmap; + if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > 2 * Vector256.Count) + { + Vector512 bitmap512 = state.Bitmap512; + + if (searchSpaceLength > 2 * Vector512.Count) + { + // Process the input in chunks of 64 characters (2 * Vector512). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512. + // As packing two Vector512s into a Vector512 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector512.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector512.Count); + + Vector512 source0 = Vector512.LoadUnsafe(ref currentSearchSpace); + Vector512 source1 = Vector512.LoadUnsafe(ref currentSearchSpace, (nuint)Vector512.Count); + + Vector512 result = IndexOfAnyLookup(source0, source1, bitmap512); + if (result != Vector512.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } + } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); + } + + // We have 1-64 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector512.Count, "We expect that the input is long enough for us to load a whole vector."); + { + ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector512.Count); - if (searchSpaceLength > 2 * Vector256.Count) + ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector512.Count) + : ref searchSpace; + + Vector512 source0 = Vector512.LoadUnsafe(ref searchSpace); + Vector512 source1 = Vector512.LoadUnsafe(ref secondVector); + + Vector512 result = IndexOfAnyLookup(source0, source1, bitmap512); + if (result != Vector512.Zero) + { + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } + } + + return -1; + } + + Vector256 bitmap256 = state.Bitmap256(); + +#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer + if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > 2 * Vector256.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing { // Process the input in chunks of 32 characters (2 * Vector256). // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. @@ -419,7 +551,7 @@ public static int LastIndexOfAny(ref short searchSpace return -1; } - Vector128 bitmap = state.Bitmap._lower; + Vector128 bitmap = state.Bitmap128(); if (!Avx2.IsSupported && searchSpaceLength > 2 * Vector128.Count) { @@ -518,9 +650,62 @@ private static TResult IndexOfAnyCore(ref byte if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { - Vector256 bitmap256 = state.Bitmap; + if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > Vector256.Count) + { + Vector512 bitmap512 = state.Bitmap512; + + if (searchSpaceLength > Vector512.Count) + { + // Process the input in chunks of 64 bytes. + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector512.Count); + + do + { + Vector512 source = Vector512.LoadUnsafe(ref currentSearchSpace); - if (searchSpaceLength > Vector256.Count) + Vector512 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap512)); + if (result != Vector512.Zero) + { + return TResultMapper.FirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + } + + // We have 1-64 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a Vector256."); + { + ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); + + ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) + ? ref halfVectorAwayFromEnd + : ref currentSearchSpace; + + Vector256 source0 = Vector256.LoadUnsafe(ref firstVector); + Vector256 source1 = Vector256.LoadUnsafe(ref halfVectorAwayFromEnd); + Vector512 source = Vector512.Create(source0, source1); + + Vector512 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap512)); + if (result != Vector512.Zero) + { + return TResultMapper.FirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); + } + } + + return TResultMapper.NotFound; + } + + Vector256 bitmap256 = state.Bitmap256(); + +#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer + if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > Vector256.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing { // Process the input in chunks of 32 bytes. // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. @@ -567,7 +752,7 @@ private static TResult IndexOfAnyCore(ref byte return TResultMapper.NotFound; } - Vector128 bitmap = state.Bitmap._lower; + Vector128 bitmap = state.Bitmap128(); if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) { @@ -642,9 +827,62 @@ public static int LastIndexOfAny(ref byte searchSpace, int searchSpace if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { - Vector256 bitmap256 = state.Bitmap; + if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > Vector256.Count) + { + Vector512 bitmap512 = state.Bitmap512; + + if (searchSpaceLength > Vector512.Count) + { + // Process the input in chunks of 64 bytes. + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector512.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector512.Count); + + Vector512 source = Vector512.LoadUnsafe(ref currentSearchSpace); + + Vector512 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap512)); + if (result != Vector512.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } + } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); + } + + // We have 1-64 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a Vector256."); + { + ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); - if (searchSpaceLength > Vector256.Count) + ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count) + : ref searchSpace; + + Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref secondVector); + Vector512 source = Vector512.Create(source0, source1); + + Vector512 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap512)); + if (result != Vector512.Zero) + { + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } + } + + return -1; + } + + Vector256 bitmap256 = state.Bitmap256(); + +#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer + if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > Vector256.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing { // Process the input in chunks of 32 bytes. // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. @@ -691,7 +929,7 @@ public static int LastIndexOfAny(ref byte searchSpace, int searchSpace return -1; } - Vector128 bitmap = state.Bitmap._lower; + Vector128 bitmap = state.Bitmap128(); if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) { @@ -788,10 +1026,68 @@ private static TResult IndexOfAnyCore(ref byte if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { - Vector256 bitmap256_0 = state.Bitmap0; - Vector256 bitmap256_1 = state.Bitmap1; +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false + if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > Vector256.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough + { + Vector512 bitmap512_0 = state.Bitmap0_512; + Vector512 bitmap512_1 = state.Bitmap1_512; + + if (searchSpaceLength > Vector512.Count) + { + // Process the input in chunks of 64 bytes. + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector512.Count); + + do + { + Vector512 source = Vector512.LoadUnsafe(ref currentSearchSpace); + + Vector512 result = IndexOfAnyLookup(source, bitmap512_0, bitmap512_1); + if (result != Vector512.Zero) + { + return TResultMapper.FirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector512.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + } + + // We have 1-64 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a Vector256."); + { + ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); + + ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) + ? ref halfVectorAwayFromEnd + : ref currentSearchSpace; + + Vector256 source0 = Vector256.LoadUnsafe(ref firstVector); + Vector256 source1 = Vector256.LoadUnsafe(ref halfVectorAwayFromEnd); + Vector512 source = Vector512.Create(source0, source1); + + Vector512 result = IndexOfAnyLookup(source, bitmap512_0, bitmap512_1); + if (result != Vector512.Zero) + { + return TResultMapper.FirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); + } + } - if (searchSpaceLength > Vector256.Count) + return TResultMapper.NotFound; + } + + Vector256 bitmap256_0 = state.Bitmap0_256(); + Vector256 bitmap256_1 = state.Bitmap1_256(); + +#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false + if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > Vector256.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough +#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing { // Process the input in chunks of 32 bytes. // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. @@ -838,8 +1134,8 @@ private static TResult IndexOfAnyCore(ref byte return TResultMapper.NotFound; } - Vector128 bitmap0 = state.Bitmap0._lower; - Vector128 bitmap1 = state.Bitmap1._lower; + Vector128 bitmap0 = state.Bitmap0_128(); + Vector128 bitmap1 = state.Bitmap1_128(); #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) @@ -914,12 +1210,70 @@ public static int LastIndexOfAny(ref byte searchSpace, int searchSpace #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false if (Avx2.IsSupported && searchSpaceLength > Vector128.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough { +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false + if (Vector512.IsHardwareAccelerated && Avx512BW.IsSupported && searchSpaceLength > Vector256.Count) #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough - Vector256 bitmap256_0 = state.Bitmap0; - Vector256 bitmap256_1 = state.Bitmap1; + { + Vector512 bitmap512_0 = state.Bitmap0_512; + Vector512 bitmap512_1 = state.Bitmap1_512; + + if (searchSpaceLength > Vector512.Count) + { + // Process the input in chunks of 64 bytes. + // If the input length is a multiple of 64, don't consume the last 64 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector512.Count); - if (searchSpaceLength > Vector256.Count) + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector512.Count); + + Vector512 source = Vector512.LoadUnsafe(ref currentSearchSpace); + + Vector512 result = IndexOfAnyLookup(source, bitmap512_0, bitmap512_1); + if (result != Vector512.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } + } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); + } + + // We have 1-64 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a Vector256."); + { + ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); + + ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count) + : ref searchSpace; + + Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref secondVector); + Vector512 source = Vector512.Create(source0, source1); + + Vector512 result = IndexOfAnyLookup(source, bitmap512_0, bitmap512_1); + if (result != Vector512.Zero) + { + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } + } + + return -1; + } + + Vector256 bitmap256_0 = state.Bitmap0_256(); + Vector256 bitmap256_1 = state.Bitmap1_256(); + +#pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer +#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false + if (!(Vector512.IsHardwareAccelerated && Avx512BW.IsSupported) && searchSpaceLength > Vector256.Count) +#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough +#pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing { // Process the input in chunks of 32 bytes. // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. @@ -966,8 +1320,8 @@ public static int LastIndexOfAny(ref byte searchSpace, int searchSpace return -1; } - Vector128 bitmap0 = state.Bitmap0._lower; - Vector128 bitmap1 = state.Bitmap1._lower; + Vector128 bitmap0 = state.Bitmap0_128(); + Vector128 bitmap1 = state.Bitmap1_128(); #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false if (!Avx2.IsSupported && searchSpaceLength > Vector128.Count) @@ -1088,6 +1442,31 @@ private static Vector256 IndexOfAnyLookupCore(Vector256 source, Vect return result; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512BW))] + private static Vector512 IndexOfAnyLookup(Vector512 source0, Vector512 source1, Vector512 bitmapLookup) + where TNegator : struct, INegator + where TOptimizations : struct, IOptimizations + { + Vector512 source = TOptimizations.PackSources(source0.AsUInt16(), source1.AsUInt16()); + + Vector512 result = IndexOfAnyLookupCore(source, bitmapLookup); + + return TNegator.NegateIfNeeded(result); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512BW))] + private static Vector512 IndexOfAnyLookupCore(Vector512 source, Vector512 bitmapLookup) + { + // See comments in IndexOfAnyLookupCore(Vector128) above for more details. + Vector512 highNibbles = source >>> 4; + Vector512 bitMask = Avx512BW.Shuffle(bitmapLookup, source); + Vector512 bitPositions = Avx512BW.Shuffle(Vector512.Create(0x8040201008040201).AsByte(), highNibbles); + Vector512 result = bitMask & bitPositions; + return result; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(Ssse3))] [CompExactlyDependsOn(typeof(AdvSimd))] @@ -1136,6 +1515,29 @@ private static Vector256 IndexOfAnyLookup(Vector256 source return TNegator.NegateIfNeeded(result); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512BW))] + private static Vector512 IndexOfAnyLookup(Vector512 source, Vector512 bitmapLookup0, Vector512 bitmapLookup1) + where TNegator : struct, INegator + { + // http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm + + Vector512 lowNibbles = source & Vector512.Create((byte)0xF); + Vector512 highNibbles = source >>> 4; + + Vector512 row0 = Avx512BW.Shuffle(bitmapLookup0, lowNibbles); + Vector512 row1 = Avx512BW.Shuffle(bitmapLookup1, lowNibbles); + + Vector512 bitmask = Avx512BW.Shuffle(Vector512.Create(0x8040201008040201).AsByte(), highNibbles); + + Vector512 mask = Vector512.GreaterThan(highNibbles.AsSByte(), Vector512.Create((sbyte)0x7)).AsByte(); + Vector512 bitsets = Vector512.ConditionalSelect(mask, row1, row0); + + Vector512 result = Vector512.Equals(bitsets & bitmask, bitmask); + + return TNegator.NegateIfNeeded(result); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe int ComputeLastIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator @@ -1198,13 +1600,53 @@ private static unsafe int ComputeLastIndexOverlapped(ref T searchSp return offsetInVector - Vector256.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(T)); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512F))] + private static unsafe int ComputeLastIndex(ref T searchSpace, ref T current, Vector512 result) + where TNegator : struct, INegator + { + if (typeof(T) == typeof(short)) + { + result = PackedSpanHelpers.FixUpPackedVector512Result(result); + } + + ulong mask = TNegator.ExtractMask(result); + + int offsetInVector = 63 - BitOperations.LeadingZeroCount(mask); + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512F))] + private static unsafe int ComputeLastIndexOverlapped(ref T searchSpace, ref T secondVector, Vector512 result) + where TNegator : struct, INegator + { + if (typeof(T) == typeof(short)) + { + result = PackedSpanHelpers.FixUpPackedVector512Result(result); + } + + ulong mask = TNegator.ExtractMask(result); + + int offsetInVector = 63 - BitOperations.LeadingZeroCount(mask); + if (offsetInVector < Vector512.Count) + { + return offsetInVector; + } + + // We matched within the second vector + return offsetInVector - Vector512.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(T)); + } + internal interface INegator { static abstract bool NegateIfNeeded(bool result); static abstract Vector128 NegateIfNeeded(Vector128 result); static abstract Vector256 NegateIfNeeded(Vector256 result); + static abstract Vector512 NegateIfNeeded(Vector512 result); static abstract uint ExtractMask(Vector128 result); static abstract uint ExtractMask(Vector256 result); + static abstract ulong ExtractMask(Vector512 result); } internal readonly struct DontNegate : INegator @@ -1212,10 +1654,13 @@ internal interface INegator public static bool NegateIfNeeded(bool result) => result; public static Vector128 NegateIfNeeded(Vector128 result) => result; public static Vector256 NegateIfNeeded(Vector256 result) => result; + public static Vector512 NegateIfNeeded(Vector512 result) => result; [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint ExtractMask(Vector128 result) => ~Vector128.Equals(result, Vector128.Zero).ExtractMostSignificantBits(); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint ExtractMask(Vector256 result) => ~Vector256.Equals(result, Vector256.Zero).ExtractMostSignificantBits(); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ulong ExtractMask(Vector512 result) => ~Vector512.Equals(result, Vector512.Zero).ExtractMostSignificantBits(); } internal readonly struct Negate : INegator @@ -1228,9 +1673,13 @@ internal interface INegator [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 NegateIfNeeded(Vector256 result) => Vector256.Equals(result, Vector256.Zero); [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector512 NegateIfNeeded(Vector512 result) => Vector512.Equals(result, Vector512.Zero); + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint ExtractMask(Vector128 result) => result.ExtractMostSignificantBits(); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static uint ExtractMask(Vector256 result) => result.ExtractMostSignificantBits(); + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static ulong ExtractMask(Vector512 result) => result.ExtractMostSignificantBits(); } internal interface IOptimizations @@ -1243,6 +1692,7 @@ internal interface IOptimizations // - All values result in min(value, 255) static abstract Vector128 PackSources(Vector128 lower, Vector128 upper); static abstract Vector256 PackSources(Vector256 lower, Vector256 upper); + static abstract Vector512 PackSources(Vector512 lower, Vector512 upper); } internal readonly struct Ssse3AndWasmHandleZeroInNeedle : IOptimizations @@ -1270,6 +1720,16 @@ public static Vector256 PackSources(Vector256 lower, Vector256 PackSources(Vector512 lower, Vector512 upper) + { + return Avx512BW.PackUnsignedSaturate( + Vector512.Min(lower, Vector512.Create((ushort)255)).AsInt16(), + Vector512.Min(upper, Vector512.Create((ushort)255)).AsInt16()); + } } internal readonly struct Default : IOptimizations @@ -1292,6 +1752,13 @@ public static Vector256 PackSources(Vector256 lower, Vector256 PackSources(Vector512 lower, Vector512 upper) + { + return Avx512BW.PackUnsignedSaturate(lower.AsInt16(), upper.AsInt16()); + } } private interface IResultMapper @@ -1302,8 +1769,10 @@ private interface IResultMapper static abstract TResult ScalarResult(ref T searchSpace, ref T current); static abstract TResult FirstIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator; static abstract TResult FirstIndex(ref T searchSpace, ref T current, Vector256 result) where TNegator : struct, INegator; + static abstract TResult FirstIndex(ref T searchSpace, ref T current, Vector512 result) where TNegator : struct, INegator; static abstract TResult FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector128 result) where TNegator : struct, INegator; static abstract TResult FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector256 result) where TNegator : struct, INegator; + static abstract TResult FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector512 result) where TNegator : struct, INegator; } private readonly struct ContainsAnyResultMapper : IResultMapper @@ -1313,8 +1782,10 @@ private interface IResultMapper public static bool ScalarResult(ref T searchSpace, ref T current) => true; public static bool FirstIndex(ref T searchSpace, ref T current, Vector128 result) where TNegator : struct, INegator => true; public static bool FirstIndex(ref T searchSpace, ref T current, Vector256 result) where TNegator : struct, INegator => true; + public static bool FirstIndex(ref T searchSpace, ref T current, Vector512 result) where TNegator : struct, INegator => true; public static bool FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector128 result) where TNegator : struct, INegator => true; public static bool FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector256 result) where TNegator : struct, INegator => true; + public static bool FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector512 result) where TNegator : struct, INegator => true; } private readonly unsafe struct IndexOfAnyResultMapper : IResultMapper @@ -1350,6 +1821,21 @@ public static int FirstIndex(ref T searchSpace, ref T current, Vector2 return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512F))] + public static int FirstIndex(ref T searchSpace, ref T current, Vector512 result) where TNegator : struct, INegator + { + if (typeof(T) == typeof(short)) + { + result = PackedSpanHelpers.FixUpPackedVector512Result(result); + } + + ulong mask = TNegator.ExtractMask(result); + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector128 result) where TNegator : struct, INegator { @@ -1384,6 +1870,27 @@ public static int FirstIndexOverlapped(ref T searchSpace, ref T curren } return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / (nuint)sizeof(T)); } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + [CompExactlyDependsOn(typeof(Avx512F))] + public static int FirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector512 result) where TNegator : struct, INegator + { + if (typeof(T) == typeof(short)) + { + result = PackedSpanHelpers.FixUpPackedVector512Result(result); + } + + ulong mask = TNegator.ExtractMask(result); + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + if (offsetInVector >= Vector512.Count) + { + // We matched within the second vector + current0 = ref current1; + offsetInVector -= Vector512.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / (nuint)sizeof(T)); + } } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/AhoCorasick.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/AhoCorasick.cs index ebc94616ae642..777251124cbbe 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/AhoCorasick.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/Helpers/AhoCorasick.cs @@ -30,7 +30,7 @@ public readonly bool ShouldUseAsciiFastScan { get { - if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && _startingAsciiChars.Bitmap != default) + if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && _startingAsciiChars.Bitmap128() != default) { // If there are a lot of starting characters such that we often find one early, // the ASCII fast scan may end up performing worse than checking one character at a time.