From 56059b65a555c8cc41d0a3653d78d4f6137e089f Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Wed, 30 Nov 2022 05:43:36 +0100 Subject: [PATCH] Add AVX2 support to IndexOfAnyValues --- .../tests/Span/IndexOfAnyValues.cs | 2 +- .../IndexOfAnyAsciiSearcher.cs | 668 +++++++++++++++--- 2 files changed, 577 insertions(+), 93 deletions(-) diff --git a/src/libraries/System.Memory/tests/Span/IndexOfAnyValues.cs b/src/libraries/System.Memory/tests/Span/IndexOfAnyValues.cs index 8a4266d76cb36..511330b0d51c9 100644 --- a/src/libraries/System.Memory/tests/Span/IndexOfAnyValues.cs +++ b/src/libraries/System.Memory/tests/Span/IndexOfAnyValues.cs @@ -360,7 +360,7 @@ static int LastIndexOfAnyExceptReferenceImpl(ReadOnlySpan searchSpace, Rea private static class IndexOfAnyValuesTestHelper { private const int MaxNeedleLength = 10; - private const int MaxHaystackLength = 40; + private const int MaxHaystackLength = 100; private static readonly char[] s_randomAsciiChars; private static readonly char[] s_randomLatin1Chars; diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs index dccbddb6676ff..85ef35e7046a0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers.Binary; using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; @@ -171,28 +172,83 @@ internal static int IndexOfAnyVectorized(ref short sea if (searchSpaceLength > 2 * Vector128.Count) { - // Process the input in chunks of 16 characters (2 * Vector128). - // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128. - // As packing two Vector128s into a Vector128 is cheap compared to the lookup, we can effectively double the throughput. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector128.Count)); - - do + if (Avx2.IsSupported) { - Vector128 source0 = Vector128.LoadUnsafe(ref currentSearchSpace); - Vector128 source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128.Count); + Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); - Vector128 result = IndexOfAnyLookup(source0, source1, bitmap); - if (result != Vector128.Zero) + if (searchSpaceLength > 2 * Vector256.Count) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + // Process the input in chunks of 32 characters (2 * Vector256). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. + // As packing two Vector256s into a Vector256 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector256.Count)); + + do + { + Vector256 source0 = Vector256.LoadUnsafe(ref currentSearchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256.Count); + + Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); + if (result != Vector256.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); } - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector128.Count); + // We have 1-32 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a whole vector."); + { + ref short oneVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); + + ref short firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAwayFromEnd) + ? ref oneVectorAwayFromEnd + : ref currentSearchSpace; + + Vector256 source0 = Vector256.LoadUnsafe(ref firstVector); + Vector256 source1 = Vector256.LoadUnsafe(ref oneVectorAwayFromEnd); + + Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); + if (result != Vector256.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + } + } + + return -1; + } + else + { + // Process the input in chunks of 16 characters (2 * Vector128). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128. + // As packing two Vector128s into a Vector128 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref short twoVectorsAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - (2 * Vector128.Count)); + + do + { + Vector128 source0 = Vector128.LoadUnsafe(ref currentSearchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128.Count); + + Vector128 result = IndexOfAnyLookup(source0, source1, bitmap); + if (result != Vector128.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector128.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref twoVectorsAwayFromEnd)); } // We have 1-16 characters remaining. Process the first and last vector in the search space. @@ -226,28 +282,83 @@ internal static int LastIndexOfAnyVectorized(ref short if (searchSpaceLength > 2 * Vector128.Count) { - // Process the input in chunks of 16 characters (2 * Vector128). - // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128. - // As packing two Vector128s into a Vector128 is cheap compared to the lookup, we can effectively double the throughput. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector128.Count); - - do + if (Avx2.IsSupported) { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector128.Count); + Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); + + if (searchSpaceLength > 2 * Vector256.Count) + { + // Process the input in chunks of 32 characters (2 * Vector256). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256. + // As packing two Vector256s into a Vector256 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector256.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector256.Count); + + Vector256 source0 = Vector256.LoadUnsafe(ref currentSearchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref currentSearchSpace, (nuint)Vector256.Count); + + Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); + if (result != Vector256.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } + } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); + } + + // We have 1-32 characters remaining. Process the first and last vector in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector256.Count, "We expect that the input is long enough for us to load a whole vector."); + { + ref short oneVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); - Vector128 source0 = Vector128.LoadUnsafe(ref currentSearchSpace); - Vector128 source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128.Count); + ref short secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref oneVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count) + : ref searchSpace; - Vector128 result = IndexOfAnyLookup(source0, source1, bitmap); - if (result != Vector128.Zero) + Vector256 source0 = Vector256.LoadUnsafe(ref searchSpace); + Vector256 source1 = Vector256.LoadUnsafe(ref secondVector); + + Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); + if (result != Vector256.Zero) + { + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } + } + + return -1; + } + else + { + // Process the input in chunks of 16 characters (2 * Vector128). + // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector128. + // As packing two Vector128s into a Vector128 is cheap compared to the lookup, we can effectively double the throughput. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref short twoVectorsAfterStart = ref Unsafe.Add(ref searchSpace, 2 * Vector128.Count); + + do { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, 2 * Vector128.Count); + + Vector128 source0 = Vector128.LoadUnsafe(ref currentSearchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref currentSearchSpace, (nuint)Vector128.Count); + + Vector128 result = IndexOfAnyLookup(source0, source1, bitmap); + if (result != Vector128.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); } // We have 1-16 characters remaining. Process the first and last vector in the search space. @@ -281,32 +392,85 @@ internal static int IndexOfAnyVectorized(ref byte sear if (searchSpaceLength > Vector128.Count) { - // Process the input in chunks of 16 bytes. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); - - do + if (Avx2.IsSupported) { - Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); + + if (searchSpaceLength > Vector256.Count) + { + // Process the input in chunks of 32 bytes. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); + + do + { + Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); + + Vector256 result = IndexOfAnyLookup(source, bitmap256); + if (result != Vector256.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + } - Vector128 result = IndexOfAnyLookup(source, bitmap); - if (result != Vector128.Zero) + // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); + + ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) + ? ref halfVectorAwayFromEnd + : ref currentSearchSpace; + + Vector128 source0 = Vector128.LoadUnsafe(ref firstVector); + Vector128 source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd); + Vector256 source = Vector256.Create(source0, source1); + + Vector256 result = IndexOfAnyLookup(source, bitmap256); + if (result != Vector256.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); + } } - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128.Count); + return -1; + } + else + { + // Process the input in chunks of 16 bytes. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); + + do + { + Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + + Vector128 result = IndexOfAnyLookup(source, bitmap); + if (result != Vector128.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); } // We have 1-16 bytes remaining. Process the first and last half vectors in the search space. // They may overlap, but we'll handle that in the index calculation if we do get a match. Debug.Assert(searchSpaceLength >= sizeof(ulong), "We expect that the input is long enough for us to load a ulong."); { - ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count / 2); + ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - sizeof(ulong)); ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) ? ref halfVectorAwayFromEnd @@ -334,35 +498,88 @@ internal static int LastIndexOfAnyVectorized(ref byte if (searchSpaceLength > Vector128.Count) { - // Process the input in chunks of 16 bytes. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); - - do + if (Avx2.IsSupported) { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count); + Vector256 bitmap256 = Vector256.Create(bitmap, bitmap); - Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + if (searchSpaceLength > Vector256.Count) + { + // Process the input in chunks of 32 bytes. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count); + + Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); + + Vector256 result = IndexOfAnyLookup(source, bitmap256); + if (result != Vector256.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } + } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); + } - Vector128 result = IndexOfAnyLookup(source, bitmap); - if (result != Vector128.Zero) + // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + + ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count) + : ref searchSpace; + + Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref secondVector); + Vector256 source = Vector256.Create(source0, source1); + + Vector256 result = IndexOfAnyLookup(source, bitmap256); + if (result != Vector256.Zero) + { + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } } + + return -1; + } + else + { + // Process the input in chunks of 16 bytes. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count); + + Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + + Vector128 result = IndexOfAnyLookup(source, bitmap); + if (result != Vector128.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } + } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } // We have 1-16 bytes remaining. Process the first and last half vectors in the search space. // They may overlap, but we'll handle that in the index calculation if we do get a match. Debug.Assert(searchSpaceLength >= sizeof(ulong), "We expect that the input is long enough for us to load a ulong."); { - ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count / 2); + ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, sizeof(ulong)); ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) - ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count) + ? ref Unsafe.Subtract(ref currentSearchSpace, sizeof(ulong)) : ref searchSpace; ulong source0 = Unsafe.ReadUnaligned(ref searchSpace); @@ -386,32 +603,86 @@ internal static int IndexOfAnyVectorized(ref byte searchSpace, int sea if (searchSpaceLength > Vector128.Count) { - // Process the input in chunks of 16 bytes. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". - ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); - - do + if (Avx2.IsSupported) { - Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + Vector256 bitmap256_0 = Vector256.Create(bitmap0, bitmap0); + Vector256 bitmap256_1 = Vector256.Create(bitmap1, bitmap1); + + if (searchSpaceLength > Vector256.Count) + { + // Process the input in chunks of 32 bytes. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector256.Count); + + do + { + Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); + + Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); + if (result != Vector256.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); + } - Vector128 result = IndexOfAnyLookup(source, bitmap0, bitmap1); - if (result != Vector128.Zero) + // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); + + ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) + ? ref halfVectorAwayFromEnd + : ref currentSearchSpace; + + Vector128 source0 = Vector128.LoadUnsafe(ref firstVector); + Vector128 source1 = Vector128.LoadUnsafe(ref halfVectorAwayFromEnd); + Vector256 source = Vector256.Create(source0, source1); + + Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); + if (result != Vector256.Zero) + { + return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); + } } - currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128.Count); + return -1; + } + else + { + // Process the input in chunks of 16 bytes. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan". + ref byte vectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count); + + do + { + Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + + Vector128 result = IndexOfAnyLookup(source, bitmap0, bitmap1); + if (result != Vector128.Zero) + { + return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + } + + currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector128.Count); + } + while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); } - while (Unsafe.IsAddressLessThan(ref currentSearchSpace, ref vectorAwayFromEnd)); } // We have 1-16 bytes remaining. Process the first and last half vectors in the search space. // They may overlap, but we'll handle that in the index calculation if we do get a match. Debug.Assert(searchSpaceLength >= sizeof(ulong), "We expect that the input is long enough for us to load a ulong."); { - ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - Vector128.Count / 2); + ref byte halfVectorAwayFromEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength - sizeof(ulong)); ref byte firstVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAwayFromEnd) ? ref halfVectorAwayFromEnd @@ -438,35 +709,89 @@ internal static int LastIndexOfAnyVectorized(ref byte searchSpace, int if (searchSpaceLength > Vector128.Count) { - // Process the input in chunks of 16 bytes. - // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. - // Let the fallback below handle it instead. This is why the condition is - // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". - ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); - - do + if (Avx2.IsSupported) { - currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count); + Vector256 bitmap256_0 = Vector256.Create(bitmap0, bitmap0); + Vector256 bitmap256_1 = Vector256.Create(bitmap1, bitmap1); - Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + if (searchSpaceLength > Vector256.Count) + { + // Process the input in chunks of 32 bytes. + // If the input length is a multiple of 32, don't consume the last 32 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector256.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector256.Count); + + Vector256 source = Vector256.LoadUnsafe(ref currentSearchSpace); + + Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); + if (result != Vector256.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } + } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); + } - Vector128 result = IndexOfAnyLookup(source, bitmap0, bitmap1); - if (result != Vector128.Zero) + // We have 1-32 bytes remaining. Process the first and last half vectors in the search space. + // They may overlap, but we'll handle that in the index calculation if we do get a match. + Debug.Assert(searchSpaceLength >= Vector128.Count, "We expect that the input is long enough for us to load a Vector128."); { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + + ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) + ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count) + : ref searchSpace; + + Vector128 source0 = Vector128.LoadUnsafe(ref searchSpace); + Vector128 source1 = Vector128.LoadUnsafe(ref secondVector); + Vector256 source = Vector256.Create(source0, source1); + + Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); + if (result != Vector256.Zero) + { + return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + } } + + return -1; + } + else + { + // Process the input in chunks of 16 bytes. + // If the input length is a multiple of 16, don't consume the last 16 characters in this loop. + // Let the fallback below handle it instead. This is why the condition is + // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan". + ref byte vectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count); + + do + { + currentSearchSpace = ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count); + + Vector128 source = Vector128.LoadUnsafe(ref currentSearchSpace); + + Vector128 result = IndexOfAnyLookup(source, bitmap0, bitmap1); + if (result != Vector128.Zero) + { + return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + } + } + while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } - while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); } // We have 1-16 bytes remaining. Process the first and last half vectors in the search space. // They may overlap, but we'll handle that in the index calculation if we do get a match. Debug.Assert(searchSpaceLength >= sizeof(ulong), "We expect that the input is long enough for us to load a ulong."); { - ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, Vector128.Count / 2); + ref byte halfVectorAfterStart = ref Unsafe.Add(ref searchSpace, sizeof(ulong)); ref byte secondVector = ref Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref halfVectorAfterStart) - ? ref Unsafe.Subtract(ref currentSearchSpace, Vector128.Count) + ? ref Unsafe.Subtract(ref currentSearchSpace, sizeof(ulong)) : ref searchSpace; ulong source0 = Unsafe.ReadUnaligned(ref searchSpace); @@ -558,6 +883,54 @@ private static Vector128 IndexOfAnyLookupCore(Vector128 source, Vect return result; } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 IndexOfAnyLookup(Vector256 source0, Vector256 source1, Vector256 bitmapLookup) + where TNegator : struct, INegator + where TOptimizations : struct, IOptimizations + { + // See comments in IndexOfAnyLookup(Vector128) above for more details. + Vector256 source = Avx2.PackUnsignedSaturate(source0, source1); + Vector256 result = IndexOfAnyLookupCore(source, bitmapLookup); + + if (TOptimizations.NeedleContainsZero) + { + Vector256 ascii0 = Vector256.LessThan(source0.AsUInt16(), Vector256.Create((ushort)128)).AsInt16(); + Vector256 ascii1 = Vector256.LessThan(source1.AsUInt16(), Vector256.Create((ushort)128)).AsInt16(); + Vector256 ascii = Avx2.PackSignedSaturate(ascii0, ascii1).AsByte(); + result &= ascii; + } + + return TNegator.NegateIfNeeded(result); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 IndexOfAnyLookup(Vector256 source, Vector256 bitmapLookup) + where TNegator : struct, INegator + where TOptimizations : struct, IOptimizations + { + // See comments in IndexOfAnyLookup(Vector128) above for more details. + Vector256 result = IndexOfAnyLookupCore(source, bitmapLookup); + + if (TOptimizations.NeedleContainsZero) + { + Vector256 ascii = Vector256.LessThan(source, Vector256.Create((byte)128)); + result &= ascii; + } + + return TNegator.NegateIfNeeded(result); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 IndexOfAnyLookupCore(Vector256 source, Vector256 bitmapLookup) + { + // See comments in IndexOfAnyLookupCore(Vector128) above for more details. + Vector256 highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF); + Vector256 bitMask = Avx2.Shuffle(bitmapLookup, source); + Vector256 bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles); + Vector256 result = bitMask & bitPositions; + return result; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 IndexOfAnyLookup(Vector128 source, Vector128 bitmapLookup0, Vector128 bitmapLookup1) where TNegator : struct, INegator @@ -572,14 +945,36 @@ private static Vector128 IndexOfAnyLookup(Vector128 source Vector128 bitmask = Shuffle(Vector128.Create(0x8040201008040201).AsByte(), highNibbles); - Vector128 mask = Vector128.LessThan(highNibbles, Vector128.Create((byte)0x8)); - Vector128 bitsets = Vector128.ConditionalSelect(mask, row0, row1); + Vector128 mask = Vector128.GreaterThan(highNibbles.AsSByte(), Vector128.Create((sbyte)0x7)).AsByte(); + Vector128 bitsets = Vector128.ConditionalSelect(mask, row1, row0); Vector128 result = Vector128.Equals(bitsets & bitmask, bitmask); return TNegator.NegateIfNeeded(result); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector256 IndexOfAnyLookup(Vector256 source, Vector256 bitmapLookup0, Vector256 bitmapLookup1) + where TNegator : struct, INegator + { + // http://0x80.pl/articles/simd-byte-lookup.html#universal-algorithm + + Vector256 lowNibbles = source & Vector256.Create((byte)0xF); + Vector256 highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF); + + Vector256 row0 = Avx2.Shuffle(bitmapLookup0, lowNibbles); + Vector256 row1 = Avx2.Shuffle(bitmapLookup1, lowNibbles); + + Vector256 bitmask = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles); + + Vector256 mask = Vector256.GreaterThan(highNibbles.AsSByte(), Vector256.Create((sbyte)0x7)).AsByte(); + Vector256 bitsets = Vector256.ConditionalSelect(mask, row1, row0); + + Vector256 result = Vector256.Equals(bitsets & bitmask, bitmask); + + return TNegator.NegateIfNeeded(result); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 Shuffle(Vector128 vector, Vector128 indices) { @@ -639,18 +1034,105 @@ private static unsafe int ComputeLastIndexOverlapped(ref T searchSp return offsetInVector - Vector128.Count + (int)(Unsafe.ByteOffset(ref searchSpace, ref secondVector) / sizeof(T)); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe int ComputeFirstIndex(ref T searchSpace, ref T current, Vector256 result) + where TNegator : struct, INegator + { + uint mask = TNegator.ExtractMask(result); + if (typeof(T) == typeof(short)) + { + mask = FixUpPackedVector256Mask(mask); + } + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + return offsetInVector + (int)(Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(T)); + } + +#pragma warning disable IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228 + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe int ComputeFirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector256 result) + where TNegator : struct, INegator + { + uint mask = TNegator.ExtractMask(result); + if (typeof(T) == typeof(short)) + { + mask = FixUpPackedVector256Mask(mask); + } + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + if (offsetInVector >= Vector256.Count) + { + // We matched within the second vector + current0 = ref current1; + offsetInVector -= Vector256.Count; + } + return offsetInVector + (int)(Unsafe.ByteOffset(ref searchSpace, ref current0) / sizeof(T)); + } +#pragma warning restore IDE0060 // https://github.com/dotnet/roslyn-analyzers/issues/6228 + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe int ComputeLastIndex(ref T searchSpace, ref T current, Vector256 result) + where TNegator : struct, INegator + { + uint mask = TNegator.ExtractMask(result); + if (typeof(T) == typeof(short)) + { + mask = FixUpPackedVector256Mask(mask); + } + + int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + return offsetInVector + (int)(Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(T)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe int ComputeLastIndexOverlapped(ref T searchSpace, ref T secondVector, Vector256 result) + where TNegator : struct, INegator + { + uint mask = TNegator.ExtractMask(result); + if (typeof(T) == typeof(short)) + { + mask = FixUpPackedVector256Mask(mask); + } + + int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + if (offsetInVector < Vector256.Count) + { + return offsetInVector; + } + + // We matched within the second vector + return offsetInVector - Vector256.Count + (int)(Unsafe.ByteOffset(ref searchSpace, ref secondVector) / sizeof(T)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static uint FixUpPackedVector256Mask(uint mask) + { + Debug.Assert(Avx2.IsSupported); + // Avx2.PackUnsignedSaturate(Vector256.Create((short)1), Vector256.Create((short)2)) will result in + // 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 + // We want to swap the X and Y bits + // 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2 + const uint CorrectPositionsMask = 0xFF0000FF; + + return (mask & CorrectPositionsMask) | BinaryPrimitives.ReverseEndianness(mask & ~CorrectPositionsMask); + } + internal interface INegator { static abstract bool NegateIfNeeded(bool result); static abstract Vector128 NegateIfNeeded(Vector128 result); + static abstract Vector256 NegateIfNeeded(Vector256 result); static abstract uint ExtractMask(Vector128 result); + static abstract uint ExtractMask(Vector256 result); } internal readonly struct DontNegate : INegator { public static bool NegateIfNeeded(bool result) => result; public static Vector128 NegateIfNeeded(Vector128 result) => result; + public static Vector256 NegateIfNeeded(Vector256 result) => result; public static uint ExtractMask(Vector128 result) => ~Vector128.Equals(result, Vector128.Zero).ExtractMostSignificantBits(); + public static uint ExtractMask(Vector256 result) => ~Vector256.Equals(result, Vector256.Zero).ExtractMostSignificantBits(); } internal readonly struct Negate : INegator @@ -659,7 +1141,9 @@ internal interface INegator // This is intentionally testing for equality with 0 instead of "~result". // We want to know if any character didn't match, as that means it should be treated as a match for the -Except method. public static Vector128 NegateIfNeeded(Vector128 result) => Vector128.Equals(result, Vector128.Zero); + public static Vector256 NegateIfNeeded(Vector256 result) => Vector256.Equals(result, Vector256.Zero); public static uint ExtractMask(Vector128 result) => result.ExtractMostSignificantBits(); + public static uint ExtractMask(Vector256 result) => result.ExtractMostSignificantBits(); } internal interface IOptimizations