diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs index b9efaf24ee1a2..05a52499925bf 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs @@ -847,7 +847,7 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int continue; } - goto Difference; + goto VectorMatch; } // Move to Vector length from end for final compare @@ -862,11 +862,13 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int goto NotFound; } - Difference: + VectorMatch: offset += (nuint)LocateFirstFoundByte(search); + goto Found; } - goto Found; + Debug.Fail("Unreachable"); + goto NotFound; } [MethodImpl(MethodImplOptions.AggressiveOptimization)] @@ -875,27 +877,38 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, byt Debug.Assert(length >= 0); uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions - uint uValue1 = value1; - uint uValue2 = value2; + uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions + uint uValue2 = value2; // Use uint for comparisons to avoid unnecessary 8->32 extensions nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations nuint lengthToExamine = (nuint)(uint)length; if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported) { // Avx2 branch also operates on Sse2 sizes, so check is combined. - if (length >= Vector128.Count * 2) + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) { - lengthToExamine = UnalignedCountVector128(ref searchSpace); + // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. 
+ // We jump forward to the intrinsics at the end of the method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; } } else if (Vector.IsHardwareAccelerated) { - if (length >= Vector.Count * 2) + // Calculate lengthToExamine here for test, as it is used later + nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) { - lengthToExamine = UnalignedCountVector(ref searchSpace); + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; } } - SequentialScan: + uint lookUp; while (lengthToExamine >= 8) { @@ -951,216 +964,254 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, byt while (lengthToExamine > 0) { - lengthToExamine -= 1; - lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); if (uValue0 == lookUp || uValue1 == lookUp || uValue2 == lookUp) goto Found; offset += 1; + lengthToExamine -= 1; } - if (Avx2.IsSupported) + NotFound: + return -1; + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)offset; + Found1: + return (int)(offset + 1); + Found2: + return (int)(offset + 2); + Found3: + return (int)(offset + 3); + Found4: + return (int)(offset + 4); + Found5: + return (int)(offset + 5); + Found6: + return (int)(offset + 6); + Found7: + return (int)(offset + 7); + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. 
+ if (Sse2.IsSupported) { - if (offset < (nuint)(uint)length) + int matches; + if (Avx2.IsSupported) { - lengthToExamine = GetByteVector256SpanLength(offset, length); - if (lengthToExamine > offset) + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) { Vector256 values0 = Vector256.Create(value0); Vector256 values1 = Vector256.Create(value1); Vector256 values2 = Vector256.Create(value2); - do - { - Vector256 search = LoadVector256(ref searchSpace, offset); - Vector256 matches0 = Avx2.CompareEqual(values0, search); - Vector256 matches1 = Avx2.CompareEqual(values1, search); - Vector256 matches2 = Avx2.CompareEqual(values2, search); - // Bitwise Or to combine the matches and MoveMask to convert them to bitflags - int matches = Avx2.MoveMask(Avx2.Or(Avx2.Or(matches0, matches1), matches2)); + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector256(ref searchSpace, offset); + // Bitwise Or to combine the flagged matches for the second value to our match flags + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)), + Avx2.CompareEqual(values2, search))); // Note that MoveMask has converted the equal vector elements into a set of bit flags, // So the bit position in 'matches' corresponds to the element offset. 
if (matches == 0) { - // Zero flags set so no matches + // None matched offset += (nuint)Vector256.Count; continue; } - // Find bitflag offset of first match and add to current offset - return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); - } while (lengthToExamine > offset); - } - - lengthToExamine = GetByteVector128SpanLength(offset, length); - if (lengthToExamine > offset) - { - Vector128 values0 = Vector128.Create(value0); - Vector128 values1 = Vector128.Create(value1); - Vector128 values2 = Vector128.Create(value2); - - Vector128 search = LoadVector128(ref searchSpace, offset); + goto IntrinsicsMatch; + } - Vector128 matches0 = Sse2.CompareEqual(values0, search); - Vector128 matches1 = Sse2.CompareEqual(values1, search); - Vector128 matches2 = Sse2.CompareEqual(values2, search); - // Same method as above - int matches = Sse2.MoveMask(Sse2.Or(Sse2.Or(matches0, matches1), matches2)); + // Move to Vector length from end for final compare + search = LoadVector256(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same method as above + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search)), + Avx2.CompareEqual(values2, search))); if (matches == 0) { - // Zero flags set so no matches - offset += (nuint)Vector128.Count; - } - else - { - // Find bitflag offset of first match and add to current offset - return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + // None matched + goto NotFound; } - } - if (offset < (nuint)(uint)length) - { - lengthToExamine = ((nuint)(uint)length - offset); - goto SequentialScan; + goto IntrinsicsMatch; } } - } - else if (Sse2.IsSupported) - { - if (offset < (nuint)(uint)length) - { - lengthToExamine = GetByteVector128SpanLength(offset, length); + // Initial size check was done on method entry. 
+ Debug.Assert(length >= Vector128.Count); + { + Vector128 search; Vector128 values0 = Vector128.Create(value0); Vector128 values1 = Vector128.Create(value1); Vector128 values2 = Vector128.Create(value2); - + // First time this checks against 0 and we will move into final compare if it fails. while (lengthToExamine > offset) { - Vector128 search = LoadVector128(ref searchSpace, offset); + search = LoadVector128(ref searchSpace, offset); - Vector128 matches0 = Sse2.CompareEqual(values0, search); - Vector128 matches1 = Sse2.CompareEqual(values1, search); - Vector128 matches2 = Sse2.CompareEqual(values2, search); - // Same method as above - int matches = Sse2.MoveMask(Sse2.Or(Sse2.Or(matches0, matches1), matches2)); + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)), + Sse2.CompareEqual(values2, search))); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. 
if (matches == 0) { - // Zero flags set so no matches + // None matched offset += (nuint)Vector128.Count; continue; } - // Find bitflag offset of first match and add to current offset - return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + goto IntrinsicsMatch; } - - if (offset < (nuint)(uint)length) + // Move to Vector length from end for final compare + search = LoadVector128(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same method as above + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search)), + Sse2.CompareEqual(values2, search))); + if (matches == 0) { - lengthToExamine = ((nuint)(uint)length - offset); - goto SequentialScan; + // None matched + goto NotFound; } } + + IntrinsicsMatch: + // Find bitflag offset of first match and add to current offset + offset += (nuint)BitOperations.TrailingZeroCount(matches); + goto Found; } else if (AdvSimd.Arm64.IsSupported) { - if (offset < (nuint)(uint)length) - { - lengthToExamine = GetByteVector128SpanLength(offset, length); + // Mask to help find the first lane in matches that is set. + // LSB 0x01 corresponds to lane 0, 0x10 - to lane 1, and so on. + Vector128 mask = Vector128.Create((ushort)0x1001).AsByte(); + int matchedLane = 0; - // Mask to help find the first lane in compareResult that is set. - // LSB 0x01 corresponds to lane 0, 0x10 - to lane 1, and so on. - Vector128 mask = Vector128.Create((ushort)0x1001).AsByte(); - int matchedLane = 0; + Vector128 search; + Vector128 matches; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + Vector128 values2 = Vector128.Create(value2); + // First time this checks against 0 and we will move into final compare if it fails. 
+ while (lengthToExamine > offset) + { + search = LoadVector128(ref searchSpace, offset); - Vector128 values0 = Vector128.Create(value0); - Vector128 values1 = Vector128.Create(value1); - Vector128 values2 = Vector128.Create(value2); + matches = AdvSimd.Or( + AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)), + AdvSimd.CompareEqual(values2, search)); - while (lengthToExamine > offset) + if (!TryFindFirstMatchedLane(mask, matches, ref matchedLane)) { - Vector128 search = LoadVector128(ref searchSpace, offset); - - // Same method as above - Vector128 matches0 = AdvSimd.CompareEqual(values0, search); - Vector128 matches1 = AdvSimd.CompareEqual(values1, search); - Vector128 matches2 = AdvSimd.CompareEqual(values2, search); + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + continue; + } - Vector128 compareResult = AdvSimd.Or(AdvSimd.Or(matches0, matches1), matches2); + // Find bitflag offset of first match and add to current offset + offset += (uint)matchedLane; - if (!TryFindFirstMatchedLane(mask, compareResult, ref matchedLane)) - { - // Zero flags set so no matches - offset += (nuint)Vector128.Count; - continue; - } + goto Found; + } - // Find bitflag offset of first match and add to current offset - return (int)(offset + (uint)matchedLane); - } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same method as above + matches = AdvSimd.Or( + AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)), + AdvSimd.CompareEqual(values2, search)); - if (offset < (nuint)(uint)length) - { - lengthToExamine = ((nuint)(uint)length - offset); - goto SequentialScan; - } + if (!TryFindFirstMatchedLane(mask, matches, ref matchedLane)) + { + // None matched + goto NotFound; } + + // Find bitflag offset of first match and add to current offset + offset += (nuint)(uint)matchedLane; + + 
goto Found; } else if (Vector.IsHardwareAccelerated) { - if (offset < (nuint)(uint)length) - { - lengthToExamine = GetByteVectorSpanLength(offset, length); - - Vector values0 = new Vector(value0); - Vector values1 = new Vector(value1); - Vector values2 = new Vector(value2); + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); + Vector values2 = new Vector(value2); - while (lengthToExamine > offset) + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector(ref searchSpace, offset); + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + if (Vector.Zero.Equals(search)) { - Vector search = LoadVector(ref searchSpace, offset); - - var matches = Vector.BitwiseOr( - Vector.BitwiseOr( - Vector.Equals(search, values0), - Vector.Equals(search, values1)), - Vector.Equals(search, values2)); - - if (Vector.Zero.Equals(matches)) - { - offset += (nuint)Vector.Count; - continue; - } - - // Find offset of first match and add to current offset - return (int)offset + LocateFirstFoundByte(matches); + // None matched + offset += (nuint)Vector.Count; + continue; } - if (offset < (nuint)(uint)length) - { - lengthToExamine = ((nuint)(uint)length - offset); - goto SequentialScan; - } + goto VectorMatch; + } + + // Move to Vector length from end for final compare + search = LoadVector(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)), + Vector.Equals(search, values2)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; } + + VectorMatch: + offset += (nuint)LocateFirstFoundByte(search); + goto Found; } - return -1; - Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 - return (int)offset; - 
Found1: - return (int)(offset + 1); - Found2: - return (int)(offset + 2); - Found3: - return (int)(offset + 3); - Found4: - return (int)(offset + 4); - Found5: - return (int)(offset + 5); - Found6: - return (int)(offset + 6); - Found7: - return (int)(offset + 7); + + Debug.Fail("Unreachable"); + goto NotFound; } public static int LastIndexOfAny(ref byte searchSpace, byte value0, byte value1, int length)