diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs index 2d1d2278e2965..0a0e8587c2869 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs @@ -603,19 +603,30 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported) { // Avx2 branch also operates on Sse2 sizes, so check is combined. - if (length >= Vector128.Count * 2) + nint vectorDiff = (nint)length - Vector128.Count; + if (vectorDiff >= 0) { - lengthToExamine = UnalignedCountVector128(ref searchSpace); + // >= Sse2 intrinsics are supported, and length is enough to use them so use that path. + // We jump forward to the intrinsics at the end of the method so a naive branch predict + // will choose the non-intrinsic path so short lengths which don't gain anything aren't + // overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths + // more than make this back from the intrinsics. + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; } } else if (Vector.IsHardwareAccelerated) { - if (length >= Vector.Count * 2) + // Calculate lengthToExamine here for test, rather than just testing as it is used later, rather than doing it twice. 
+ nint vectorDiff = (nint)length - Vector.Count; + if (vectorDiff >= 0) { - lengthToExamine = UnalignedCountVector(ref searchSpace); + // Similar as above for Vector version + lengthToExamine = (nuint)vectorDiff; + goto IntrinsicsCompare; } } - SequentialScan: + uint lookUp; while (lengthToExamine >= 8) { @@ -671,204 +682,234 @@ public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int while (lengthToExamine > 0) { - lengthToExamine -= 1; lookUp = Unsafe.AddByteOffset(ref searchSpace, offset); if (uValue0 == lookUp || uValue1 == lookUp) goto Found; offset += 1; + lengthToExamine -= 1; } - // We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow - // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated. - if (Avx2.IsSupported) + NotFound: + return -1; + Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 + return (int)offset; + Found1: + return (int)(offset + 1); + Found2: + return (int)(offset + 2); + Found3: + return (int)(offset + 3); + Found4: + return (int)(offset + 4); + Found5: + return (int)(offset + 5); + Found6: + return (int)(offset + 6); + Found7: + return (int)(offset + 7); + + IntrinsicsCompare: + // When we move into a Vectorized block, we process everything of Vector size; + // and then for any remainder we do a final compare of Vector size but starting at + // the end and forwards, which may overlap on an earlier compare. + + // We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported. 
+ if (Sse2.IsSupported) { - if (offset < (nuint)(uint)length) + int matches; + if (Avx2.IsSupported) { - lengthToExamine = GetByteVector256SpanLength(offset, length); - if (lengthToExamine > offset) + Vector256 search; + // Guard as we may only have a valid size for Vector128; when we will move to the Sse2 + // We have already subtracted Vector128.Count from lengthToExamine so compare against that + // to see if we have double the size for Vector256.Count + if (lengthToExamine >= (nuint)Vector128.Count) { Vector256 values0 = Vector256.Create(value0); Vector256 values1 = Vector256.Create(value1); - do + + // Subtract Vector128.Count so we have now subtracted Vector256.Count + lengthToExamine -= (nuint)Vector128.Count; + // First time this checks again against 0, however we will move into final compare if it fails. + while (lengthToExamine > offset) { - Vector256 search = LoadVector256(ref searchSpace, offset); - // Bitwise Or to combine the matches and MoveMask to convert them to bitflags - int matches = Avx2.MoveMask( - Avx2.Or( - Avx2.CompareEqual(values0, search), - Avx2.CompareEqual(values1, search))); + search = LoadVector256(ref searchSpace, offset); + // Bitwise Or to combine the flagged matches for the second value to our match flags + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search))); // Note that MoveMask has converted the equal vector elements into a set of bit flags, // So the bit position in 'matches' corresponds to the element offset. 
if (matches == 0) { - // Zero flags set so no matches + // None matched offset += (nuint)Vector256.Count; continue; } - // Find bitflag offset of first match and add to current offset - return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); - } while (lengthToExamine > offset); - } - - lengthToExamine = GetByteVector128SpanLength(offset, length); - if (lengthToExamine > offset) - { - Vector128 values0 = Vector128.Create(value0); - Vector128 values1 = Vector128.Create(value1); + goto IntrinsicsMatch; + } - Vector128 search = LoadVector128(ref searchSpace, offset); - // Same method as above - int matches = Sse2.MoveMask( - Sse2.Or( - Sse2.CompareEqual(values0, search), - Sse2.CompareEqual(values1, search))); + // Move to Vector length from end for final compare + search = LoadVector256(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same method as above + matches = Avx2.MoveMask( + Avx2.Or( + Avx2.CompareEqual(values0, search), + Avx2.CompareEqual(values1, search))); if (matches == 0) { - // Zero flags set so no matches - offset += (nuint)Vector128.Count; + // None matched + goto NotFound; } - else - { - // Find bitflag offset of first match and add to current offset - return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); - } - } - if (offset < (nuint)(uint)length) - { - lengthToExamine = ((nuint)(uint)length - offset); - goto SequentialScan; + goto IntrinsicsMatch; } } - } - else if (Sse2.IsSupported) - { - if (offset < (nuint)(uint)length) - { - lengthToExamine = GetByteVector128SpanLength(offset, length); + // Initial size check was done on method entry. + Debug.Assert(length >= Vector128.Count); + { + Vector128 search; Vector128 values0 = Vector128.Create(value0); Vector128 values1 = Vector128.Create(value1); - + // First time this checks against 0 and we will move into final compare if it fails. 
while (lengthToExamine > offset) { - Vector128 search = LoadVector128(ref searchSpace, offset); - // Same method as above - int matches = Sse2.MoveMask( + search = LoadVector128(ref searchSpace, offset); + + matches = Sse2.MoveMask( Sse2.Or( Sse2.CompareEqual(values0, search), - Sse2.CompareEqual(values1, search))); + Sse2.CompareEqual(values1, search)) + .AsByte()); + // Note that MoveMask has converted the equal vector elements into a set of bit flags, + // So the bit position in 'matches' corresponds to the element offset. if (matches == 0) { - // Zero flags set so no matches + // None matched offset += (nuint)Vector128.Count; continue; } - // Find bitflag offset of first match and add to current offset - return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches)); + goto IntrinsicsMatch; } - - if (offset < (nuint)(uint)length) + // Move to Vector length from end for final compare + search = LoadVector128(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same method as above + matches = Sse2.MoveMask( + Sse2.Or( + Sse2.CompareEqual(values0, search), + Sse2.CompareEqual(values1, search))); + if (matches == 0) { - lengthToExamine = ((nuint)(uint)length - offset); - goto SequentialScan; + // None matched + goto NotFound; } } + + IntrinsicsMatch: + // Find bitflag offset of first match and add to current offset + offset += (nuint)BitOperations.TrailingZeroCount(matches); + goto Found; } else if (AdvSimd.Arm64.IsSupported) { - if (offset < (nuint)(uint)length) - { - lengthToExamine = GetByteVector128SpanLength(offset, length); + // Mask to help find the first lane in matches that is set. + // LSB 0x01 corresponds to lane 0, 0x10 - to lane 1, and so on. + Vector128 mask = Vector128.Create((ushort)0x1001).AsByte(); + int matchedLane = 0; - // Mask to help find the first lane in compareResult that is set. - // LSB 0x01 corresponds to lane 0, 0x10 - to lane 1, and so on. 
- Vector128 mask = Vector128.Create((ushort)0x1001).AsByte(); - int matchedLane = 0; + Vector128 search; + Vector128 matches; + Vector128 values0 = Vector128.Create(value0); + Vector128 values1 = Vector128.Create(value1); + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector128(ref searchSpace, offset); - Vector128 values0 = Vector128.Create(value0); - Vector128 values1 = Vector128.Create(value1); + matches = AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)); - while (lengthToExamine > offset) + if (!TryFindFirstMatchedLane(mask, matches, ref matchedLane)) { - Vector128 search = LoadVector128(ref searchSpace, offset); + // Zero flags set so no matches + offset += (nuint)Vector128.Count; + continue; + } - // Same method as above - Vector128 compareResult = AdvSimd.Or( - AdvSimd.CompareEqual(values0, search), - AdvSimd.CompareEqual(values1, search)); + // Find bitflag offset of first match and add to current offset + offset += (uint)matchedLane; - if (!TryFindFirstMatchedLane(mask, compareResult, ref matchedLane)) - { - // Zero flags set so no matches - offset += (nuint)Vector128.Count; - continue; - } + goto Found; + } - // Find bitflag offset of first match and add to current offset - return (int)(offset + (uint)matchedLane); - } + // Move to Vector length from end for final compare + search = LoadVector128(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + // Same method as above + matches = AdvSimd.Or( + AdvSimd.CompareEqual(values0, search), + AdvSimd.CompareEqual(values1, search)); - if (offset < (nuint)(uint)length) - { - lengthToExamine = ((nuint)(uint)length - offset); - goto SequentialScan; - } + if (!TryFindFirstMatchedLane(mask, matches, ref matchedLane)) + { + // None matched + goto NotFound; } + + // Find bitflag offset of first match and add to current offset + offset += (nuint)(uint)matchedLane; + + 
goto Found; } else if (Vector.IsHardwareAccelerated) { - if (offset < (nuint)(uint)length) - { - lengthToExamine = GetByteVectorSpanLength(offset, length); - - Vector values0 = new Vector(value0); - Vector values1 = new Vector(value1); + Vector values0 = new Vector(value0); + Vector values1 = new Vector(value1); - while (lengthToExamine > offset) + Vector search; + // First time this checks against 0 and we will move into final compare if it fails. + while (lengthToExamine > offset) + { + search = LoadVector(ref searchSpace, offset); + search = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(search)) { - Vector search = LoadVector(ref searchSpace, offset); - var matches = Vector.BitwiseOr( - Vector.Equals(search, values0), - Vector.Equals(search, values1)); - if (Vector.Zero.Equals(matches)) - { - offset += (nuint)Vector.Count; - continue; - } - - // Find offset of first match and add to current offset - return (int)offset + LocateFirstFoundByte(matches); + // None matched + offset += (nuint)Vector.Count; + continue; } - if (offset < (nuint)(uint)length) - { - lengthToExamine = ((nuint)(uint)length - offset); - goto SequentialScan; - } + goto Difference; } + + // Move to Vector length from end for final compare + search = LoadVector(ref searchSpace, lengthToExamine); + offset = lengthToExamine; + search = Vector.BitwiseOr( + Vector.Equals(search, values0), + Vector.Equals(search, values1)); + if (Vector.Zero.Equals(search)) + { + // None matched + goto NotFound; + } + + Difference: + offset += (nuint)LocateFirstFoundByte(search); } - return -1; - Found: // Workaround for https://github.com/dotnet/runtime/issues/8795 - return (int)offset; - Found1: - return (int)(offset + 1); - Found2: - return (int)(offset + 2); - Found3: - return (int)(offset + 3); - Found4: - return (int)(offset + 4); - Found5: - return (int)(offset + 5); - Found6: - return (int)(offset + 6); - Found7: - return (int)(offset + 7); + + 
goto Found; } [MethodImpl(MethodImplOptions.AggressiveOptimization)]