Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

port SpanHelpers.IndexOf(ref char, char, int) to Vector128/256 #73368

Merged
merged 1 commit into from
Aug 5, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 16 additions & 48 deletions src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
{
// Input isn't char aligned, we won't be able to align it to a Vector
}
else if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
// Avx2 branch also operates on Sse2 sizes, so check is combined.
// Needs to be double length to allow us to align the data first.
Expand Down Expand Up @@ -575,13 +575,14 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
lengthToExamine--;
}

// We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow
// We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow
// the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw, last time I checked it wasn't needed for JIT - it's smart enough 🙂

if (Avx2.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
if (offset < length)
{
Debug.Assert(length - offset >= Vector128<ushort>.Count);
ref ushort ushortSearchSpace = ref Unsafe.As<char, ushort>(ref searchSpace);
if (((nint)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, (nint)offset)) & (nint)(Vector256<byte>.Count - 1)) != 0)
{
// Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
Expand All @@ -600,10 +601,10 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
// the misalignment that may occur after; to we default to giving the GC a free hand to relocate and
// its up to the caller whether they are operating over fixed data.
Vector128<ushort> values = Vector128.Create((ushort)value);
Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
Vector128<ushort> search = Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset);

// Same method as below
int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
uint matches = Vector128.Equals(values, search).AsByte().ExtractMostSignificantBits();
adamsitnik marked this conversation as resolved.
Show resolved Hide resolved
if (matches == 0)
{
// Zero flags set so no matches
Expand All @@ -624,8 +625,8 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
{
Debug.Assert(lengthToExamine >= Vector256<ushort>.Count);

Vector256<ushort> search = LoadVector256(ref searchSpace, offset);
int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search).AsByte());
Vector256<ushort> search = Vector256.LoadUnsafe(ref ushortSearchSpace, (nuint)offset);
uint matches = Vector256.Equals(values, search).AsByte().ExtractMostSignificantBits();
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
if (matches == 0)
Expand All @@ -648,10 +649,10 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);

Vector128<ushort> values = Vector128.Create((ushort)value);
Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
Vector128<ushort> search = Vector128.LoadUnsafe(ref ushortSearchSpace, (nuint)offset);

// Same method as above
int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
uint matches = Vector128.Equals(values, search).AsByte().ExtractMostSignificantBits();
if (matches == 0)
{
// Zero flags set so no matches
Expand All @@ -673,11 +674,12 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
}
}
}
else if (Sse2.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
if (offset < length)
{
Debug.Assert(length - offset >= Vector128<ushort>.Count);
ref ushort ushortSearchSpace = ref Unsafe.As<char, ushort>(ref searchSpace);

lengthToExamine = GetCharVector128SpanLength(offset, length);
if (lengthToExamine > 0)
Expand All @@ -687,11 +689,11 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
{
Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);

Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
Vector128<ushort> search = Vector128.LoadUnsafe(ref ushortSearchSpace, (uint)offset);

// Same method as above
int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
if (matches == 0)
Vector128<ushort> compareResult = Vector128.Equals(values, search);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question as on the byte PR.

What is the cost of this approach vs doing it every loop for Vector256

Is it better, particularly for large inputs, to do this there as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding is slightly worse compared to the PR (4%):

- uint matches = Vector256.Equals(values, search).AsByte().ExtractMostSignificantBits();
- if (matches == 0)
+ Vector256<ushort> compareResult = Vector256.Equals(values, search);
+ if (compareResult == Vector256<ushort>.Zero)
  {
      // Zero flags set so no matches
      offset += Vector256<ushort>.Count;
      lengthToExamine -= Vector128<ushort>.Count;
      continue;
  }

+ uint matches = compareResult.AsByte().ExtractMostSignificantBits();
BenchmarkDotNet=v0.13.1.1828-nightly, OS=Windows 10 (10.0.18363.2212/1909/November2019Update/19H2)
Intel Xeon CPU E5-1650 v4 3.60GHz, 1 CPU, 12 logical and 6 physical cores
.NET SDK=7.0.100-rc.1.22405.1
  [Host]     : .NET 7.0.0 (7.0.22.40308), X64 RyuJIT AVX2
  Job-ZEKBPR : .NET 7.0.0 (42.42.42.42424), X64 RyuJIT AVX2
  Job-XEENUE : .NET 7.0.0 (42.42.42.42424), X64 RyuJIT AVX2
  Job-FOODNB : .NET 7.0.0 (42.42.42.42424), X64 RyuJIT AVX2

LaunchCount=9
Method Toolchain Size Mean Ratio
IndexOfValue \question\corerun.exe 512 13.09 ns 0.99
IndexOfValue \main\corerun.exe 512 13.18 ns 1.00
IndexOfValue \PR\corerun.exe 512 12.48 ns 0.95

vtune

if (compareResult == Vector128<ushort>.Zero)
{
// Zero flags set so no matches
offset += Vector128<ushort>.Count;
Expand All @@ -701,6 +703,7 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)

// Find bitflag offset of first match and add to current offset,
// flags are in bytes so divide for chars
uint matches = compareResult.AsByte().ExtractMostSignificantBits();
return (int)(offset + ((uint)BitOperations.TrailingZeroCount(matches) / sizeof(char)));
} while (lengthToExamine > 0);
}
Expand All @@ -712,41 +715,6 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
}
}
}
else if (AdvSimd.Arm64.IsSupported)
{
if (offset < length)
{
Debug.Assert(length - offset >= Vector128<ushort>.Count);

lengthToExamine = GetCharVector128SpanLength(offset, length);
if (lengthToExamine > 0)
{
Vector128<ushort> values = Vector128.Create((ushort)value);
do
{
Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);

Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
Vector128<ushort> compareResult = AdvSimd.CompareEqual(values, search);

if (compareResult == Vector128<ushort>.Zero)
{
offset += Vector128<ushort>.Count;
lengthToExamine -= Vector128<ushort>.Count;
continue;
}

return (int)(offset + FindFirstMatchedLane(compareResult));
} while (lengthToExamine > 0);
}

if (offset < length)
{
lengthToExamine = length - offset;
goto SequentialScan;
}
}
}
else if (Vector.IsHardwareAccelerated)
{
if (offset < length)
Expand Down