Use Vector types rather than Custom blocks
benaadams committed Oct 16, 2019
1 parent 14a981f commit 5b90be6
Showing 3 changed files with 21 additions and 262 deletions.
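
The substance of the change, as a minimal self-contained sketch rather than the repository code: the hand-rolled fixed-size block structs are dropped in favour of using-aliases for the existing SIMD vector types, so the Unsafe.As<byte, Block16> copies keep compiling while the #if HAS_CUSTOM_BLOCKS guards and struct definitions disappear. OldBlock16 and BlockTypes below are illustrative names introduced only for this comparison.

using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using Block16 = System.Runtime.Intrinsics.Vector128<byte>; // added by this commit
using Block32 = System.Runtime.Intrinsics.Vector256<byte>; // added by this commit

static class BlockTypes
{
    // The pattern being removed: an empty struct whose only purpose is its declared Size.
    [StructLayout(LayoutKind.Sequential, Size = 16)]
    private struct OldBlock16 { }

    static void Main()
    {
        // Both shapes are 16-byte blittable blobs, so reinterpreting byte refs
        // through Unsafe.As works the same way with either definition.
        Console.WriteLine(Unsafe.SizeOf<OldBlock16>()); // 16
        Console.WriteLine(Unsafe.SizeOf<Block16>());    // 16
        Console.WriteLine(Unsafe.SizeOf<Block32>());    // 32
    }
}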
2 changes: 1 addition & 1 deletion src/System.Private.CoreLib/shared/System/Buffer.Unix.cs
@@ -20,7 +20,7 @@ public static partial class Buffer
#elif ARM
private const nuint MemmoveNativeThreshold = 512;
#else
private const nuint MemmoveNativeThreshold = 2048;
private const nuint MemmoveNativeThreshold = 4096;
#endif
}
}
2 changes: 1 addition & 1 deletion src/System.Private.CoreLib/shared/System/Buffer.Windows.cs
@@ -17,7 +17,7 @@ public static partial class Buffer
// https://github.com/dotnet/coreclr/issues/13843
private const nuint MemmoveNativeThreshold = ulong.MaxValue;
#else
private const nuint MemmoveNativeThreshold = 2048;
private const nuint MemmoveNativeThreshold = 4096;
#endif
}
}
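
Both threshold bumps feed the same dispatch in Buffer.cs below: copies of at most MemmoveNativeThreshold bytes stay on the managed block-copy path, while longer (or overlapping) buffers fall back to the native memmove. A minimal runnable sketch of that length check follows; ThresholdDemo and Dispatch are hypothetical names, and the real Memmove also handles overlap and the sub-64-byte sizes before reaching this comparison.

using System;

static class ThresholdDemo
{
    // The value this commit moves to on the code paths changed above (previously 2048).
    private const ulong MemmoveNativeThreshold = 4096;

    static string Dispatch(ulong len) =>
        len > MemmoveNativeThreshold
            ? "native memmove via P/Invoke"
            : "managed block-copy loop";

    static void Main()
    {
        Console.WriteLine(Dispatch(1024)); // managed under both the old and new thresholds
        Console.WriteLine(Dispatch(3000)); // managed now; went native when the threshold was 2048
        Console.WriteLine(Dispatch(8192)); // still native
    }
}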
279 changes: 19 additions & 260 deletions src/System.Private.CoreLib/shared/System/Buffer.cs
@@ -2,14 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

#if AMD64 || ARM64 || (BIT32 && !ARM)
#define HAS_CUSTOM_BLOCKS
#endif

using System.Diagnostics;
using System.Runtime;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

using Internal.Runtime.CompilerServices;

@@ -20,6 +16,8 @@
using nint = System.Int32;
using nuint = System.UInt32;
#endif
using Block16 = System.Runtime.Intrinsics.Vector128<byte>;
using Block32 = System.Runtime.Intrinsics.Vector256<byte>;

namespace System
{
@@ -87,7 +85,7 @@ internal static unsafe void ZeroMemory(byte* dest, nuint len)

// The attributes on this method are chosen for best JIT performance.
// Please do not edit unless intentional.
[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CLSCompliant(false)]
public static unsafe void MemoryCopy(void* source, void* destination, long destinationSizeInBytes, long sourceBytesToCopy)
{
@@ -100,7 +98,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, long destinationSizeInBytes, long sourceBytesToCopy)

// The attributes on this method are chosen for best JIT performance.
// Please do not edit unless intentional.
[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CLSCompliant(false)]
public static unsafe void MemoryCopy(void* source, void* destination, ulong destinationSizeInBytes, ulong sourceBytesToCopy)
{
@@ -141,171 +139,7 @@ internal static unsafe void Memcpy(byte* pDest, int destIndex, byte[] src, int s

// This method has different signature for x64 and other platforms and is done for performance reasons.
internal static unsafe void Memmove(byte* dest, byte* src, nuint len)
{
// P/Invoke into the native version when the buffers are overlapping.
if (((nuint)dest - (nuint)src < len) || ((nuint)src - (nuint)dest < len))
{
goto PInvoke;
}

byte* srcEnd = src + len;
byte* destEnd = dest + len;

if (len <= 16) goto MCPY02;
if (len > 64) goto MCPY05;

MCPY00:
// Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
Debug.Assert(len > 16 && len <= 64);
#if HAS_CUSTOM_BLOCKS
*(Block16*)dest = *(Block16*)src; // [0,16]
#elif BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8); // [0,16]
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12); // [0,16]
#endif
if (len <= 32) goto MCPY01;
#if HAS_CUSTOM_BLOCKS
*(Block16*)(dest + 16) = *(Block16*)(src + 16); // [0,32]
#elif BIT64
*(long*)(dest + 16) = *(long*)(src + 16);
*(long*)(dest + 24) = *(long*)(src + 24); // [0,32]
#else
*(int*)(dest + 16) = *(int*)(src + 16);
*(int*)(dest + 20) = *(int*)(src + 20);
*(int*)(dest + 24) = *(int*)(src + 24);
*(int*)(dest + 28) = *(int*)(src + 28); // [0,32]
#endif
if (len <= 48) goto MCPY01;
#if HAS_CUSTOM_BLOCKS
*(Block16*)(dest + 32) = *(Block16*)(src + 32); // [0,48]
#elif BIT64
*(long*)(dest + 32) = *(long*)(src + 32);
*(long*)(dest + 40) = *(long*)(src + 40); // [0,48]
#else
*(int*)(dest + 32) = *(int*)(src + 32);
*(int*)(dest + 36) = *(int*)(src + 36);
*(int*)(dest + 40) = *(int*)(src + 40);
*(int*)(dest + 44) = *(int*)(src + 44); // [0,48]
#endif

MCPY01:
// Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
Debug.Assert(len > 16 && len <= 64);
#if HAS_CUSTOM_BLOCKS
*(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
#elif BIT64
*(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
*(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
#else
*(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
*(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
*(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
*(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
#endif
return;

MCPY02:
// Copy the first 8 bytes and then unconditionally copy the last 8 bytes and return.
if ((len & 24) == 0) goto MCPY03;
Debug.Assert(len >= 8 && len <= 16);
#if BIT64
*(long*)dest = *(long*)src;
*(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
*(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
#endif
return;

MCPY03:
// Copy the first 4 bytes and then unconditionally copy the last 4 bytes and return.
if ((len & 4) == 0) goto MCPY04;
Debug.Assert(len >= 4 && len < 8);
*(int*)dest = *(int*)src;
*(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
return;

MCPY04:
// Copy the first byte. For pending bytes, do an unconditionally copy of the last 2 bytes and return.
Debug.Assert(len < 4);
if (len == 0) return;
*dest = *src;
if ((len & 2) == 0) return;
*(short*)(destEnd - 2) = *(short*)(srcEnd - 2);
return;

MCPY05:
// PInvoke to the native version when the copy length exceeds the threshold.
if (len > MemmoveNativeThreshold)
{
goto PInvoke;
}

// Copy 64-bytes at a time until the remainder is less than 64.
// If remainder is greater than 16 bytes, then jump to MCPY00. Otherwise, unconditionally copy the last 16 bytes and return.
Debug.Assert(len > 64 && len <= MemmoveNativeThreshold);
nuint n = len >> 6;

MCPY06:
#if HAS_CUSTOM_BLOCKS
*(Block64*)dest = *(Block64*)src;
#elif BIT64
*(long*)dest = *(long*)src;
*(long*)(dest + 8) = *(long*)(src + 8);
*(long*)(dest + 16) = *(long*)(src + 16);
*(long*)(dest + 24) = *(long*)(src + 24);
*(long*)(dest + 32) = *(long*)(src + 32);
*(long*)(dest + 40) = *(long*)(src + 40);
*(long*)(dest + 48) = *(long*)(src + 48);
*(long*)(dest + 56) = *(long*)(src + 56);
#else
*(int*)dest = *(int*)src;
*(int*)(dest + 4) = *(int*)(src + 4);
*(int*)(dest + 8) = *(int*)(src + 8);
*(int*)(dest + 12) = *(int*)(src + 12);
*(int*)(dest + 16) = *(int*)(src + 16);
*(int*)(dest + 20) = *(int*)(src + 20);
*(int*)(dest + 24) = *(int*)(src + 24);
*(int*)(dest + 28) = *(int*)(src + 28);
*(int*)(dest + 32) = *(int*)(src + 32);
*(int*)(dest + 36) = *(int*)(src + 36);
*(int*)(dest + 40) = *(int*)(src + 40);
*(int*)(dest + 44) = *(int*)(src + 44);
*(int*)(dest + 48) = *(int*)(src + 48);
*(int*)(dest + 52) = *(int*)(src + 52);
*(int*)(dest + 56) = *(int*)(src + 56);
*(int*)(dest + 60) = *(int*)(src + 60);
#endif
dest += 64;
src += 64;
n--;
if (n != 0) goto MCPY06;

len %= 64;
if (len > 16) goto MCPY00;
#if HAS_CUSTOM_BLOCKS
*(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
#elif BIT64
*(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
*(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
#else
*(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
*(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
*(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
*(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
#endif
return;

PInvoke:
_Memmove(dest, src, len);
}
=> Memmove(ref Unsafe.AsRef<byte>(dest), ref Unsafe.AsRef<byte>(src), len);

// This method has different signature for x64 and other platforms and is done for performance reasons.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -358,58 +192,25 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
MCPY00:
// Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
Debug.Assert(len > 16 && len <= 64);
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref dest) = Unsafe.As<byte, Block16>(ref src); // [0,16]
#elif BIT64
Unsafe.As<byte, long>(ref dest) = Unsafe.As<byte, long>(ref src);
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 8)); // [0,16]
#else
Unsafe.As<byte, int>(ref dest) = Unsafe.As<byte, int>(ref src);
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 4));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 8));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 12)); // [0,16]
#endif

if (len <= 32)
goto MCPY01;
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 16)); // [0,32]
#elif BIT64
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 16));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 24)); // [0,32]
#else
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 16));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 20)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 20));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 24));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 28)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 28)); // [0,32]
#endif

if (len <= 48)
goto MCPY01;
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 32)); // [0,48]
#elif BIT64
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 32));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 40)); // [0,48]
#else
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 32));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 36)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 36));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 40));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 44)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 44)); // [0,48]
#endif

MCPY01:
// Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
Debug.Assert(len > 16 && len <= 64);
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
#elif BIT64
Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -16));
Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
#else
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -16));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -12));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
#endif

return;

MCPY02:
@@ -461,35 +262,9 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
nuint n = len >> 6;

MCPY06:
#if HAS_CUSTOM_BLOCKS
Unsafe.As<byte, Block64>(ref dest) = Unsafe.As<byte, Block64>(ref src);
#elif BIT64
Unsafe.As<byte, long>(ref dest) = Unsafe.As<byte, long>(ref src);
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 8));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 16));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 24));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 32));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 40));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 48)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 48));
Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 56)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 56));
#else
Unsafe.As<byte, int>(ref dest) = Unsafe.As<byte, int>(ref src);
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 4));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 8));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 12));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 16));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 20)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 20));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 24));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 28)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 28));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 32));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 36)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 36));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 40));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 44)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 44));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 48)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 48));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 52)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 52));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 56)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 56));
Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 60)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 60));
#endif
Unsafe.As<byte, Block32>(ref dest) = Unsafe.As<byte, Block32>(ref src);
Unsafe.As<byte, Block32>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, Block32>(ref Unsafe.Add(ref src, 32));

dest = ref Unsafe.Add(ref dest, 64);
src = ref Unsafe.Add(ref src, 64);
n--;
@@ -499,17 +274,9 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
len %= 64;
if (len > 16)
goto MCPY00;
#if HAS_CUSTOM_BLOCKS

Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
#elif BIT64
Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -16));
Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
#else
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -16));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -12));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
#endif

return;

BuffersOverlap:
@@ -525,28 +292,20 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)

// Non-inlinable wrapper around the QCall that avoids polluting the fast path
// with P/Invoke prolog/epilog.
[MethodImplAttribute(MethodImplOptions.NoInlining)]
[MethodImpl(MethodImplOptions.NoInlining)]
private static unsafe void _Memmove(byte* dest, byte* src, nuint len)
{
__Memmove(dest, src, len);
}

// Non-inlinable wrapper around the QCall that avoids polluting the fast path
// with P/Invoke prolog/epilog.
[MethodImplAttribute(MethodImplOptions.NoInlining)]
[MethodImpl(MethodImplOptions.NoInlining)]
private static unsafe void _Memmove(ref byte dest, ref byte src, nuint len)
{
fixed (byte* pDest = &dest)
fixed (byte* pSrc = &src)
__Memmove(pDest, pSrc, len);
}

#if HAS_CUSTOM_BLOCKS
[StructLayout(LayoutKind.Sequential, Size = 16)]
private struct Block16 { }

[StructLayout(LayoutKind.Sequential, Size = 64)]
private struct Block64 { }
#endif // HAS_CUSTOM_BLOCKS
}
}
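
For reference, the shape of the new 64-byte inner loop (MCPY06) is sketched below as a minimal stand-alone example rather than the BCL code: each iteration moves 64 bytes as two Vector256<byte> ("Block32") copies through Unsafe.As, and the sub-64-byte tail is left for the smaller copy paths. Copy64ByteBlocks and the demo harness are illustrative names only.

using System;
using System.Runtime.CompilerServices;
using Block32 = System.Runtime.Intrinsics.Vector256<byte>;

static class VectorBlockCopy
{
    // Copies len bytes rounded down to a multiple of 64 and returns the leftover tail length.
    static int Copy64ByteBlocks(ref byte dest, ref byte src, int len)
    {
        int n = len >> 6;                       // number of whole 64-byte blocks
        while (n-- != 0)
        {
            // Two 32-byte vector moves per iteration, mirroring the new MCPY06 body.
            Unsafe.As<byte, Block32>(ref dest) = Unsafe.As<byte, Block32>(ref src);
            Unsafe.As<byte, Block32>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, Block32>(ref Unsafe.Add(ref src, 32));
            dest = ref Unsafe.Add(ref dest, 64);
            src = ref Unsafe.Add(ref src, 64);
        }
        return len & 63;                        // tail handled by the 16/8/4-byte paths
    }

    static void Main()
    {
        byte[] source = new byte[200];
        byte[] target = new byte[200];
        new Random(1).NextBytes(source);

        int tail = Copy64ByteBlocks(ref target[0], ref source[0], source.Length);
        bool prefixEqual = source.AsSpan(0, source.Length - tail).SequenceEqual(target.AsSpan(0, source.Length - tail));
        Console.WriteLine($"tail = {tail}, copied prefix equal = {prefixEqual}"); // tail = 8, True
    }
}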
