Port optimization in Buffer.Memmove from CoreCLR
This is just a copy of changes made in dotnet/coreclr#9786
sergiy-k committed May 8, 2017
1 parent 4bb8337 commit 138fe5d
Showing 1 changed file with 161 additions and 195 deletions.
356 changes: 161 additions & 195 deletions src/System.Private.CoreLib/src/System/Buffer.cs
@@ -2,6 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

+#if AMD64 || (BIT32 && !ARM)
+#define HAS_CUSTOM_BLOCKS
+#endif
+
using System;
using System.Runtime;
using System.Diagnostics;
@@ -233,222 +237,176 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len)
            Memmove(dest, src, (nuint)len);
        }

-        internal static unsafe void Memmove(byte* dest, byte* src, nuint len)
-        {
-            // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
-            // This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway.
-            if ((nuint)dest - (nuint)src < len)
-            {
-                _Memmove(dest, src, len);
-                return;
-            }
-
-            //
-            // This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do.
-            //
-
-#if ALIGN_ACCESS
-#error Needs porting for ALIGN_ACCESS (https://github.com/dotnet/corert/issues/430)
-#else // ALIGN_ACCESS
-            switch (len)
-            {
-                case 0:
-                    return;
-                case 1:
-                    *dest = *src;
-                    return;
-                case 2:
-                    *(short*)dest = *(short*)src;
-                    return;
-                case 3:
-                    *(short*)dest = *(short*)src;
-                    *(dest + 2) = *(src + 2);
-                    return;
-                case 4:
-                    *(int*)dest = *(int*)src;
-                    return;
-                case 5:
-                    *(int*)dest = *(int*)src;
-                    *(dest + 4) = *(src + 4);
-                    return;
-                case 6:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
-                    return;
-                case 7:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
-                    *(dest + 6) = *(src + 6);
-                    return;
-                case 8:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    return;
-                case 9:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(dest + 8) = *(src + 8);
-                    return;
-                case 10:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
-                    return;
-                case 11:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
-                    *(dest + 10) = *(src + 10);
-                    return;
-                case 12:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    return;
-                case 13:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(dest + 12) = *(src + 12);
-                    return;
-                case 14:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
-                    return;
-                case 15:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
-                    *(dest + 14) = *(src + 14);
-                    return;
-                case 16:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
-#endif
-                    return;
-                default:
-                    break;
-            }
-
-            // P/Invoke into the native version for large lengths.
-            if (len >= 200)
-            {
-                _Memmove(dest, src, len);
-                return;
-            }
-
-            if (((int)dest & 3) != 0)
-            {
-                if (((int)dest & 1) != 0)
-                {
-                    *dest = *src;
-                    src++;
-                    dest++;
-                    len--;
-                    if (((int)dest & 2) == 0)
-                        goto Aligned;
-                }
-                *(short*)dest = *(short*)src;
-                src += 2;
-                dest += 2;
-                len -= 2;
-                Aligned:;
-            }
-
-#if BIT64
-            if (((int)dest & 4) != 0)
-            {
-                *(int*)dest = *(int*)src;
-                src += 4;
-                dest += 4;
-                len -= 4;
-            }
-#endif
-
-            nuint count = len / 16;
-            while (count > 0)
-            {
-#if BIT64
-                ((long*)dest)[0] = ((long*)src)[0];
-                ((long*)dest)[1] = ((long*)src)[1];
-#else
-                ((int*)dest)[0] = ((int*)src)[0];
-                ((int*)dest)[1] = ((int*)src)[1];
-                ((int*)dest)[2] = ((int*)src)[2];
-                ((int*)dest)[3] = ((int*)src)[3];
-#endif
-                dest += 16;
-                src += 16;
-                count--;
-            }
-
-            if ((len & 8) != 0)
-            {
-#if BIT64
-                ((long*)dest)[0] = ((long*)src)[0];
-#else
-                ((int*)dest)[0] = ((int*)src)[0];
-                ((int*)dest)[1] = ((int*)src)[1];
-#endif
-                dest += 8;
-                src += 8;
-            }
-            if ((len & 4) != 0)
-            {
-                ((int*)dest)[0] = ((int*)src)[0];
-                dest += 4;
-                src += 4;
-            }
-            if ((len & 2) != 0)
-            {
-                ((short*)dest)[0] = ((short*)src)[0];
-                dest += 2;
-                src += 2;
-            }
-            if ((len & 1) != 0)
-                *dest = *src;
-#endif // ALIGN_ACCESS
-        }
+        // This method has different signature for x64 and other platforms and is done for performance reasons.
+        internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
+        {
+#if AMD64 || (BIT32 && !ARM)
+            const nuint CopyThreshold = 2048;
+#else
+            const nuint CopyThreshold = 512;
+#endif // AMD64 || (BIT32 && !ARM)
+
+            // P/Invoke into the native version when the buffers are overlapping.
+
+            if (((nuint)dest - (nuint)src < len) || ((nuint)src - (nuint)dest < len)) goto PInvoke;
+
+            byte* srcEnd = src + len;
+            byte* destEnd = dest + len;
+
+            if (len <= 16) goto MCPY02;
+            if (len > 64) goto MCPY05;
+
+            MCPY00:
+            // Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
+            Debug.Assert(len > 16 && len <= 64);
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)dest = *(Block16*)src;                   // [0,16]
+#elif BIT64
+            *(long*)dest = *(long*)src;
+            *(long*)(dest + 8) = *(long*)(src + 8);             // [0,16]
+#else
+            *(int*)dest = *(int*)src;
+            *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(dest + 8) = *(int*)(src + 8);
+            *(int*)(dest + 12) = *(int*)(src + 12);             // [0,16]
+#endif
+            if (len <= 32) goto MCPY01;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(dest + 16) = *(Block16*)(src + 16);     // [0,32]
+#elif BIT64
+            *(long*)(dest + 16) = *(long*)(src + 16);
+            *(long*)(dest + 24) = *(long*)(src + 24);           // [0,32]
+#else
+            *(int*)(dest + 16) = *(int*)(src + 16);
+            *(int*)(dest + 20) = *(int*)(src + 20);
+            *(int*)(dest + 24) = *(int*)(src + 24);
+            *(int*)(dest + 28) = *(int*)(src + 28);             // [0,32]
+#endif
+            if (len <= 48) goto MCPY01;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(dest + 32) = *(Block16*)(src + 32);     // [0,48]
+#elif BIT64
+            *(long*)(dest + 32) = *(long*)(src + 32);
+            *(long*)(dest + 40) = *(long*)(src + 40);           // [0,48]
+#else
+            *(int*)(dest + 32) = *(int*)(src + 32);
+            *(int*)(dest + 36) = *(int*)(src + 36);
+            *(int*)(dest + 40) = *(int*)(src + 40);
+            *(int*)(dest + 44) = *(int*)(src + 44);             // [0,48]
+#endif
+
+            MCPY01:
+            // Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
+            Debug.Assert(len > 16 && len <= 64);
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
+#elif BIT64
+            *(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
+            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
+#else
+            *(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
+            *(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
+            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
+#endif
+            return;
+
+            MCPY02:
+            // Copy the first 8 bytes and then unconditionally copy the last 8 bytes and return.
+            if ((len & 24) == 0) goto MCPY03;
+            Debug.Assert(len >= 8 && len <= 16);
+#if BIT64
+            *(long*)dest = *(long*)src;
+            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
+#else
+            *(int*)dest = *(int*)src;
+            *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
+#endif
+            return;
+
+            MCPY03:
+            // Copy the first 4 bytes and then unconditionally copy the last 4 bytes and return.
+            if ((len & 4) == 0) goto MCPY04;
+            Debug.Assert(len >= 4 && len < 8);
+            *(int*)dest = *(int*)src;
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
+            return;
+
+            MCPY04:
+            // Copy the first byte. For pending bytes, do an unconditionally copy of the last 2 bytes and return.
+            Debug.Assert(len < 4);
+            if (len == 0) return;
+            *dest = *src;
+            if ((len & 2) == 0) return;
+            *(short*)(destEnd - 2) = *(short*)(srcEnd - 2);
+            return;
+
+            MCPY05:
+            // PInvoke to the native version when the copy length exceeds the threshold.
+            if (len > CopyThreshold)
+            {
+                goto PInvoke;
+            }
+            // Copy 64-bytes at a time until the remainder is less than 64.
+            // If remainder is greater than 16 bytes, then jump to MCPY00. Otherwise, unconditionally copy the last 16 bytes and return.
+            Debug.Assert(len > 64 && len <= CopyThreshold);
+            nuint n = len >> 6;
+
+            MCPY06:
+#if HAS_CUSTOM_BLOCKS
+            *(Block64*)dest = *(Block64*)src;
+#elif BIT64
+            *(long*)dest = *(long*)src;
+            *(long*)(dest + 8) = *(long*)(src + 8);
+            *(long*)(dest + 16) = *(long*)(src + 16);
+            *(long*)(dest + 24) = *(long*)(src + 24);
+            *(long*)(dest + 32) = *(long*)(src + 32);
+            *(long*)(dest + 40) = *(long*)(src + 40);
+            *(long*)(dest + 48) = *(long*)(src + 48);
+            *(long*)(dest + 56) = *(long*)(src + 56);
+#else
+            *(int*)dest = *(int*)src;
+            *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(dest + 8) = *(int*)(src + 8);
+            *(int*)(dest + 12) = *(int*)(src + 12);
+            *(int*)(dest + 16) = *(int*)(src + 16);
+            *(int*)(dest + 20) = *(int*)(src + 20);
+            *(int*)(dest + 24) = *(int*)(src + 24);
+            *(int*)(dest + 28) = *(int*)(src + 28);
+            *(int*)(dest + 32) = *(int*)(src + 32);
+            *(int*)(dest + 36) = *(int*)(src + 36);
+            *(int*)(dest + 40) = *(int*)(src + 40);
+            *(int*)(dest + 44) = *(int*)(src + 44);
+            *(int*)(dest + 48) = *(int*)(src + 48);
+            *(int*)(dest + 52) = *(int*)(src + 52);
+            *(int*)(dest + 56) = *(int*)(src + 56);
+            *(int*)(dest + 60) = *(int*)(src + 60);
+#endif
+            dest += 64;
+            src += 64;
+            n--;
+            if (n != 0) goto MCPY06;
+
+            len %= 64;
+            if (len > 16) goto MCPY00;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
+#elif BIT64
+            *(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
+            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
+#else
+            *(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
+            *(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
+            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
+#endif
+            return;
+
+            PInvoke:
+            _Memmove(dest, src, len);
+        }

        // Non-inlinable wrapper around the QCall that avoids poluting the fast path
@@ -458,5 +416,13 @@ private static unsafe void _Memmove(byte* dest, byte* src, nuint len)
        {
            RuntimeImports.memmove(dest, src, len);
        }
+
+#if HAS_CUSTOM_BLOCKS
+        [StructLayout(LayoutKind.Sequential, Size = 16)]
+        private struct Block16 { }
+
+        [StructLayout(LayoutKind.Sequential, Size = 64)]
+        private struct Block64 { }
+#endif // HAS_CUSTOM_BLOCKS
    }
}
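A quick way to sanity-check the ported routine from the managed side is a sketch like the following (not part of the commit; the MemmoveSmokeTest name, the Main entry point, and the length list are illustrative assumptions). It drives the public Buffer.MemoryCopy API, which CoreLib routes through Memmove, with lengths chosen to straddle the 16-byte, 64-byte, and CopyThreshold boundaries that select the MCPY paths above, and compares the destination against the source buffer. Compile with unsafe code enabled.

// Illustrative smoke test, not part of the commit.
using System;

internal static class MemmoveSmokeTest
{
    private static unsafe void Main()
    {
        // Lengths straddling the 16/64/2048 boundaries used by the new Memmove.
        int[] lengths = { 1, 2, 7, 15, 16, 17, 32, 63, 64, 65, 100, 512, 2048, 2049, 8192 };

        foreach (int len in lengths)
        {
            byte[] src = new byte[len];
            byte[] dst = new byte[len];
            new Random(len).NextBytes(src);

            fixed (byte* pSrc = src)
            fixed (byte* pDst = dst)
            {
                // Public API backed by the Memmove implementation changed above.
                Buffer.MemoryCopy(pSrc, pDst, dst.Length, len);
            }

            for (int i = 0; i < len; i++)
            {
                if (dst[i] != src[i])
                    throw new Exception($"Mismatch at offset {i} for length {len}");
            }
        }

        Console.WriteLine("All copy lengths verified.");
    }
}

The Block16 and Block64 structs exist only so that each fixed-size chunk can be moved with a single struct assignment, which the JIT is intended to lower to wide register copies on the platforms where HAS_CUSTOM_BLOCKS is defined.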
