From feacf6ae5a58c631323a827577392caf46e650fd Mon Sep 17 00:00:00 2001
From: James Ko
Date: Thu, 26 Jan 2017 18:43:15 -0500
Subject: [PATCH 1/3] Rename the variables and the method

---
 src/mscorlib/src/System/Buffer.cs | 288 +++++++++++++++---------------
 src/mscorlib/src/System/Span.cs   |   4 +-
 2 files changed, 147 insertions(+), 145 deletions(-)

diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs
index eee2a81b51fe..3a44f37827a8 100644
--- a/src/mscorlib/src/System/Buffer.cs
+++ b/src/mscorlib/src/System/Buffer.cs
@@ -250,23 +250,25 @@ internal unsafe static void Memcpy(byte* pDest, int destIndex, byte[] src, int s
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
             Debug.Assert(len >= 0, "Negative length in memcopy!");
-            Memmove(dest, src, (uint)len);
+            MemoryCopyCore(dest, src, (uint)len);
         }
 #endif // ARM
 
-        // This method has different signature for x64 and other platforms and is done for performance reasons.
+        // This method has a different signature for x64 and other platforms for performance reasons.
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
-        internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
+        internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuint length)
         {
-            // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
-            // This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway.
+            // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards.
+            // This check can produce false positives for very large lengths if the destination is behind the source.
+            // It is fine because we want to use the P/Invoke path for such large lengths anyway.
 
-            if ((nuint)dest - (nuint)src < len) goto PInvoke;
-
-            // This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do.
-            //
-            // Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as
-            // possible yet and so we have this implementation here for now.
+            if ((nuint)destination - (nuint)source < length)
+            {
+                goto PInvoke;
+            }
+
+            // Currently, the following code seems to be faster than `Unsafe.CopyBlock` in benchmarks. If that is no longer
+            // the case after changes to the JIT, the below code can simply be replaced with a call to that method.
 
             // Note: It's important that this switch handles lengths at least up to 22.
             // See notes below near the main loop for why.
@@ -274,215 +276,215 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
             // The switch will be very fast since it can be implemented using a jump
             // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.
-            switch (len)
+            switch (length)
             {
                 case 0:
                     return;
                 case 1:
-                    *dest = *src;
+                    *destination = *source;
                     return;
                 case 2:
-                    *(short*)dest = *(short*)src;
+                    *(short*)destination = *(short*)source;
                     return;
                 case 3:
-                    *(short*)dest = *(short*)src;
-                    *(dest + 2) = *(src + 2);
+                    *(short*)destination = *(short*)source;
+                    *(destination + 2) = *(source + 2);
                     return;
                 case 4:
-                    *(int*)dest = *(int*)src;
+                    *(int*)destination = *(int*)source;
                     return;
                 case 5:
-                    *(int*)dest = *(int*)src;
-                    *(dest + 4) = *(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(destination + 4) = *(source + 4);
                     return;
                 case 6:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(short*)(destination + 4) = *(short*)(source + 4);
                     return;
                 case 7:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
-                    *(dest + 6) = *(src + 6);
+                    *(int*)destination = *(int*)source;
+                    *(short*)(destination + 4) = *(short*)(source + 4);
+                    *(destination + 6) = *(source + 6);
                     return;
                 case 8:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
                     return;
                 case 9:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(dest + 8) = *(src + 8);
+                    *(destination + 8) = *(source + 8);
                     return;
                 case 10:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
+                    *(short*)(destination + 8) = *(short*)(source + 8);
                     return;
                 case 11:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
-                    *(dest + 10) = *(src + 10);
+                    *(short*)(destination + 8) = *(short*)(source + 8);
+                    *(destination + 10) = *(source + 10);
                     return;
                 case 12:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
                     return;
                 case 13:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(dest + 12) = *(src + 12);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(destination + 12) = *(source + 12);
                     return;
                 case 14:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(short*)(destination + 12) = *(short*)(source + 12);
                     return;
                 case 15:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
-                    *(dest + 14) = *(src + 14);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(short*)(destination + 12) = *(short*)(source + 12);
+                    *(destination + 14) = *(source + 14);
                     return;
                 case 16:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
                     return;
                 case 17:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(dest + 16) = *(src + 16);
+                    *(destination + 16) = *(source + 16);
                     return;
                 case 18:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(short*)(dest + 16) = *(short*)(src + 16);
+                    *(short*)(destination + 16) = *(short*)(source + 16);
                     return;
                 case 19:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(short*)(dest + 16) = *(short*)(src + 16);
-                    *(dest + 18) = *(src + 18);
+                    *(short*)(destination + 16) = *(short*)(source + 16);
+                    *(destination + 18) = *(source + 18);
                     return;
                 case 20:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(int*)(dest + 16) = *(int*)(src + 16);
+                    *(int*)(destination + 16) = *(int*)(source + 16);
                     return;
                 case 21:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(int*)(dest + 16) = *(int*)(src + 16);
-                    *(dest + 20) = *(src + 20);
+                    *(int*)(destination + 16) = *(int*)(source + 16);
+                    *(destination + 20) = *(source + 20);
                     return;
                 case 22:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(int*)(dest + 16) = *(int*)(src + 16);
-                    *(short*)(dest + 20) = *(short*)(src + 20);
+                    *(int*)(destination + 16) = *(int*)(source + 16);
+                    *(short*)(destination + 20) = *(short*)(source + 20);
                     return;
             }
 
             // P/Invoke into the native version for large lengths
-            if (len >= 512) goto PInvoke;
+            if (length >= 512) goto PInvoke;
 
             nuint i = 0; // byte offset at which we're copying
 
-            if (((int)dest & 3) != 0)
+            if (((int)destination & 3) != 0)
             {
-                if (((int)dest & 1) != 0)
+                if (((int)destination & 1) != 0)
                 {
-                    *(dest + i) = *(src + i);
+                    *(destination + i) = *(source + i);
                     i += 1;
-                    if (((int)dest & 2) != 0)
+                    if (((int)destination & 2) != 0)
                         goto IntAligned;
                 }
-                *(short*)(dest + i) = *(short*)(src + i);
+                *(short*)(destination + i) = *(short*)(source + i);
                 i += 2;
             }
 
@@ -490,22 +492,22 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
 #if BIT64
             // On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If
-            // (int)dest % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1
+            // (int)destination % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1
             // bytes to the next aligned address (respectively), so do nothing. On the other hand,
             // if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until
             // we're aligned.
 
             // The thing 1, 2, 3, and 4 have in common that the others don't is that if you
             // subtract one from them, their 3rd lsb will not be set. Hence, the below check.
 
-            if ((((int)dest - 1) & 4) == 0)
+            if ((((int)destination - 1) & 4) == 0)
             {
-                *(int*)(dest + i) = *(int*)(src + i);
+                *(int*)(destination + i) = *(int*)(source + i);
                 i += 4;
             }
 #endif // BIT64
 
-            nuint end = len - 16;
-            len -= i; // lower 4 bits of len represent how many bytes are left *after* the unrolled loop
+            nuint end = length - 16;
+            length -= i; // lower 4 bits of length represent how many bytes are left *after* the unrolled loop
 
             // We know due to the above switch-case that this loop will always run 1 iteration; max
             // bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so
@@ -526,16 +528,16 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
                 // these to use memory addressing operands.
 
                 // So the only cost is a bit of code size, which is made up for by the fact that
-                // we save on writes to dest/src.
+                // we save on writes to destination/source.
 
 #if BIT64
-                *(long*)(dest + i) = *(long*)(src + i);
-                *(long*)(dest + i + 8) = *(long*)(src + i + 8);
+                *(long*)(destination + i) = *(long*)(source + i);
+                *(long*)(destination + i + 8) = *(long*)(source + i + 8);
 #else
-                *(int*)(dest + i) = *(int*)(src + i);
-                *(int*)(dest + i + 4) = *(int*)(src + i + 4);
-                *(int*)(dest + i + 8) = *(int*)(src + i + 8);
-                *(int*)(dest + i + 12) = *(int*)(src + i + 12);
+                *(int*)(destination + i) = *(int*)(source + i);
+                *(int*)(destination + i + 4) = *(int*)(source + i + 4);
+                *(int*)(destination + i + 8) = *(int*)(source + i + 8);
+                *(int*)(destination + i + 12) = *(int*)(source + i + 12);
 #endif
 
                 i = counter;
@@ -545,29 +547,29 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
             }
             while (counter <= end);
 
-            if ((len & 8) != 0)
+            if ((length & 8) != 0)
             {
 #if BIT64
-                *(long*)(dest + i) = *(long*)(src + i);
+                *(long*)(destination + i) = *(long*)(source + i);
 #else
-                *(int*)(dest + i) = *(int*)(src + i);
-                *(int*)(dest + i + 4) = *(int*)(src + i + 4);
+                *(int*)(destination + i) = *(int*)(source + i);
+                *(int*)(destination + i + 4) = *(int*)(source + i + 4);
 #endif
                 i += 8;
             }
 
-            if ((len & 4) != 0)
+            if ((length & 4) != 0)
             {
-                *(int*)(dest + i) = *(int*)(src + i);
+                *(int*)(destination + i) = *(int*)(source + i);
                 i += 4;
             }
 
-            if ((len & 2) != 0)
+            if ((length & 2) != 0)
             {
-                *(short*)(dest + i) = *(short*)(src + i);
+                *(short*)(destination + i) = *(short*)(source + i);
                 i += 2;
             }
 
-            if ((len & 1) != 0)
+            if ((length & 1) != 0)
             {
-                *(dest + i) = *(src + i);
+                *(destination + i) = *(source + i);
                 // We're not using i after this, so not needed
                 // i += 1;
             }
@@ -575,7 +577,7 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
             return;
 
             PInvoke:
-            _Memmove(dest, src, len);
+            _Memmove(destination, source, length);
         }
@@ -603,7 +605,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, long desti
             {
                 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy);
             }
-            Memmove((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy));
+            MemoryCopyCore((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy));
         }
@@ -618,9 +620,9 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
                 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy);
             }
 #if BIT64
-            Memmove((byte*)destination, (byte*)source, sourceBytesToCopy);
+            MemoryCopyCore((byte*)destination, (byte*)source, sourceBytesToCopy);
 #else // BIT64
-            Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
+            MemoryCopyCore((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
 #endif // BIT64
         }
     }

diff --git a/src/mscorlib/src/System/Span.cs b/src/mscorlib/src/System/Span.cs
index d7f8dc5c85d6..bfd1acbd5955 100644
--- a/src/mscorlib/src/System/Span.cs
+++ b/src/mscorlib/src/System/Span.cs
@@ -455,9 +455,9 @@ internal static unsafe void CopyTo<T>(ref T destination, ref T source, int eleme
             fixed (byte* pSource = &Unsafe.As<T, byte>(ref source))
             {
 #if BIT64
-                Buffer.Memmove(pDestination, pSource, (ulong)elementsCount * (ulong)Unsafe.SizeOf<T>());
+                Buffer.MemoryCopyCore(pDestination, pSource, (ulong)elementsCount * (ulong)Unsafe.SizeOf<T>());
 #else
-                Buffer.Memmove(pDestination, pSource, (uint)elementsCount * (uint)Unsafe.SizeOf<T>());
+                Buffer.MemoryCopyCore(pDestination, pSource, (uint)elementsCount * (uint)Unsafe.SizeOf<T>());
 #endif
             }
         }
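For illustration, here is a minimal standalone sketch of the overlap test documented in PATCH 1/3 above. It is not part of the patches; the class, the helper name, and the demo buffer are invented. With unsigned arithmetic, `destination - source < length` is true exactly when `destination` lies inside `[source, source + length)`, the one case where a simple forward copy would read bytes it has already overwritten; if `destination` is behind `source`, the subtraction wraps around to a huge value and the check fails.

using System;

static class OverlapCheckSketch
{
    // True when a forward copy would be unsafe because destination starts inside
    // [source, source + length). A false positive is only possible for lengths so
    // large that the native (P/Invoke) path would be taken anyway.
    private static unsafe bool NeedsBackwardCopy(byte* destination, byte* source, ulong length)
        => (ulong)destination - (ulong)source < length;

    private static unsafe void Main()
    {
        byte[] buffer = new byte[64];
        fixed (byte* p = buffer)
        {
            Console.WriteLine(NeedsBackwardCopy(p + 8, p, 32));  // True: destination is 8 bytes ahead of source, ranges overlap
            Console.WriteLine(NeedsBackwardCopy(p, p + 8, 32));  // False: destination is behind source, forward copy is safe
            Console.WriteLine(NeedsBackwardCopy(p + 48, p, 32)); // False: destination starts past source + length
        }
    }
}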
From 6a2ed86496daf54287001341673f539d1294dbe3 Mon Sep 17 00:00:00 2001
From: James Ko
Date: Thu, 26 Jan 2017 19:57:49 -0500
Subject: [PATCH 2/3] Finish porting code, with ARM64 not optimized for

---
 src/mscorlib/src/System/Buffer.cs | 263 ++++++++++--------------------
 1 file changed, 84 insertions(+), 179 deletions(-)

diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs
index 3a44f37827a8..53808d557d3f 100644
--- a/src/mscorlib/src/System/Buffer.cs
+++ b/src/mscorlib/src/System/Buffer.cs
@@ -48,7 +48,7 @@ internal static extern void InternalBlockCopy(Array src, int srcOffsetBytes,
         //
         internal unsafe static int IndexOfByte(byte* src, byte value, int index, int count)
         {
-            Debug.Assert(src != null, "src should not be null");
+            Debug.Assert(src != null);
 
             byte* pByte = src + index;
 
@@ -258,9 +258,18 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
         internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuint length)
         {
+            const nuint PInvokeThreshold = 512;
+#if AMD64
+            const nuint CopyAlignment = 16; // SIMD is enabled for AMD64, so align on a 16-byte boundary
+            const nuint BytesPerIteration = 64;
+#else
+            const nuint CopyAlignment = 4; // Align on a 4-byte boundary
+            const nuint BytesPerIteration = 16;
+#endif
+
             // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards.
             // This check can produce false positives for very large lengths if the destination is behind the source.
-            // It is fine because we want to use the P/Invoke path for such large lengths anyway.
+            // It is fine because we would take the P/Invoke path later for such large lengths anyway.
 
             if ((nuint)destination - (nuint)source < length)
             {
@@ -270,12 +279,8 @@ internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuin
             // Currently, the following code seems to be faster than `Unsafe.CopyBlock` in benchmarks. If that is no longer
             // the case after changes to the JIT, the below code can simply be replaced with a call to that method.
 
-            // Note: It's important that this switch handles lengths at least up to 22.
-            // See notes below near the main loop for why.
-
-            // The switch will be very fast since it can be implemented using a jump
-            // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.
-
+            // This switch will be fast since it is compiled into a jump table in assembly.
+            // See http://stackoverflow.com/a/449297/4077294 for more info.
             switch (length)
             {
                 case 0:
@@ -382,198 +387,89 @@ internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuin
                     *(short*)(destination + 12) = *(short*)(source + 12);
                     *(destination + 14) = *(source + 14);
                     return;
-                case 16:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    return;
-                case 17:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(destination + 16) = *(source + 16);
-                    return;
-                case 18:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(short*)(destination + 16) = *(short*)(source + 16);
-                    return;
-                case 19:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(short*)(destination + 16) = *(short*)(source + 16);
-                    *(destination + 18) = *(source + 18);
-                    return;
-                case 20:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(int*)(destination + 16) = *(int*)(source + 16);
-                    return;
-                case 21:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(int*)(destination + 16) = *(int*)(source + 16);
-                    *(destination + 20) = *(source + 20);
-                    return;
-                case 22:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(int*)(destination + 16) = *(int*)(source + 16);
-                    *(short*)(destination + 20) = *(short*)(source + 20);
-                    return;
             }
 
             // P/Invoke into the native version for large lengths
-            if (length >= 512) goto PInvoke;
-
-            nuint i = 0; // byte offset at which we're copying
-
-            if (((int)destination & 3) != 0)
-            {
-                if (((int)destination & 1) != 0)
-                {
-                    *(destination + i) = *(source + i);
-                    i += 1;
-                    if (((int)destination & 2) != 0)
-                        goto IntAligned;
-                }
-                *(short*)(destination + i) = *(short*)(source + i);
-                i += 2;
-            }
-
-            IntAligned:
-
-#if BIT64
-            // On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If
-            // (int)destination % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1
-            // bytes to the next aligned address (respectively), so do nothing. On the other hand,
-            // if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until
-            // we're aligned.
-
-            // The thing 1, 2, 3, and 4 have in common that the others don't is that if you
-            // subtract one from them, their 3rd lsb will not be set. Hence, the below check.
-
-            if ((((int)destination - 1) & 4) == 0)
+            if (length > PInvokeThreshold)
             {
-                *(int*)(destination + i) = *(int*)(source + i);
-                i += 4;
+                goto PInvoke;
             }
-#endif // BIT64
-
-            nuint end = length - 16;
-            length -= i; // lower 4 bits of length represent how many bytes are left *after* the unrolled loop
-
-            // We know due to the above switch-case that this loop will always run 1 iteration; max
-            // bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so
-            // the switch handles lengths 0-22.
-            Debug.Assert(end >= 7 && i <= end);
-
-            // This is separated out into a different variable, so the i + 16 addition can be
-            // performed at the start of the pipeline and the loop condition does not have
-            // a dependency on the writes.
-            nuint counter;
-
-            do
-            {
-                counter = i + 16;
-
-                // This loop looks very costly since there appear to be a bunch of temporary values
-                // being created with the adds, but the jit (for x86 anyways) will convert each of
-                // these to use memory addressing operands.
+
+            // We've already handled lengths 0-15, so we can write at least 16 bytes.
+            // This calculates the offset of the next aligned address we know it's okay to write up to.
+            Debug.Assert(length >= 16);
+            nuint offset = 16 - ((nuint)destination % CopyAlignment);
 
-                // So the only cost is a bit of code size, which is made up for by the fact that
-                // we save on writes to destination/source.
+            Debug.Assert(offset > 0 && offset <= 16);
+            Debug.Assert((nuint)(destination + offset) % CopyAlignment == 0);
 
-#if BIT64
-                *(long*)(destination + i) = *(long*)(source + i);
-                *(long*)(destination + i + 8) = *(long*)(source + i + 8);
+#if AMD64
+            // SIMD is enabled for AMD64. Take advantage of that and use movdqu
+            *(Block16*)destination = *(Block16*)source;
 #else
-                *(int*)(destination + i) = *(int*)(source + i);
-                *(int*)(destination + i + 4) = *(int*)(source + i + 4);
-                *(int*)(destination + i + 8) = *(int*)(source + i + 8);
-                *(int*)(destination + i + 12) = *(int*)(source + i + 12);
+            // Make one unaligned 4-byte write, then 3 aligned 4-byte writes.
+            *(int*)destination = *(int*)source;
+            *(int*)(destination + offset - 12) = *(int*)(source + offset - 12);
+            *(int*)(destination + offset - 8) = *(int*)(source + offset - 8);
+            *(int*)(destination + offset - 4) = *(int*)(source + offset - 4);
 #endif
-                i = counter;
-
-                // See notes above for why this wasn't used instead
-                // i += 16;
+            // Catch unsigned overflow before we do the subtraction.
+            if (length < BytesPerIteration)
+            {
+                goto AfterUnrolledCopy;
             }
-            while (counter <= end);
 
-            if ((length & 8) != 0)
+            nuint endOffset = length - BytesPerIteration;
+
+            while (offset <= endOffset)
             {
-#if BIT64
-                *(long*)(destination + i) = *(long*)(source + i);
+#if AMD64
+                // Write 64 bytes at a time, taking advantage of the xmm registers on AMD64
+                // This will be translated to 4 movdqus (maybe movdqas in the future, see dotnet/coreclr#2725)
+                *(Block64*)(destination + offset) = *(Block64*)(source + offset);
 #else
-                *(int*)(destination + i) = *(int*)(source + i);
-                *(int*)(destination + i + 4) = *(int*)(source + i + 4);
+                // Write 16 bytes at a time, via 4 4-byte writes.
+                *(int*)(destination + offset) = *(int*)(source + offset);
+                *(int*)(destination + offset + 4) = *(int*)(source + offset + 4);
+                *(int*)(destination + offset + 8) = *(int*)(source + offset + 8);
+                *(int*)(destination + offset + 12) = *(int*)(source + offset + 12);
 #endif
-                i += 8;
-            }
-
-            if ((length & 4) != 0)
-            {
-                *(int*)(destination + i) = *(int*)(source + i);
-                i += 4;
+                offset += BytesPerIteration;
             }
 
-            if ((length & 2) != 0)
+
+            AfterUnrolledCopy:
+
+            Debug.Assert((nuint)(destination + offset) % CopyAlignment == 0);
+
+            nuint remainingLength = length - offset;
+            Debug.Assert(remainingLength < BytesPerIteration);
+
+            // Finish up the copy by dividing it into blocks of smaller powers of 2.
+            // The bits of `remainingLength` tell us how it can be expressed as a sum of powers of 2.
+
+#if AMD64
+            if ((remainingLength & 32) != 0)
             {
-                *(short*)(destination + i) = *(short*)(source + i);
-                i += 2;
+                *(Block32*)(destination + offset) = *(Block32*)(source + offset);
+                offset += 32;
             }
 
-            if ((length & 1) != 0)
+
+            if ((remainingLength & 16) != 0)
             {
-                *(destination + i) = *(source + i);
-                // We're not using i after this, so not needed
-                // i += 1;
+                *(Block16*)(destination + offset) = *(Block16*)(source + offset);
+                offset += 16;
             }
 
+            // Make one potentially unaligned write and quit.
+            *(Block16*)(destination + length - 16) = *(Block16*)(source + length - 16);
+#else
+            // Make 3 aligned 4-byte writes, then one unaligned 4-byte write.
+            *(int*)(destination + offset) = *(int*)(source + offset);
+            *(int*)(destination + offset + 4) = *(int*)(source + offset + 4);
+            *(int*)(destination + offset + 8) = *(int*)(source + offset + 8);
+            *(int*)(destination + length - 4) = *(int*)(source + length - 4);
+#endif
             return;
 
             PInvoke:
@@ -625,5 +521,14 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
             MemoryCopyCore((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
 #endif // BIT64
         }
+
+        [StructLayout(LayoutKind.Sequential, Size = 16)]
+        private struct Block16 { }
+
+        [StructLayout(LayoutKind.Sequential, Size = 32)]
+        private struct Block32 { }
+
+        [StructLayout(LayoutKind.Sequential, Size = 64)]
+        private struct Block64 { }
     }
 }

From 884d995078cfcd344250ce876eb5329ee5fa6177 Mon Sep 17 00:00:00 2001
From: James Ko
Date: Thu, 26 Jan 2017 20:04:19 -0500
Subject: [PATCH 3/3] Put back the PInvokeThreshold to 1024

---
 src/mscorlib/src/System/Buffer.cs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs
index 53808d557d3f..57390bba3414 100644
--- a/src/mscorlib/src/System/Buffer.cs
+++ b/src/mscorlib/src/System/Buffer.cs
@@ -258,11 +258,12 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
         internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuint length)
         {
-            const nuint PInvokeThreshold = 512;
 #if AMD64
+            const nuint PInvokeThreshold = 1024;
             const nuint CopyAlignment = 16; // SIMD is enabled for AMD64, so align on a 16-byte boundary
             const nuint BytesPerIteration = 64;
 #else
+            const nuint PInvokeThreshold = 512;
             const nuint CopyAlignment = 4; // Align on a 4-byte boundary
             const nuint BytesPerIteration = 16;
 #endif
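For reference, a small worked sketch of the alignment step introduced in PATCH 2/3 (illustrative only and not part of the patches; the addresses and the printout are invented, and the constant mirrors the AMD64 value). `16 - (destination % CopyAlignment)` yields an offset in (0, 16] at which `destination + offset` is aligned; because the unconditional 16-byte head write covers bytes [0, 16) and the offset never exceeds 16, every byte before the offset has already been copied when the aligned loop starts.

using System;

static class AlignmentOffsetSketch
{
    private const ulong CopyAlignment = 16; // mirrors the AMD64 constant in the patch

    private static void Main()
    {
        // Hypothetical destination addresses with different low-order bits.
        ulong[] destinations = { 0x1000, 0x1001, 0x1007, 0x100B, 0x100F };

        foreach (ulong destination in destinations)
        {
            // Same computation as the patch: offset of the first aligned byte to resume from.
            ulong offset = 16 - (destination % CopyAlignment);

            // The head write covers [0, 16) and offset <= 16, so [0, offset) is already copied.
            Console.WriteLine($"destination=0x{destination:X} offset={offset} aligned={(destination + offset) % CopyAlignment == 0}");
        }
    }
}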
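Finally, a minimal sketch of the Block16/Block32/Block64 idea the patch relies on (again illustrative, not part of the patches; the class and array names are invented): an empty struct with an explicit StructLayout size, assigned through a typed pointer, copies that many bytes in one assignment, which is what allows the JIT to use wide registers for the move. Whether it actually emits movdqu/movdqa is a JIT implementation detail, as the patch's own comment notes.

using System;
using System.Runtime.InteropServices;

static class BlockCopySketch
{
    // A 16-byte unit of copy: assigning one of these through a pointer moves 16 bytes.
    [StructLayout(LayoutKind.Sequential, Size = 16)]
    private struct Block16 { }

    private static unsafe void Main()
    {
        byte[] source = new byte[16];
        byte[] destination = new byte[16];
        for (int i = 0; i < source.Length; i++) source[i] = (byte)i;

        fixed (byte* src = source)
        fixed (byte* dst = destination)
        {
            // One struct assignment copies all 16 bytes at once.
            *(Block16*)dst = *(Block16*)src;
        }

        Console.WriteLine(string.Join(",", destination)); // 0,1,2,...,15
    }
}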