Port optimization in Buffer.Memmove from CoreCLR
This is just a copy of changes made in dotnet/coreclr#9786
sergiy-k committed May 8, 2017
1 parent 4bb8337 commit 138fe5d
Showing 1 changed file with 161 additions and 195 deletions.
356 changes: 161 additions & 195 deletions src/System.Private.CoreLib/src/System/Buffer.cs
@@ -2,6 +2,10 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

+#if AMD64 || (BIT32 && !ARM)
+#define HAS_CUSTOM_BLOCKS
+#endif
+
using System;
using System.Runtime;
using System.Diagnostics;
@@ -233,222 +237,176 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len)
            Memmove(dest, src, (nuint)len);
        }

-        internal static unsafe void Memmove(byte* dest, byte* src, nuint len)
-        {
-            // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
-            // This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway.
-            if ((nuint)dest - (nuint)src < len)
-            {
-                _Memmove(dest, src, len);
-                return;
-            }
-
-            //
-            // This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do.
-            //
-
-#if ALIGN_ACCESS
-#error Needs porting for ALIGN_ACCESS (https://github.com/dotnet/corert/issues/430)
-#else // ALIGN_ACCESS
-            switch (len)
-            {
-                case 0:
-                    return;
-                case 1:
-                    *dest = *src;
-                    return;
-                case 2:
-                    *(short*)dest = *(short*)src;
-                    return;
-                case 3:
-                    *(short*)dest = *(short*)src;
-                    *(dest + 2) = *(src + 2);
-                    return;
-                case 4:
-                    *(int*)dest = *(int*)src;
-                    return;
-                case 5:
-                    *(int*)dest = *(int*)src;
-                    *(dest + 4) = *(src + 4);
-                    return;
-                case 6:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
-                    return;
-                case 7:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
-                    *(dest + 6) = *(src + 6);
-                    return;
-                case 8:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    return;
-                case 9:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(dest + 8) = *(src + 8);
-                    return;
-                case 10:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
-                    return;
-                case 11:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
-                    *(dest + 10) = *(src + 10);
-                    return;
-                case 12:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    return;
-                case 13:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(dest + 12) = *(src + 12);
-                    return;
-                case 14:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
-                    return;
-                case 15:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-#endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
-                    *(dest + 14) = *(src + 14);
-                    return;
-                case 16:
-#if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
-#else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
-#endif
-                    return;
-                default:
-                    break;
-            }
-
-            // P/Invoke into the native version for large lengths.
-            if (len >= 200)
-            {
-                _Memmove(dest, src, len);
-                return;
-            }
-
-            if (((int)dest & 3) != 0)
-            {
-                if (((int)dest & 1) != 0)
-                {
-                    *dest = *src;
-                    src++;
-                    dest++;
-                    len--;
-                    if (((int)dest & 2) == 0)
-                        goto Aligned;
-                }
-                *(short*)dest = *(short*)src;
-                src += 2;
-                dest += 2;
-                len -= 2;
-                Aligned:;
-            }
-
-#if BIT64
-            if (((int)dest & 4) != 0)
-            {
-                *(int*)dest = *(int*)src;
-                src += 4;
-                dest += 4;
-                len -= 4;
-            }
-#endif
-
-            nuint count = len / 16;
-            while (count > 0)
-            {
-#if BIT64
-                ((long*)dest)[0] = ((long*)src)[0];
-                ((long*)dest)[1] = ((long*)src)[1];
-#else
-                ((int*)dest)[0] = ((int*)src)[0];
-                ((int*)dest)[1] = ((int*)src)[1];
-                ((int*)dest)[2] = ((int*)src)[2];
-                ((int*)dest)[3] = ((int*)src)[3];
-#endif
-                dest += 16;
-                src += 16;
-                count--;
-            }
-
-            if ((len & 8) != 0)
-            {
-#if BIT64
-                ((long*)dest)[0] = ((long*)src)[0];
-#else
-                ((int*)dest)[0] = ((int*)src)[0];
-                ((int*)dest)[1] = ((int*)src)[1];
-#endif
-                dest += 8;
-                src += 8;
-            }
-            if ((len & 4) != 0)
-            {
-                ((int*)dest)[0] = ((int*)src)[0];
-                dest += 4;
-                src += 4;
-            }
-            if ((len & 2) != 0)
-            {
-                ((short*)dest)[0] = ((short*)src)[0];
-                dest += 2;
-                src += 2;
-            }
-            if ((len & 1) != 0)
-                *dest = *src;
-#endif // ALIGN_ACCESS
-        }
+        // This method has different signature for x64 and other platforms and is done for performance reasons.
+        internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
+        {
+#if AMD64 || (BIT32 && !ARM)
+            const nuint CopyThreshold = 2048;
+#else
+            const nuint CopyThreshold = 512;
+#endif // AMD64 || (BIT32 && !ARM)
+
+            // P/Invoke into the native version when the buffers are overlapping.
+
+            if (((nuint)dest - (nuint)src < len) || ((nuint)src - (nuint)dest < len)) goto PInvoke;
+
+            byte* srcEnd = src + len;
+            byte* destEnd = dest + len;
+
+            if (len <= 16) goto MCPY02;
+            if (len > 64) goto MCPY05;
+
+            MCPY00:
+            // Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
+            Debug.Assert(len > 16 && len <= 64);
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)dest = *(Block16*)src;                   // [0,16]
+#elif BIT64
+            *(long*)dest = *(long*)src;
+            *(long*)(dest + 8) = *(long*)(src + 8);             // [0,16]
+#else
+            *(int*)dest = *(int*)src;
+            *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(dest + 8) = *(int*)(src + 8);
+            *(int*)(dest + 12) = *(int*)(src + 12);             // [0,16]
+#endif
+            if (len <= 32) goto MCPY01;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(dest + 16) = *(Block16*)(src + 16);     // [0,32]
+#elif BIT64
+            *(long*)(dest + 16) = *(long*)(src + 16);
+            *(long*)(dest + 24) = *(long*)(src + 24);           // [0,32]
+#else
+            *(int*)(dest + 16) = *(int*)(src + 16);
+            *(int*)(dest + 20) = *(int*)(src + 20);
+            *(int*)(dest + 24) = *(int*)(src + 24);
+            *(int*)(dest + 28) = *(int*)(src + 28);             // [0,32]
+#endif
+            if (len <= 48) goto MCPY01;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(dest + 32) = *(Block16*)(src + 32);     // [0,48]
+#elif BIT64
+            *(long*)(dest + 32) = *(long*)(src + 32);
+            *(long*)(dest + 40) = *(long*)(src + 40);           // [0,48]
+#else
+            *(int*)(dest + 32) = *(int*)(src + 32);
+            *(int*)(dest + 36) = *(int*)(src + 36);
+            *(int*)(dest + 40) = *(int*)(src + 40);
+            *(int*)(dest + 44) = *(int*)(src + 44);             // [0,48]
+#endif
+
+            MCPY01:
+            // Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
+            Debug.Assert(len > 16 && len <= 64);
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
+#elif BIT64
+            *(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
+            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
+#else
+            *(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
+            *(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
+            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
+#endif
+            return;
+
+            MCPY02:
+            // Copy the first 8 bytes and then unconditionally copy the last 8 bytes and return.
+            if ((len & 24) == 0) goto MCPY03;
+            Debug.Assert(len >= 8 && len <= 16);
+#if BIT64
+            *(long*)dest = *(long*)src;
+            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
+#else
+            *(int*)dest = *(int*)src;
+            *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
+#endif
+            return;
+
+            MCPY03:
+            // Copy the first 4 bytes and then unconditionally copy the last 4 bytes and return.
+            if ((len & 4) == 0) goto MCPY04;
+            Debug.Assert(len >= 4 && len < 8);
+            *(int*)dest = *(int*)src;
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
+            return;
+
+            MCPY04:
+            // Copy the first byte. For pending bytes, do an unconditionally copy of the last 2 bytes and return.
+            Debug.Assert(len < 4);
+            if (len == 0) return;
+            *dest = *src;
+            if ((len & 2) == 0) return;
+            *(short*)(destEnd - 2) = *(short*)(srcEnd - 2);
+            return;
+
+            MCPY05:
+            // PInvoke to the native version when the copy length exceeds the threshold.
+            if (len > CopyThreshold)
+            {
+                goto PInvoke;
+            }
+            // Copy 64-bytes at a time until the remainder is less than 64.
+            // If remainder is greater than 16 bytes, then jump to MCPY00. Otherwise, unconditionally copy the last 16 bytes and return.
+            Debug.Assert(len > 64 && len <= CopyThreshold);
+            nuint n = len >> 6;
+
+            MCPY06:
+#if HAS_CUSTOM_BLOCKS
+            *(Block64*)dest = *(Block64*)src;
+#elif BIT64
+            *(long*)dest = *(long*)src;
+            *(long*)(dest + 8) = *(long*)(src + 8);
+            *(long*)(dest + 16) = *(long*)(src + 16);
+            *(long*)(dest + 24) = *(long*)(src + 24);
+            *(long*)(dest + 32) = *(long*)(src + 32);
+            *(long*)(dest + 40) = *(long*)(src + 40);
+            *(long*)(dest + 48) = *(long*)(src + 48);
+            *(long*)(dest + 56) = *(long*)(src + 56);
+#else
+            *(int*)dest = *(int*)src;
+            *(int*)(dest + 4) = *(int*)(src + 4);
+            *(int*)(dest + 8) = *(int*)(src + 8);
+            *(int*)(dest + 12) = *(int*)(src + 12);
+            *(int*)(dest + 16) = *(int*)(src + 16);
+            *(int*)(dest + 20) = *(int*)(src + 20);
+            *(int*)(dest + 24) = *(int*)(src + 24);
+            *(int*)(dest + 28) = *(int*)(src + 28);
+            *(int*)(dest + 32) = *(int*)(src + 32);
+            *(int*)(dest + 36) = *(int*)(src + 36);
+            *(int*)(dest + 40) = *(int*)(src + 40);
+            *(int*)(dest + 44) = *(int*)(src + 44);
+            *(int*)(dest + 48) = *(int*)(src + 48);
+            *(int*)(dest + 52) = *(int*)(src + 52);
+            *(int*)(dest + 56) = *(int*)(src + 56);
+            *(int*)(dest + 60) = *(int*)(src + 60);
+#endif
+            dest += 64;
+            src += 64;
+            n--;
+            if (n != 0) goto MCPY06;
+
+            len %= 64;
+            if (len > 16) goto MCPY00;
+#if HAS_CUSTOM_BLOCKS
+            *(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
+#elif BIT64
+            *(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
+            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
+#else
+            *(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
+            *(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
+            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
+            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
+#endif
+            return;
+
+            PInvoke:
+            _Memmove(dest, src, len);
+        }

        // Non-inlinable wrapper around the QCall that avoids poluting the fast path
@@ -458,5 +416,13 @@ private static unsafe void _Memmove(byte* dest, byte* src, nuint len)
        {
            RuntimeImports.memmove(dest, src, len);
        }
+
+#if HAS_CUSTOM_BLOCKS
+        [StructLayout(LayoutKind.Sequential, Size = 16)]
+        private struct Block16 { }
+
+        [StructLayout(LayoutKind.Sequential, Size = 64)]
+        private struct Block64 { }
+#endif // HAS_CUSTOM_BLOCKS
    }
}
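A quick way to sanity-check the ported routine from the managed side is a sketch like the following (not part of the commit; the MemmoveSmokeTest name, the Main entry point, and the length list are illustrative assumptions). It drives the public Buffer.MemoryCopy API, which CoreLib routes through Memmove, with lengths chosen to straddle the 16-byte, 64-byte, and CopyThreshold boundaries that select the MCPY paths above, and compares the destination against the source buffer. Compile with unsafe code enabled.

// Illustrative smoke test, not part of the commit.
using System;

internal static class MemmoveSmokeTest
{
    private static unsafe void Main()
    {
        // Lengths straddling the 16/64/2048 boundaries used by the new Memmove.
        int[] lengths = { 1, 2, 7, 15, 16, 17, 32, 63, 64, 65, 100, 512, 2048, 2049, 8192 };

        foreach (int len in lengths)
        {
            byte[] src = new byte[len];
            byte[] dst = new byte[len];
            new Random(len).NextBytes(src);

            fixed (byte* pSrc = src)
            fixed (byte* pDst = dst)
            {
                // Public API backed by the Memmove implementation changed above.
                Buffer.MemoryCopy(pSrc, pDst, dst.Length, len);
            }

            for (int i = 0; i < len; i++)
            {
                if (dst[i] != src[i])
                    throw new Exception($"Mismatch at offset {i} for length {len}");
            }
        }

        Console.WriteLine("All copy lengths verified.");
    }
}

The Block16 and Block64 structs exist only so that each fixed-size chunk can be moved with a single struct assignment, which the JIT is intended to lower to wide register copies on the platforms where HAS_CUSTOM_BLOCKS is defined.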
