From feacf6ae5a58c631323a827577392caf46e650fd Mon Sep 17 00:00:00 2001
From: James Ko
Date: Thu, 26 Jan 2017 18:43:15 -0500
Subject: [PATCH 1/3] Rename the variables and the method

---
 src/mscorlib/src/System/Buffer.cs | 288 +++++++++++++++---------------
 src/mscorlib/src/System/Span.cs   |   4 +-
 2 files changed, 147 insertions(+), 145 deletions(-)

diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs
index eee2a81b51fe..3a44f37827a8 100644
--- a/src/mscorlib/src/System/Buffer.cs
+++ b/src/mscorlib/src/System/Buffer.cs
@@ -250,23 +250,25 @@ internal unsafe static void Memcpy(byte* pDest, int destIndex, byte[] src, int s
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
             Debug.Assert(len >= 0, "Negative length in memcopy!");
-            Memmove(dest, src, (uint)len);
+            MemoryCopyCore(dest, src, (uint)len);
         }
 #endif // ARM
 
-        // This method has different signature for x64 and other platforms and is done for performance reasons.
+        // This method has a different signature for x64 and other platforms for performance reasons.
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
-        internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
+        internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuint length)
         {
-            // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards
-            // This check can produce false positives for lengths greater than Int32.MaxInt. It is fine because we want to use PInvoke path for the large lengths anyway.
+            // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards.
+            // This check can produce false positives for very large lengths if the destination is behind the source.
+            // It is fine because we want to use the P/Invoke path for such large lengths anyway.
 
-            if ((nuint)dest - (nuint)src < len) goto PInvoke;
-
-            // This is portable version of memcpy. It mirrors what the hand optimized assembly versions of memcpy typically do.
-            //
-            // Ideally, we would just use the cpblk IL instruction here. Unfortunately, cpblk IL instruction is not as efficient as
-            // possible yet and so we have this implementation here for now.
+            if ((nuint)destination - (nuint)source < length)
+            {
+                goto PInvoke;
+            }
+
+            // Currently, the following code seems to be faster than `Unsafe.CopyBlock` in benchmarks. If that is no longer
+            // the case after changes to the JIT, the below code can simply be replaced with a call to that method.
 
             // Note: It's important that this switch handles lengths at least up to 22.
             // See notes below near the main loop for why.
@@ -274,215 +276,215 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
             // The switch will be very fast since it can be implemented using a jump
             // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.
-            switch (len)
+            switch (length)
             {
                 case 0:
                     return;
                 case 1:
-                    *dest = *src;
+                    *destination = *source;
                     return;
                 case 2:
-                    *(short*)dest = *(short*)src;
+                    *(short*)destination = *(short*)source;
                     return;
                 case 3:
-                    *(short*)dest = *(short*)src;
-                    *(dest + 2) = *(src + 2);
+                    *(short*)destination = *(short*)source;
+                    *(destination + 2) = *(source + 2);
                     return;
                 case 4:
-                    *(int*)dest = *(int*)src;
+                    *(int*)destination = *(int*)source;
                     return;
                 case 5:
-                    *(int*)dest = *(int*)src;
-                    *(dest + 4) = *(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(destination + 4) = *(source + 4);
                     return;
                 case 6:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(short*)(destination + 4) = *(short*)(source + 4);
                     return;
                 case 7:
-                    *(int*)dest = *(int*)src;
-                    *(short*)(dest + 4) = *(short*)(src + 4);
-                    *(dest + 6) = *(src + 6);
+                    *(int*)destination = *(int*)source;
+                    *(short*)(destination + 4) = *(short*)(source + 4);
+                    *(destination + 6) = *(source + 6);
                     return;
                 case 8:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
                     return;
                 case 9:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(dest + 8) = *(src + 8);
+                    *(destination + 8) = *(source + 8);
                     return;
                 case 10:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
+                    *(short*)(destination + 8) = *(short*)(source + 8);
                     return;
                 case 11:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(short*)(dest + 8) = *(short*)(src + 8);
-                    *(dest + 10) = *(src + 10);
+                    *(short*)(destination + 8) = *(short*)(source + 8);
+                    *(destination + 10) = *(source + 10);
                     return;
                 case 12:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
                     return;
                 case 13:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(dest + 12) = *(src + 12);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(destination + 12) = *(source + 12);
                     return;
                 case 14:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(short*)(destination + 12) = *(short*)(source + 12);
                     return;
                 case 15:
 #if BIT64
-                    *(long*)dest = *(long*)src;
+                    *(long*)destination = *(long*)source;
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
 #endif
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(short*)(dest + 12) = *(short*)(src + 12);
-                    *(dest + 14) = *(src + 14);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(short*)(destination + 12) = *(short*)(source + 12);
+                    *(destination + 14) = *(source + 14);
                     return;
                 case 16:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
                     return;
                 case 17:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(dest + 16) = *(src + 16);
+                    *(destination + 16) = *(source + 16);
                     return;
                 case 18:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(short*)(dest + 16) = *(short*)(src + 16);
+                    *(short*)(destination + 16) = *(short*)(source + 16);
                     return;
                 case 19:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(short*)(dest + 16) = *(short*)(src + 16);
-                    *(dest + 18) = *(src + 18);
+                    *(short*)(destination + 16) = *(short*)(source + 16);
+                    *(destination + 18) = *(source + 18);
                     return;
                 case 20:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(int*)(dest + 16) = *(int*)(src + 16);
+                    *(int*)(destination + 16) = *(int*)(source + 16);
                     return;
                 case 21:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(int*)(dest + 16) = *(int*)(src + 16);
-                    *(dest + 20) = *(src + 20);
+                    *(int*)(destination + 16) = *(int*)(source + 16);
+                    *(destination + 20) = *(source + 20);
                     return;
                 case 22:
 #if BIT64
-                    *(long*)dest = *(long*)src;
-                    *(long*)(dest + 8) = *(long*)(src + 8);
+                    *(long*)destination = *(long*)source;
+                    *(long*)(destination + 8) = *(long*)(source + 8);
 #else
-                    *(int*)dest = *(int*)src;
-                    *(int*)(dest + 4) = *(int*)(src + 4);
-                    *(int*)(dest + 8) = *(int*)(src + 8);
-                    *(int*)(dest + 12) = *(int*)(src + 12);
+                    *(int*)destination = *(int*)source;
+                    *(int*)(destination + 4) = *(int*)(source + 4);
+                    *(int*)(destination + 8) = *(int*)(source + 8);
+                    *(int*)(destination + 12) = *(int*)(source + 12);
 #endif
-                    *(int*)(dest + 16) = *(int*)(src + 16);
-                    *(short*)(dest + 20) = *(short*)(src + 20);
+                    *(int*)(destination + 16) = *(int*)(source + 16);
+                    *(short*)(destination + 20) = *(short*)(source + 20);
                     return;
             }
 
             // P/Invoke into the native version for large lengths
-            if (len >= 512) goto PInvoke;
+            if (length >= 512) goto PInvoke;
 
             nuint i = 0; // byte offset at which we're copying
 
-            if (((int)dest & 3) != 0)
+            if (((int)destination & 3) != 0)
             {
-                if (((int)dest & 1) != 0)
+                if (((int)destination & 1) != 0)
                 {
-                    *(dest + i) = *(src + i);
+                    *(destination + i) = *(source + i);
                     i += 1;
-                    if (((int)dest & 2) != 0)
+                    if (((int)destination & 2) != 0)
                         goto IntAligned;
                 }
-                *(short*)(dest + i) = *(short*)(src + i);
+                *(short*)(destination + i) = *(short*)(source + i);
                 i += 2;
             }
 
@@ -490,22 +492,22 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
 #if BIT64
             // On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If
-            // (int)dest % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1
+            // (int)destination % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1
             // bytes to the next aligned address (respectively), so do nothing. On the other hand,
             // if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until
             // we're aligned.
 
             // The thing 1, 2, 3, and 4 have in common that the others don't is that if you
             // subtract one from them, their 3rd lsb will not be set. Hence, the below check.
 
-            if ((((int)dest - 1) & 4) == 0)
+            if ((((int)destination - 1) & 4) == 0)
             {
-                *(int*)(dest + i) = *(int*)(src + i);
+                *(int*)(destination + i) = *(int*)(source + i);
                 i += 4;
             }
 #endif // BIT64
 
-            nuint end = len - 16;
-            len -= i; // lower 4 bits of len represent how many bytes are left *after* the unrolled loop
+            nuint end = length - 16;
+            length -= i; // lower 4 bits of length represent how many bytes are left *after* the unrolled loop
 
             // We know due to the above switch-case that this loop will always run 1 iteration; max
             // bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so
@@ -526,16 +528,16 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
                 // these to use memory addressing operands.
 
                 // So the only cost is a bit of code size, which is made up for by the fact that
-                // we save on writes to dest/src.
+                // we save on writes to destination/source.
 
 #if BIT64
-                *(long*)(dest + i) = *(long*)(src + i);
-                *(long*)(dest + i + 8) = *(long*)(src + i + 8);
+                *(long*)(destination + i) = *(long*)(source + i);
+                *(long*)(destination + i + 8) = *(long*)(source + i + 8);
 #else
-                *(int*)(dest + i) = *(int*)(src + i);
-                *(int*)(dest + i + 4) = *(int*)(src + i + 4);
-                *(int*)(dest + i + 8) = *(int*)(src + i + 8);
-                *(int*)(dest + i + 12) = *(int*)(src + i + 12);
+                *(int*)(destination + i) = *(int*)(source + i);
+                *(int*)(destination + i + 4) = *(int*)(source + i + 4);
+                *(int*)(destination + i + 8) = *(int*)(source + i + 8);
+                *(int*)(destination + i + 12) = *(int*)(source + i + 12);
 #endif
 
                 i = counter;
@@ -545,29 +547,29 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
             }
             while (counter <= end);
 
-            if ((len & 8) != 0)
+            if ((length & 8) != 0)
             {
 #if BIT64
-                *(long*)(dest + i) = *(long*)(src + i);
+                *(long*)(destination + i) = *(long*)(source + i);
 #else
-                *(int*)(dest + i) = *(int*)(src + i);
-                *(int*)(dest + i + 4) = *(int*)(src + i + 4);
+                *(int*)(destination + i) = *(int*)(source + i);
+                *(int*)(destination + i + 4) = *(int*)(source + i + 4);
 #endif
                 i += 8;
             }
 
-            if ((len & 4) != 0)
+            if ((length & 4) != 0)
             {
-                *(int*)(dest + i) = *(int*)(src + i);
+                *(int*)(destination + i) = *(int*)(source + i);
                 i += 4;
             }
 
-            if ((len & 2) != 0)
+            if ((length & 2) != 0)
             {
-                *(short*)(dest + i) = *(short*)(src + i);
+                *(short*)(destination + i) = *(short*)(source + i);
                 i += 2;
             }
 
-            if ((len & 1) != 0)
+            if ((length & 1) != 0)
             {
-                *(dest + i) = *(src + i);
+                *(destination + i) = *(source + i);
                 // We're not using i after this, so not needed
                 // i += 1;
             }
@@ -575,7 +577,7 @@ internal unsafe static void Memmove(byte* dest, byte* src, nuint len)
             return;
 
             PInvoke:
-            _Memmove(dest, src, len);
+            _Memmove(destination, source, length);
         }
@@ -603,7 +605,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, long desti
             {
                 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy);
             }
-            Memmove((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy));
+            MemoryCopyCore((byte*)destination, (byte*)source, checked((nuint)sourceBytesToCopy));
         }
@@ -618,9 +620,9 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
                 ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.sourceBytesToCopy);
             }
 #if BIT64
-            Memmove((byte*)destination, (byte*)source, sourceBytesToCopy);
+            MemoryCopyCore((byte*)destination, (byte*)source, sourceBytesToCopy);
 #else // BIT64
-            Memmove((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
+            MemoryCopyCore((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
 #endif // BIT64
         }
     }

diff --git a/src/mscorlib/src/System/Span.cs b/src/mscorlib/src/System/Span.cs
index d7f8dc5c85d6..bfd1acbd5955 100644
--- a/src/mscorlib/src/System/Span.cs
+++ b/src/mscorlib/src/System/Span.cs
@@ -455,9 +455,9 @@ internal static unsafe void CopyTo<T>(ref T destination, ref T source, int eleme
             fixed (byte* pSource = &Unsafe.As<T, byte>(ref source))
             {
 #if BIT64
-                Buffer.Memmove(pDestination, pSource, (ulong)elementsCount * (ulong)Unsafe.SizeOf<T>());
+                Buffer.MemoryCopyCore(pDestination, pSource, (ulong)elementsCount * (ulong)Unsafe.SizeOf<T>());
 #else
-                Buffer.Memmove(pDestination, pSource, (uint)elementsCount * (uint)Unsafe.SizeOf<T>());
+                Buffer.MemoryCopyCore(pDestination, pSource, (uint)elementsCount * (uint)Unsafe.SizeOf<T>());
 #endif
             }
         }
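For illustration, here is a minimal standalone sketch of the overlap test documented in PATCH 1/3 above. It is not part of the patches; the class, the helper name, and the demo buffer are invented. With unsigned arithmetic, `destination - source < length` is true exactly when `destination` lies inside `[source, source + length)`, the one case where a simple forward copy would read bytes it has already overwritten; if `destination` is behind `source`, the subtraction wraps around to a huge value and the check fails.

using System;

static class OverlapCheckSketch
{
    // True when a forward copy would be unsafe because destination starts inside
    // [source, source + length). A false positive is only possible for lengths so
    // large that the native (P/Invoke) path would be taken anyway.
    private static unsafe bool NeedsBackwardCopy(byte* destination, byte* source, ulong length)
        => (ulong)destination - (ulong)source < length;

    private static unsafe void Main()
    {
        byte[] buffer = new byte[64];
        fixed (byte* p = buffer)
        {
            Console.WriteLine(NeedsBackwardCopy(p + 8, p, 32));  // True: destination is 8 bytes ahead of source, ranges overlap
            Console.WriteLine(NeedsBackwardCopy(p, p + 8, 32));  // False: destination is behind source, forward copy is safe
            Console.WriteLine(NeedsBackwardCopy(p + 48, p, 32)); // False: destination starts past source + length
        }
    }
}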
From 6a2ed86496daf54287001341673f539d1294dbe3 Mon Sep 17 00:00:00 2001
From: James Ko
Date: Thu, 26 Jan 2017 19:57:49 -0500
Subject: [PATCH 2/3] Finish porting code, with ARM64 not optimized for

---
 src/mscorlib/src/System/Buffer.cs | 263 ++++++++++--------------------
 1 file changed, 84 insertions(+), 179 deletions(-)

diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs
index 3a44f37827a8..53808d557d3f 100644
--- a/src/mscorlib/src/System/Buffer.cs
+++ b/src/mscorlib/src/System/Buffer.cs
@@ -48,7 +48,7 @@ internal static extern void InternalBlockCopy(Array src, int srcOffsetBytes,
         //
         internal unsafe static int IndexOfByte(byte* src, byte value, int index, int count)
         {
-            Debug.Assert(src != null, "src should not be null");
+            Debug.Assert(src != null);
 
             byte* pByte = src + index;
 
@@ -258,9 +258,18 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
         internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuint length)
         {
+            const nuint PInvokeThreshold = 512;
+#if AMD64
+            const nuint CopyAlignment = 16; // SIMD is enabled for AMD64, so align on a 16-byte boundary
+            const nuint BytesPerIteration = 64;
+#else
+            const nuint CopyAlignment = 4; // Align on a 4-byte boundary
+            const nuint BytesPerIteration = 16;
+#endif
+
             // P/Invoke into the native version when the buffers are overlapping and the copy needs to be performed backwards.
             // This check can produce false positives for very large lengths if the destination is behind the source.
-            // It is fine because we want to use the P/Invoke path for such large lengths anyway.
+            // It is fine because we would take the P/Invoke path later for such large lengths anyway.
 
             if ((nuint)destination - (nuint)source < length)
             {
@@ -270,12 +279,8 @@ internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuin
             // Currently, the following code seems to be faster than `Unsafe.CopyBlock` in benchmarks. If that is no longer
             // the case after changes to the JIT, the below code can simply be replaced with a call to that method.
 
-            // Note: It's important that this switch handles lengths at least up to 22.
-            // See notes below near the main loop for why.
-
-            // The switch will be very fast since it can be implemented using a jump
-            // table in assembly. See http://stackoverflow.com/a/449297/4077294 for more info.
-
+            // This switch will be fast since it is compiled into a jump table in assembly.
+            // See http://stackoverflow.com/a/449297/4077294 for more info.
             switch (length)
             {
                 case 0:
@@ -382,198 +387,89 @@ internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuin
                     *(short*)(destination + 12) = *(short*)(source + 12);
                     *(destination + 14) = *(source + 14);
                     return;
-                case 16:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    return;
-                case 17:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(destination + 16) = *(source + 16);
-                    return;
-                case 18:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(short*)(destination + 16) = *(short*)(source + 16);
-                    return;
-                case 19:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(short*)(destination + 16) = *(short*)(source + 16);
-                    *(destination + 18) = *(source + 18);
-                    return;
-                case 20:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(int*)(destination + 16) = *(int*)(source + 16);
-                    return;
-                case 21:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(int*)(destination + 16) = *(int*)(source + 16);
-                    *(destination + 20) = *(source + 20);
-                    return;
-                case 22:
-#if BIT64
-                    *(long*)destination = *(long*)source;
-                    *(long*)(destination + 8) = *(long*)(source + 8);
-#else
-                    *(int*)destination = *(int*)source;
-                    *(int*)(destination + 4) = *(int*)(source + 4);
-                    *(int*)(destination + 8) = *(int*)(source + 8);
-                    *(int*)(destination + 12) = *(int*)(source + 12);
-#endif
-                    *(int*)(destination + 16) = *(int*)(source + 16);
-                    *(short*)(destination + 20) = *(short*)(source + 20);
-                    return;
             }
 
             // P/Invoke into the native version for large lengths
-            if (length >= 512) goto PInvoke;
-
-            nuint i = 0; // byte offset at which we're copying
-
-            if (((int)destination & 3) != 0)
-            {
-                if (((int)destination & 1) != 0)
-                {
-                    *(destination + i) = *(source + i);
-                    i += 1;
-                    if (((int)destination & 2) != 0)
-                        goto IntAligned;
-                }
-                *(short*)(destination + i) = *(short*)(source + i);
-                i += 2;
-            }
-
-            IntAligned:
-
-#if BIT64
-            // On 64-bit IntPtr.Size == 8, so we want to advance to the next 8-aligned address. If
-            // (int)destination % 8 is 0, 5, 6, or 7, we will already have advanced by 0, 3, 2, or 1
-            // bytes to the next aligned address (respectively), so do nothing. On the other hand,
-            // if it is 1, 2, 3, or 4 we will want to copy-and-advance another 4 bytes until
-            // we're aligned.
-
-            // The thing 1, 2, 3, and 4 have in common that the others don't is that if you
-            // subtract one from them, their 3rd lsb will not be set. Hence, the below check.
-
-            if ((((int)destination - 1) & 4) == 0)
+            if (length > PInvokeThreshold)
             {
-                *(int*)(destination + i) = *(int*)(source + i);
-                i += 4;
+                goto PInvoke;
             }
-#endif // BIT64
-
-            nuint end = length - 16;
-            length -= i; // lower 4 bits of length represent how many bytes are left *after* the unrolled loop
-
-            // We know due to the above switch-case that this loop will always run 1 iteration; max
-            // bytes we copy before checking is 23 (7 to align the pointers, 16 for 1 iteration) so
-            // the switch handles lengths 0-22.
-            Debug.Assert(end >= 7 && i <= end);
-
-            // This is separated out into a different variable, so the i + 16 addition can be
-            // performed at the start of the pipeline and the loop condition does not have
-            // a dependency on the writes.
-            nuint counter;
-
-            do
-            {
-                counter = i + 16;
-
-                // This loop looks very costly since there appear to be a bunch of temporary values
-                // being created with the adds, but the jit (for x86 anyways) will convert each of
-                // these to use memory addressing operands.
+
+            // We've already handled lengths 0-15, so we can write at least 16 bytes.
+            // This calculates the offset of the next aligned address we know it's okay to write up to.
+            Debug.Assert(length >= 16);
+            nuint offset = 16 - ((nuint)destination % CopyAlignment);
 
-                // So the only cost is a bit of code size, which is made up for by the fact that
-                // we save on writes to destination/source.
+            Debug.Assert(offset > 0 && offset <= 16);
+            Debug.Assert((nuint)(destination + offset) % CopyAlignment == 0);
 
-#if BIT64
-                *(long*)(destination + i) = *(long*)(source + i);
-                *(long*)(destination + i + 8) = *(long*)(source + i + 8);
+#if AMD64
+            // SIMD is enabled for AMD64. Take advantage of that and use movdqu
+            *(Block16*)destination = *(Block16*)source;
 #else
-                *(int*)(destination + i) = *(int*)(source + i);
-                *(int*)(destination + i + 4) = *(int*)(source + i + 4);
-                *(int*)(destination + i + 8) = *(int*)(source + i + 8);
-                *(int*)(destination + i + 12) = *(int*)(source + i + 12);
+            // Make one unaligned 4-byte write, then 3 aligned 4-byte writes.
+            *(int*)destination = *(int*)source;
+            *(int*)(destination + offset - 12) = *(int*)(source + offset - 12);
+            *(int*)(destination + offset - 8) = *(int*)(source + offset - 8);
+            *(int*)(destination + offset - 4) = *(int*)(source + offset - 4);
 #endif
-                i = counter;
-
-                // See notes above for why this wasn't used instead
-                // i += 16;
+            // Catch unsigned overflow before we do the subtraction.
+            if (length < BytesPerIteration)
+            {
+                goto AfterUnrolledCopy;
             }
-            while (counter <= end);
 
-            if ((length & 8) != 0)
+            nuint endOffset = length - BytesPerIteration;
+
+            while (offset <= endOffset)
             {
-#if BIT64
-                *(long*)(destination + i) = *(long*)(source + i);
+#if AMD64
+                // Write 64 bytes at a time, taking advantage of the xmm registers on AMD64
+                // This will be translated to 4 movdqus (maybe movdqas in the future, see dotnet/coreclr#2725)
+                *(Block64*)(destination + offset) = *(Block64*)(source + offset);
 #else
-                *(int*)(destination + i) = *(int*)(source + i);
-                *(int*)(destination + i + 4) = *(int*)(source + i + 4);
+                // Write 16 bytes at a time, via 4 4-byte writes.
+                *(int*)(destination + offset) = *(int*)(source + offset);
+                *(int*)(destination + offset + 4) = *(int*)(source + offset + 4);
+                *(int*)(destination + offset + 8) = *(int*)(source + offset + 8);
+                *(int*)(destination + offset + 12) = *(int*)(source + offset + 12);
 #endif
-                i += 8;
-            }
-
-            if ((length & 4) != 0)
-            {
-                *(int*)(destination + i) = *(int*)(source + i);
-                i += 4;
+                offset += BytesPerIteration;
             }
 
-            if ((length & 2) != 0)
+
+            AfterUnrolledCopy:
+
+            Debug.Assert((nuint)(destination + offset) % CopyAlignment == 0);
+
+            nuint remainingLength = length - offset;
+            Debug.Assert(remainingLength < BytesPerIteration);
+
+            // Finish up the copy by dividing it into blocks of smaller powers of 2.
+            // The bits of `remainingLength` tell us how it can be expressed as a sum of powers of 2.
+
+#if AMD64
+            if ((remainingLength & 32) != 0)
             {
-                *(short*)(destination + i) = *(short*)(source + i);
-                i += 2;
+                *(Block32*)(destination + offset) = *(Block32*)(source + offset);
+                offset += 32;
             }
 
-            if ((length & 1) != 0)
+
+            if ((remainingLength & 16) != 0)
             {
-                *(destination + i) = *(source + i);
-                // We're not using i after this, so not needed
-                // i += 1;
+                *(Block16*)(destination + offset) = *(Block16*)(source + offset);
+                offset += 16;
             }
 
+            // Make one potentially unaligned write and quit.
+            *(Block16*)(destination + length - 16) = *(Block16*)(source + length - 16);
+#else
+            // Make 3 aligned 4-byte writes, then one unaligned 4-byte write.
+            *(int*)(destination + offset) = *(int*)(source + offset);
+            *(int*)(destination + offset + 4) = *(int*)(source + offset + 4);
+            *(int*)(destination + offset + 8) = *(int*)(source + offset + 8);
+            *(int*)(destination + length - 4) = *(int*)(source + length - 4);
+#endif
             return;
 
             PInvoke:
@@ -625,5 +521,14 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
             MemoryCopyCore((byte*)destination, (byte*)source, checked((uint)sourceBytesToCopy));
 #endif // BIT64
         }
+
+        [StructLayout(LayoutKind.Sequential, Size = 16)]
+        private struct Block16 { }
+
+        [StructLayout(LayoutKind.Sequential, Size = 32)]
+        private struct Block32 { }
+
+        [StructLayout(LayoutKind.Sequential, Size = 64)]
+        private struct Block64 { }
     }
 }

From 884d995078cfcd344250ce876eb5329ee5fa6177 Mon Sep 17 00:00:00 2001
From: James Ko
Date: Thu, 26 Jan 2017 20:04:19 -0500
Subject: [PATCH 3/3] Put back the PInvokeThreshold to 1024

---
 src/mscorlib/src/System/Buffer.cs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mscorlib/src/System/Buffer.cs b/src/mscorlib/src/System/Buffer.cs
index 53808d557d3f..57390bba3414 100644
--- a/src/mscorlib/src/System/Buffer.cs
+++ b/src/mscorlib/src/System/Buffer.cs
@@ -258,11 +258,12 @@ internal unsafe static void Memcpy(byte* dest, byte* src, int len) {
         [ReliabilityContract(Consistency.WillNotCorruptState, Cer.Success)]
         internal unsafe static void MemoryCopyCore(byte* destination, byte* source, nuint length)
         {
-            const nuint PInvokeThreshold = 512;
 #if AMD64
+            const nuint PInvokeThreshold = 1024;
             const nuint CopyAlignment = 16; // SIMD is enabled for AMD64, so align on a 16-byte boundary
             const nuint BytesPerIteration = 64;
 #else
+            const nuint PInvokeThreshold = 512;
             const nuint CopyAlignment = 4; // Align on a 4-byte boundary
             const nuint BytesPerIteration = 16;
 #endif
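For reference, a small worked sketch of the alignment step introduced in PATCH 2/3 (illustrative only and not part of the patches; the addresses and the printout are invented, and the constant mirrors the AMD64 value). `16 - (destination % CopyAlignment)` yields an offset in (0, 16] at which `destination + offset` is aligned; because the unconditional 16-byte head write covers bytes [0, 16) and the offset never exceeds 16, every byte before the offset has already been copied when the aligned loop starts.

using System;

static class AlignmentOffsetSketch
{
    private const ulong CopyAlignment = 16; // mirrors the AMD64 constant in the patch

    private static void Main()
    {
        // Hypothetical destination addresses with different low-order bits.
        ulong[] destinations = { 0x1000, 0x1001, 0x1007, 0x100B, 0x100F };

        foreach (ulong destination in destinations)
        {
            // Same computation as the patch: offset of the first aligned byte to resume from.
            ulong offset = 16 - (destination % CopyAlignment);

            // The head write covers [0, 16) and offset <= 16, so [0, offset) is already copied.
            Console.WriteLine($"destination=0x{destination:X} offset={offset} aligned={(destination + offset) % CopyAlignment == 0}");
        }
    }
}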
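Finally, a minimal sketch of the Block16/Block32/Block64 idea the patch relies on (again illustrative, not part of the patches; the class and array names are invented): an empty struct with an explicit StructLayout size, assigned through a typed pointer, copies that many bytes in one assignment, which is what allows the JIT to use wide registers for the move. Whether it actually emits movdqu/movdqa is a JIT implementation detail, as the patch's own comment notes.

using System;
using System.Runtime.InteropServices;

static class BlockCopySketch
{
    // A 16-byte unit of copy: assigning one of these through a pointer moves 16 bytes.
    [StructLayout(LayoutKind.Sequential, Size = 16)]
    private struct Block16 { }

    private static unsafe void Main()
    {
        byte[] source = new byte[16];
        byte[] destination = new byte[16];
        for (int i = 0; i < source.Length; i++) source[i] = (byte)i;

        fixed (byte* src = source)
        fixed (byte* dst = destination)
        {
            // One struct assignment copies all 16 bytes at once.
            *(Block16*)dst = *(Block16*)src;
        }

        Console.WriteLine(string.Join(",", destination)); // 0,1,2,...,15
    }
}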