diff --git a/src/System.Private.CoreLib/shared/System/Buffer.Unix.cs b/src/System.Private.CoreLib/shared/System/Buffer.Unix.cs
index 372cf92557d9..0f65fdbc95c4 100644
--- a/src/System.Private.CoreLib/shared/System/Buffer.Unix.cs
+++ b/src/System.Private.CoreLib/shared/System/Buffer.Unix.cs
@@ -20,7 +20,7 @@ public static partial class Buffer
 #elif ARM
         private const nuint MemmoveNativeThreshold = 512;
 #else
-        private const nuint MemmoveNativeThreshold = 2048;
+        private const nuint MemmoveNativeThreshold = 4096;
 #endif
     }
 }
diff --git a/src/System.Private.CoreLib/shared/System/Buffer.Windows.cs b/src/System.Private.CoreLib/shared/System/Buffer.Windows.cs
index ccb577a7e1b6..b21ac5423f0f 100644
--- a/src/System.Private.CoreLib/shared/System/Buffer.Windows.cs
+++ b/src/System.Private.CoreLib/shared/System/Buffer.Windows.cs
@@ -17,7 +17,7 @@ public static partial class Buffer
         // https://github.com/dotnet/coreclr/issues/13843
         private const nuint MemmoveNativeThreshold = ulong.MaxValue;
 #else
-        private const nuint MemmoveNativeThreshold = 2048;
+        private const nuint MemmoveNativeThreshold = 4096;
 #endif
     }
 }
diff --git a/src/System.Private.CoreLib/shared/System/Buffer.cs b/src/System.Private.CoreLib/shared/System/Buffer.cs
index f2ffaaea85dc..c2f5914b28ae 100644
--- a/src/System.Private.CoreLib/shared/System/Buffer.cs
+++ b/src/System.Private.CoreLib/shared/System/Buffer.cs
@@ -2,14 +2,10 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-#if AMD64 || ARM64 || (BIT32 && !ARM)
-#define HAS_CUSTOM_BLOCKS
-#endif
 
 using System.Diagnostics;
 using System.Runtime;
 using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 
 using Internal.Runtime.CompilerServices;
 
@@ -20,6 +16,8 @@ using nint = System.Int32;
 using nuint = System.UInt32;
 #endif
 
+using Block16 = System.Runtime.Intrinsics.Vector128<byte>;
+using Block32 = System.Runtime.Intrinsics.Vector256<byte>;
 
 namespace System
 {
@@ -87,7 +85,7 @@ internal static unsafe void ZeroMemory(byte* dest, nuint len)
 
         // The attributes on this method are chosen for best JIT performance.
         // Please do not edit unless intentional.
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         [CLSCompliant(false)]
         public static unsafe void MemoryCopy(void* source, void* destination, long destinationSizeInBytes, long sourceBytesToCopy)
         {
@@ -100,7 +98,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, long desti
 
         // The attributes on this method are chosen for best JIT performance.
        // Please do not edit unless intentional.
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        [CLSCompliant(false)]
        public static unsafe void MemoryCopy(void* source, void* destination, ulong destinationSizeInBytes, ulong sourceBytesToCopy)
        {
@@ -141,171 +139,7 @@ internal static unsafe void Memcpy(byte* pDest, int destIndex, byte[] src, int s
 
         // This method has different signature for x64 and other platforms and is done for performance reasons.
         internal static unsafe void Memmove(byte* dest, byte* src, nuint len)
-        {
-            // P/Invoke into the native version when the buffers are overlapping.
-            if (((nuint)dest - (nuint)src < len) || ((nuint)src - (nuint)dest < len))
-            {
-                goto PInvoke;
-            }
-
-            byte* srcEnd = src + len;
-            byte* destEnd = dest + len;
-
-            if (len <= 16) goto MCPY02;
-            if (len > 64) goto MCPY05;
-
-        MCPY00:
-            // Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
-            Debug.Assert(len > 16 && len <= 64);
-#if HAS_CUSTOM_BLOCKS
-            *(Block16*)dest = *(Block16*)src; // [0,16]
-#elif BIT64
-            *(long*)dest = *(long*)src;
-            *(long*)(dest + 8) = *(long*)(src + 8); // [0,16]
-#else
-            *(int*)dest = *(int*)src;
-            *(int*)(dest + 4) = *(int*)(src + 4);
-            *(int*)(dest + 8) = *(int*)(src + 8);
-            *(int*)(dest + 12) = *(int*)(src + 12); // [0,16]
-#endif
-            if (len <= 32) goto MCPY01;
-#if HAS_CUSTOM_BLOCKS
-            *(Block16*)(dest + 16) = *(Block16*)(src + 16); // [0,32]
-#elif BIT64
-            *(long*)(dest + 16) = *(long*)(src + 16);
-            *(long*)(dest + 24) = *(long*)(src + 24); // [0,32]
-#else
-            *(int*)(dest + 16) = *(int*)(src + 16);
-            *(int*)(dest + 20) = *(int*)(src + 20);
-            *(int*)(dest + 24) = *(int*)(src + 24);
-            *(int*)(dest + 28) = *(int*)(src + 28); // [0,32]
-#endif
-            if (len <= 48) goto MCPY01;
-#if HAS_CUSTOM_BLOCKS
-            *(Block16*)(dest + 32) = *(Block16*)(src + 32); // [0,48]
-#elif BIT64
-            *(long*)(dest + 32) = *(long*)(src + 32);
-            *(long*)(dest + 40) = *(long*)(src + 40); // [0,48]
-#else
-            *(int*)(dest + 32) = *(int*)(src + 32);
-            *(int*)(dest + 36) = *(int*)(src + 36);
-            *(int*)(dest + 40) = *(int*)(src + 40);
-            *(int*)(dest + 44) = *(int*)(src + 44); // [0,48]
-#endif
-
-        MCPY01:
-            // Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
-            Debug.Assert(len > 16 && len <= 64);
-#if HAS_CUSTOM_BLOCKS
-            *(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
-#elif BIT64
-            *(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
-            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
-#else
-            *(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
-            *(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
-            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
-            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
-#endif
-            return;
-
-        MCPY02:
-            // Copy the first 8 bytes and then unconditionally copy the last 8 bytes and return.
-            if ((len & 24) == 0) goto MCPY03;
-            Debug.Assert(len >= 8 && len <= 16);
-#if BIT64
-            *(long*)dest = *(long*)src;
-            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
-#else
-            *(int*)dest = *(int*)src;
-            *(int*)(dest + 4) = *(int*)(src + 4);
-            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
-            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
-#endif
-            return;
-
-        MCPY03:
-            // Copy the first 4 bytes and then unconditionally copy the last 4 bytes and return.
-            if ((len & 4) == 0) goto MCPY04;
-            Debug.Assert(len >= 4 && len < 8);
-            *(int*)dest = *(int*)src;
-            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
-            return;
-
-        MCPY04:
-            // Copy the first byte. For pending bytes, do an unconditionally copy of the last 2 bytes and return.
-            Debug.Assert(len < 4);
-            if (len == 0) return;
-            *dest = *src;
-            if ((len & 2) == 0) return;
-            *(short*)(destEnd - 2) = *(short*)(srcEnd - 2);
-            return;
-
-        MCPY05:
-            // PInvoke to the native version when the copy length exceeds the threshold.
-            if (len > MemmoveNativeThreshold)
-            {
-                goto PInvoke;
-            }
-
-            // Copy 64-bytes at a time until the remainder is less than 64.
-            // If remainder is greater than 16 bytes, then jump to MCPY00. Otherwise, unconditionally copy the last 16 bytes and return.
-            Debug.Assert(len > 64 && len <= MemmoveNativeThreshold);
-            nuint n = len >> 6;
-
-        MCPY06:
-#if HAS_CUSTOM_BLOCKS
-            *(Block64*)dest = *(Block64*)src;
-#elif BIT64
-            *(long*)dest = *(long*)src;
-            *(long*)(dest + 8) = *(long*)(src + 8);
-            *(long*)(dest + 16) = *(long*)(src + 16);
-            *(long*)(dest + 24) = *(long*)(src + 24);
-            *(long*)(dest + 32) = *(long*)(src + 32);
-            *(long*)(dest + 40) = *(long*)(src + 40);
-            *(long*)(dest + 48) = *(long*)(src + 48);
-            *(long*)(dest + 56) = *(long*)(src + 56);
-#else
-            *(int*)dest = *(int*)src;
-            *(int*)(dest + 4) = *(int*)(src + 4);
-            *(int*)(dest + 8) = *(int*)(src + 8);
-            *(int*)(dest + 12) = *(int*)(src + 12);
-            *(int*)(dest + 16) = *(int*)(src + 16);
-            *(int*)(dest + 20) = *(int*)(src + 20);
-            *(int*)(dest + 24) = *(int*)(src + 24);
-            *(int*)(dest + 28) = *(int*)(src + 28);
-            *(int*)(dest + 32) = *(int*)(src + 32);
-            *(int*)(dest + 36) = *(int*)(src + 36);
-            *(int*)(dest + 40) = *(int*)(src + 40);
-            *(int*)(dest + 44) = *(int*)(src + 44);
-            *(int*)(dest + 48) = *(int*)(src + 48);
-            *(int*)(dest + 52) = *(int*)(src + 52);
-            *(int*)(dest + 56) = *(int*)(src + 56);
-            *(int*)(dest + 60) = *(int*)(src + 60);
-#endif
-            dest += 64;
-            src += 64;
-            n--;
-            if (n != 0) goto MCPY06;
-
-            len %= 64;
-            if (len > 16) goto MCPY00;
-#if HAS_CUSTOM_BLOCKS
-            *(Block16*)(destEnd - 16) = *(Block16*)(srcEnd - 16);
-#elif BIT64
-            *(long*)(destEnd - 16) = *(long*)(srcEnd - 16);
-            *(long*)(destEnd - 8) = *(long*)(srcEnd - 8);
-#else
-            *(int*)(destEnd - 16) = *(int*)(srcEnd - 16);
-            *(int*)(destEnd - 12) = *(int*)(srcEnd - 12);
-            *(int*)(destEnd - 8) = *(int*)(srcEnd - 8);
-            *(int*)(destEnd - 4) = *(int*)(srcEnd - 4);
-#endif
-            return;
-
-        PInvoke:
-            _Memmove(dest, src, len);
-        }
+            => Memmove(ref Unsafe.AsRef<byte>(dest), ref Unsafe.AsRef<byte>(src), len);
 
         // This method has different signature for x64 and other platforms and is done for performance reasons.
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -358,58 +192,25 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
 
         MCPY00:
             // Copy bytes which are multiples of 16 and leave the remainder for MCPY01 to handle.
             Debug.Assert(len > 16 && len <= 64);
-#if HAS_CUSTOM_BLOCKS
+
             Unsafe.As<byte, Block16>(ref dest) = Unsafe.As<byte, Block16>(ref src); // [0,16]
-#elif BIT64
-            Unsafe.As<byte, long>(ref dest) = Unsafe.As<byte, long>(ref src);
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 8)); // [0,16]
-#else
-            Unsafe.As<byte, int>(ref dest) = Unsafe.As<byte, int>(ref src);
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 4));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 8));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 12)); // [0,16]
-#endif
+
             if (len <= 32) goto MCPY01;
-#if HAS_CUSTOM_BLOCKS
+
             Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 16)); // [0,32]
-#elif BIT64
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 16));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 24)); // [0,32]
-#else
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 16));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 20)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 20));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 24));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 28)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 28)); // [0,32]
-#endif
+
             if (len <= 48) goto MCPY01;
-#if HAS_CUSTOM_BLOCKS
+
             Unsafe.As<byte, Block16>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref src, 32)); // [0,48]
-#elif BIT64
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 32));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 40)); // [0,48]
-#else
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 32));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 36)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 36));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 40));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 44)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 44)); // [0,48]
-#endif
 
         MCPY01:
             // Unconditionally copy the last 16 bytes using destEnd and srcEnd and return.
             Debug.Assert(len > 16 && len <= 64);
-#if HAS_CUSTOM_BLOCKS
+
             Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
-#elif BIT64
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -16));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
-#else
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -16));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -12));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
-#endif
+
             return;
 
         MCPY02:
@@ -461,35 +262,9 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
             nuint n = len >> 6;
 
         MCPY06:
-#if HAS_CUSTOM_BLOCKS
-            Unsafe.As<byte, Block64>(ref dest) = Unsafe.As<byte, Block64>(ref src);
-#elif BIT64
-            Unsafe.As<byte, long>(ref dest) = Unsafe.As<byte, long>(ref src);
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 8));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 16));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 24));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 32));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 40));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 48)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 48));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref dest, 56)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref src, 56));
-#else
-            Unsafe.As<byte, int>(ref dest) = Unsafe.As<byte, int>(ref src);
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 4));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 8));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 12));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 16));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 20)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 20));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 24)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 24));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 28)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 28));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 32));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 36)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 36));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 40)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 40));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 44)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 44));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 48)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 48));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 52)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 52));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 56)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 56));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref dest, 60)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref src, 60));
-#endif
+            Unsafe.As<byte, Block32>(ref dest) = Unsafe.As<byte, Block32>(ref src);
+            Unsafe.As<byte, Block32>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, Block32>(ref Unsafe.Add(ref src, 32));
+
             dest = ref Unsafe.Add(ref dest, 64);
             src = ref Unsafe.Add(ref src, 64);
             n--;
@@ -499,17 +274,9 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
 
             len %= 64;
             if (len > 16) goto MCPY00;
-#if HAS_CUSTOM_BLOCKS
+
             Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
-#elif BIT64
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -16));
-            Unsafe.As<byte, long>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, long>(ref Unsafe.Add(ref srcEnd, -8));
-#else
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -16));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -12)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -12));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -8)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -8));
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref destEnd, -4)) = Unsafe.As<byte, int>(ref Unsafe.Add(ref srcEnd, -4));
-#endif
+
             return;
 
         BuffersOverlap:
@@ -525,7 +292,7 @@ private static void Memmove(ref byte dest, ref byte src, nuint len)
 
         // Non-inlinable wrapper around the QCall that avoids polluting the fast path
         // with P/Invoke prolog/epilog.
-        [MethodImplAttribute(MethodImplOptions.NoInlining)]
+        [MethodImpl(MethodImplOptions.NoInlining)]
         private static unsafe void _Memmove(byte* dest, byte* src, nuint len)
         {
             __Memmove(dest, src, len);
@@ -533,20 +300,12 @@ private static unsafe void _Memmove(byte* dest, byte* src, nuint len)
 
         // Non-inlinable wrapper around the QCall that avoids polluting the fast path
         // with P/Invoke prolog/epilog.
-        [MethodImplAttribute(MethodImplOptions.NoInlining)]
+        [MethodImpl(MethodImplOptions.NoInlining)]
         private static unsafe void _Memmove(ref byte dest, ref byte src, nuint len)
         {
             fixed (byte* pDest = &dest)
             fixed (byte* pSrc = &src)
                 __Memmove(pDest, pSrc, len);
         }
-
-#if HAS_CUSTOM_BLOCKS
-        [StructLayout(LayoutKind.Sequential, Size = 16)]
-        private struct Block16 { }
-
-        [StructLayout(LayoutKind.Sequential, Size = 64)]
-        private struct Block64 { }
-#endif // HAS_CUSTOM_BLOCKS
     }
 }
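For readers skimming the diff, the core trick is that `Unsafe.As<byte, T>` reinterprets a `ref byte` as a wider block type, so a single assignment moves 16 or 32 bytes at once and the JIT can lower it to SIMD loads/stores where the hardware supports them; the old `Block16`/`Block64` empty structs are replaced by `Vector128<byte>`/`Vector256<byte>` aliases. Below is a minimal standalone sketch of that pattern, not part of this change: `BlockCopyDemo`, `Copy64`, and `CopyLast16` are hypothetical names, and it assumes .NET Core 3.0+ with the public `System.Runtime.CompilerServices.Unsafe` API in place of the corelib-internal `Internal.Runtime.CompilerServices` one.

```csharp
using System;
using System.Runtime.CompilerServices;

// Same aliases as the diff introduces at the top of Buffer.cs.
using Block16 = System.Runtime.Intrinsics.Vector128<byte>;
using Block32 = System.Runtime.Intrinsics.Vector256<byte>;

internal static class BlockCopyDemo
{
    // Mirrors the new MCPY06 loop body: 64 bytes as two 32-byte block moves.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void Copy64(ref byte dest, ref byte src)
    {
        Unsafe.As<byte, Block32>(ref dest) = Unsafe.As<byte, Block32>(ref src);
        Unsafe.As<byte, Block32>(ref Unsafe.Add(ref dest, 32)) = Unsafe.As<byte, Block32>(ref Unsafe.Add(ref src, 32));
    }

    // Mirrors MCPY01: unconditionally copy the final 16 bytes, addressed
    // backwards from the one-past-the-end refs, so a tail of up to 16 bytes
    // is finished by one (possibly overlapping) store instead of a byte loop.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void CopyLast16(ref byte destEnd, ref byte srcEnd)
    {
        Unsafe.As<byte, Block16>(ref Unsafe.Add(ref destEnd, -16)) = Unsafe.As<byte, Block16>(ref Unsafe.Add(ref srcEnd, -16));
    }

    private static void Main()
    {
        byte[] src = new byte[80];
        byte[] dst = new byte[80];
        new Random(42).NextBytes(src);

        // len = 80: one 64-byte block, then the remaining 16 bytes via the
        // end-relative tail copy -- the same shape as the MCPY06/MCPY01 path.
        Copy64(ref dst[0], ref src[0]);
        CopyLast16(ref Unsafe.Add(ref dst[0], dst.Length), ref Unsafe.Add(ref src[0], src.Length));

        Console.WriteLine(((ReadOnlySpan<byte>)src).SequenceEqual(dst)); // True
    }
}
```

The end-relative tail store is also why the fast path never degrades to per-byte copies: the stores may overlap bytes already written, which is harmless for non-overlapping buffers, and genuinely overlapping buffers are routed to the native memmove before this code runs.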