Improve CopySign performance for integer types #90970
Conversation
Tagging subscribers to this area: @dotnet/area-system-runtime
@dotnet-policy-service agree
It would be better to include benchmarks for different cases, including positive, negative, throwing, and different sizes. I would expect the results to vary among these.
Codegen diff for Int32, before:

G_M000_IG01: ;; offset=0000H
sub rsp, 40
G_M000_IG02: ;; offset=0004H
mov eax, ecx
neg eax
test ecx, ecx
cmovl ecx, eax
test edx, edx
jl SHORT G_M000_IG05
G_M000_IG03: ;; offset=0011H
test ecx, ecx
jl SHORT G_M000_IG07
mov eax, ecx
G_M000_IG04: ;; offset=0017H
add rsp, 40
ret
G_M000_IG05: ;; offset=001CH
mov eax, ecx
neg eax
G_M000_IG06: ;; offset=0020H
add rsp, 40
ret
G_M000_IG07: ;; offset=0025H
call [CSPlayground.Program:ThrowNegateTwosCompOverflow()]
int3

after:

G_M000_IG01: ;; offset=0000H
sub rsp, 40
G_M000_IG02: ;; offset=0004H
cmp ecx, 0xFFFFFFFF80000000
jne SHORT G_M000_IG04
G_M000_IG03: ;; offset=000CH
test edx, edx
jge SHORT G_M000_IG06
G_M000_IG04: ;; offset=0010H
xor edx, ecx
mov eax, edx
not eax
sar eax, 31
sar edx, 31
sub edx, eax
mov eax, edx
imul eax, ecx
G_M000_IG05: ;; offset=0023H
add rsp, 40
ret
G_M000_IG06: ;; offset=0028H
xor eax, eax
sub eax, 0xFFFFFFFF80000000
jo SHORT G_M000_IG08
G_M000_IG07: ;; offset=0031H
add rsp, 40
ret
G_M000_IG08: ;; offset=0036H
call CORINFO_HELP_OVERFLOW
int3
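The "after" codegen corresponds to a branchless formulation: derive a ±1 factor from the sign of `value ^ sign` and multiply. A minimal C sketch of that idea (function names are hypothetical; the guard mirrors the overflow path in the dump above, and arithmetic right shift of negative values is assumed, as on mainstream compilers):

```c
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>

/* Returns 1 when v >= 0 and -1 when v < 0, without branching:
   (v >> 31) is 0 or -1, (~v >> 31) is -1 or 0, and their
   difference is therefore +1 or -1. */
static int32_t sign_zero_to_one(int32_t v)
{
    return (v >> 31) - (~v >> 31);
}

/* Branchless CopySign for int32: value ^ sign is negative exactly
   when the signs differ, so multiplying by its +-1 sign flips value
   only in that case. The single overflowing input is rejected first. */
static int32_t copy_sign_i32(int32_t value, int32_t sign)
{
    if (value == INT32_MIN && sign >= 0) {
        abort(); /* stands in for the overflow throw in the C# code */
    }
    return value * sign_zero_to_one(value ^ sign);
}
```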
I don't know the exact reason, but there are cases in my project code and in these benchmarks where CopySign is called inside a loop, and using my implementation does improve performance there. I observed that some data show a bimodal distribution, with both peaks faster than the original implementation; I am still unclear about the cause of the bimodal pattern.
I just want to note that throwing an exception is regarded as an exceptional case, so it is not (and should not be) included in the benchmarks' input data and statistics.

Test_0: xor (as a nop)
BenchmarkDotNet v0.13.7, Windows 11 (10.0.22621.2134/22H2/2022Update/SunValley2)

CopySignSByteBenchmark // * Summary *
CopySignInt16Benchmark // * Summary * // * Hints *
CopySignInt32Benchmark // * Summary * // * Hints *
CopySignInt64Benchmark // * Summary * // * Hints *
I'm now concerned about more scenarios.
Is an imul really faster than a cmov? On 32-bit platforms, a 64-bit multiply would undoubtedly be much slower.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int SignZeroToOne(long value)
{
    return (int)(value >> (64 - 2)) | 1;
}
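As a sanity check on the shift-by-62 variant quoted above: for a signed 64-bit value, the arithmetic shift `value >> 62` can only be -2, -1, 0, or 1, and OR-ing in 1 collapses those to -1 for negative inputs and 1 otherwise. A small C translation of the helper (assuming arithmetic right shift, as the C# code does):

```c
#include <stdint.h>

/* C translation of the quoted helper: (value >> (64 - 2)) | 1.
   v >> 62 is -2 or -1 for negative v and 0 or 1 otherwise, so the
   final OR yields exactly -1 or 1. */
static int32_t sign_zero_to_one_62(int64_t v)
{
    return (int32_t)(v >> 62) | 1;
}
```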
Casting to int32 will introduce an unnecessary truncate and extend on 64-bit platforms, but saves a move on 32-bit platforms.
Also note that we typically put these methods in System.Numerics.BitOperations.
I think returning an int (Int32) for Sign is reasonable, because it is the common return type for sign functions. Also, using a shorter bit length than long (Int64) may provide optimization opportunities for subsequent multiplications. Regarding where to put these methods, I think Sign and Abs are closely related, so it makes sense to group them together. Besides, the original code already referenced some throw helper methods from Math, so moving them to System.Numerics.BitOperations would make the code more scattered.
In such a primitive function, every instruction counts. The helper isn't exposed publicly, so it should take the most performant form. Where needed, you can use #if TARGET_64BIT to provide a different implementation.
We need to understand the performance better.
I think current compilers are not always good at using cmov; this can be seen in the disassembly results. The benchmark data shows that the PR's code is still better than the original code, even on 32-bit platforms, except for the Int128 case. I will continue to modify the code to address this issue.
public static Int128 CopySign_A1(Int128 value, Int128 sign) {
return Int128.CopySign(value, sign);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int SignZeroToOne(Int64 value) {
return unchecked((int)(value >> (64 - 1)) - (int)(~value >> (64 - 1)));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int SignZeroToOne2(Int64 value) {
return unchecked((int)(value >> (64 - 2)) | 1);
}
public static Int128 CopySign_A2(Int128 value, Int128 sign) {
if (Int64.MinValue != unchecked((Int64)(value >> 64)) || UInt64.MinValue != unchecked((UInt64)value) || Int128.IsNegative(sign)) {
return value * SignZeroToOne(unchecked((Int64)(value >> 64)) ^ unchecked((Int64)(sign >> 64)));
}
return checked(-unchecked((Int64)(value >> 64)));
}
[DoesNotReturn]
public static void ThrowOverflowException() {
throw new OverflowException();
}
public static Int128 CopySign_A3(Int128 value, Int128 sign) {
if (Int64.MinValue == unchecked((Int64)(value >> 64)) && UInt64.MinValue == unchecked((UInt64)value) && Int128.IsNegative(sign)) {
ThrowOverflowException();
}
return value * SignZeroToOne(unchecked((Int64)(value >> 64)) ^ unchecked((Int64)(sign >> 64)));
}
public static Int128 CopySign_A4(Int128 value, Int128 sign) {
if (Int64.MinValue == unchecked((Int64)(value >> 64)) && UInt64.MinValue == unchecked((UInt64)value) && Int128.IsNegative(sign)) {
ThrowOverflowException();
}
return value * SignZeroToOne2(unchecked((Int64)(value >> 64)) ^ unchecked((Int64)(sign >> 64)));
}
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
public static Int128 CopySign_A5(Int128 value, Int128 sign) {
Int128 absValue = value;
if (Int128.IsNegative(absValue)) {
absValue = -absValue;
}
if (Int128.IsPositive(sign)) {
if (Int128.IsNegative(absValue)) {
ThrowOverflowException();
}
return absValue;
}
return -absValue;
}
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
public static Int128 CopySign_A6(Int128 value, Int128 sign) {
Int128 t = value;
if (Int128.IsPositive(sign)) {
if (Int128.IsNegative(t)) {
t = -t;
if (Int128.IsNegative(t)) {
ThrowOverflowException();
}
}
return t;
}
if (Int128.IsPositive(t)) {
t = -t;
}
return t;
}

CopySignInt128Benchmark // * Summary *

BenchmarkDotNet v0.13.7, Windows 11 (10.0.22621.2134/22H2/2022Update/SunValley2) Jit=RyuJit PowerPlanMode=8c5e7fda-e8bf-4a96-9a85-a6e23a8c635c Toolchain=.NET 8.0
hello?
ping @tannergooding as the area owner.
@LEI-Hongfaan thank you for your contribution and apologies for the delay in the review.
The changes overall look good to me, but there is still room for improvement (please see my comments).
For the benchmarks I've provided in dotnet/performance#3462 I got the following results:
dotnet run -c Release -f net8.0 --filter System.Tests.Perf_*CopySign* --join --corerun D:\projects\runtime\artifacts\bin\testhost\net9.0-windows-Release-x64\shared\Microsoft.NETCore.App\main\corerun.exe D:\projects\forks\copySign\artifacts\bin\testhost\net9.0-windows-Release-x64\shared\Microsoft.NETCore.App\pr\corerun.exe D:\projects\forks\copySign\artifacts\bin\testhost\net9.0-windows-Release-x64\shared\Microsoft.NETCore.App\9.0.0\corerun.exe
BenchmarkDotNet v0.13.7-nightly.20230717.35, Windows 11 (10.0.22621.2428/22H2/2022Update/SunValley2)
AMD Ryzen Threadripper PRO 3945WX 12-Cores, 1 CPU, 24 logical and 12 physical cores
.NET SDK 9.0.100-alpha.1.23531.2
[Host] : .NET 8.0.0 (8.0.23.47906), X64 RyuJIT AVX2
suggestions : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX2
PR : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX2
main : .NET 9.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Type | Method | Job | value | sign | Mean | Ratio |
---|---|---|---|---|---|---|
Perf_Int16 | CopySign | suggestions | 1 | -1 | 0.2648 ns | 0.69 |
Perf_Int16 | CopySign | PR | 1 | -1 | 0.2602 ns | 0.68 |
Perf_Int16 | CopySign | main | 1 | -1 | 0.3834 ns | 1.00 |
Perf_SByte | CopySign | suggestions | 1 | -1 | 0.2660 ns | 0.63 |
Perf_SByte | CopySign | PR | 1 | -1 | 0.2364 ns | 0.56 |
Perf_SByte | CopySign | main | 1 | -1 | 0.4206 ns | 1.00 |
Perf_Int128 | CopySign | suggestions | 1 | -1 | 1.1218 ns | 0.17 |
Perf_Int128 | CopySign | PR | 1 | -1 | 1.4312 ns | 0.22 |
Perf_Int128 | CopySign | main | 1 | -1 | 6.5770 ns | 1.00 |
Perf_Int64 | CopySign | suggestions | 1 | -1 | 0.1840 ns | 0.37 |
Perf_Int64 | CopySign | PR | 1 | -1 | 0.4993 ns | 1.00 |
Perf_Int64 | CopySign | main | 1 | -1 | 0.4980 ns | 1.00 |
Perf_Int32 | CopySign | suggestions | 1 | -1 | 0.2086 ns | 1.02 |
Perf_Int32 | CopySign | PR | 1 | -1 | 0.2003 ns | 0.98 |
Perf_Int32 | CopySign | main | 1 | -1 | 0.2048 ns | 1.00 |
}

return (short)(-absValue);
return (short)(value * Math.SignZeroToOne(value ^ sign));
Multiplication is much more expensive on older CPUs and may significantly regress performance.
It's not necessarily "cheap" on modern CPUs either (still a minimum of 3-4 cycles for 32-bit, and 3-6 cycles for 64-bit), and while reducing branches can be a win, it might hurt things overall and hinder JIT optimizations for some constants.
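One multiply-free alternative along these lines is conditional negation via a sign mask: with `m = (value ^ sign) >> 31`, the identity `(x ^ m) - m` returns `x` when `m == 0` and `-x` when `m == -1`. A hedged C sketch of the idea (names hypothetical; the `INT32_MIN` overflow check is omitted for brevity, and arithmetic right shift is assumed):

```c
#include <stdint.h>

/* m is 0 when value and sign agree in sign, -1 (all ones) when they
   differ; (value ^ m) - m is then value or its two's complement,
   replacing the imul with an xor and a subtract. */
static int32_t copy_sign_no_mul(int32_t value, int32_t sign)
{
    int32_t m = (value ^ sign) >> 31;
    return (value ^ m) - m;
}
```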
// >= 0: returns 1
// < 0: returns -1
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int SignZeroToOne(int value) |
This name isn't "clear" IMO and needs something that makes its meaning clear.
SignOrOneIfZero?
Co-authored-by: Adam Sitnik <adam.sitnik@gmail.com>
if (IsPositive(sign))
#if TARGET_32BIT
Int128 t = value;
if (Int128.IsPositive(sign)) |
The Int128. prefix is unnecessary, as we are in the Int128 type.

I'm notably not really a fan of this additional complexity to maintain two copies of the code. 32-bit is already extremely pessimized for Int128, and I don't think it's worth the additional micro-optimizations here. Just have the singular code path that is consistent with the other types instead.
return t;
#else
// Int128.MinValue == value && 0 <= sign
if ((long)value._upper == long.MinValue && value._lower == 0 && (long)sign._upper >= 0) |
I'd just do this the simple/readable way as:

- if ((long)value._upper == long.MinValue && value._lower == 0 && (long)sign._upper >= 0)
+ if ((value == Int128.MinValue) && IsPositive(sign))
The JIT should take care of inlining the comparison and IsPositive check to give good codegen.
{
    Math.ThrowNegateTwosCompOverflow();
}
return value * Math.SignOrOneIfZero((long)value._upper ^ (long)sign._upper); |
I'm still not a fan of this name nor of the cost for multiplication (which increases significantly as the size of the type does).
I think the entire logic here can be simplified down to:
if (SameSign(value, sign))
{
return value;
}
else if (value == Int128.MinValue)
{
ThrowNegateTwosCompOverflow();
}
return -value;
where you have:
bool SameSign(Int128 left, Int128 right)
{
// All positive values have the most significant bit clear
// All negative values have the most significant bit set
//
// (x ^ y) produces 0 if the bits are the same and 1 if they
// differ. Therefore, (x ^ y) produces a positive value if the
// signs are the same and a negative value if they differ.
return (long)(left._upper ^ right._upper) >= 0;
}
This gives you up to the same 2 comparisons you currently have, but reduces the bit toggle to simply a two's complement operation (negation), which is far cheaper.
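The suggested shape, scaled down to int64 in C for a self-contained illustration (the actual Int128 version would test the xor of the `_upper` halves instead; names are hypothetical):

```c
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>

/* Signs match exactly when the xor of the two values is non-negative:
   the xor of two equal sign bits is 0, of two differing sign bits is 1. */
static int same_sign_i64(int64_t left, int64_t right)
{
    return (left ^ right) >= 0;
}

/* The suggested structure: return early when the signs already match;
   otherwise the only input that cannot be negated is INT64_MIN. */
static int64_t copy_sign_i64(int64_t value, int64_t sign)
{
    if (same_sign_i64(value, sign)) {
        return value;
    }
    if (value == INT64_MIN) {
        abort(); /* stands in for ThrowNegateTwosCompOverflow */
    }
    return -value;
}
```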
The SameSign operation can even be optimized for 32-bit platforms by only comparing the upper half of _upper, which is a much more reasonable platform-specific optimization and keeps it heavily isolated away to only where needed.
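That reduction works because the sign bit of a 64-bit xor lives entirely in the upper 32-bit word, so only the top halves need to be xor-ed on a 32-bit target. A C sketch of the idea (helper names are hypothetical):

```c
#include <stdint.h>

/* Extract the upper 32-bit word of a 64-bit value. */
static int32_t upper32(int64_t v)
{
    return (int32_t)((uint64_t)v >> 32);
}

/* The sign of (a ^ b) for 64-bit a, b depends only on the top bit,
   so xor-ing just the upper words gives the same sign test using
   32-bit operations only. */
static int same_sign_via_upper32(int64_t a, int64_t b)
{
    return (upper32(a) ^ upper32(b)) >= 0;
}
```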
Left some comments above on how this can be simplified while still keeping the perf improvements that you're looking for. Thanks for continuing to work on this.
This pull request has been automatically marked
This pull request will now be closed since it had been marked
Fix #77579