JIT: Support containing compares in GT_SELECT for xarch (i.e. start emitting cmov instructions) #81267

jakobbotsch · 2023-01-27T14:42:38Z

This adds support for contained compares in GT_SELECT nodes for xarch. As part of this also enables if-conversion on xarch.

This fixes #6749. For:

    [MethodImpl(MethodImplOptions.NoInlining)]
    static long sete_or_mov(bool cond)
    {
        return cond ? 0 : 4;
    }

Before:

G_M14693_IG01:              ;; offset=0000H
                                                ;; size=0 bbWeight=1 PerfScore 0.00
G_M14693_IG02:              ;; offset=0000H
       84C9                 test     cl, cl
       7507                 jne      SHORT G_M14693_IG04
                                                ;; size=4 bbWeight=1 PerfScore 1.25
G_M14693_IG03:              ;; offset=0004H
       B804000000           mov      eax, 4
       EB02                 jmp      SHORT G_M14693_IG05
                                                ;; size=7 bbWeight=0.50 PerfScore 1.12
G_M14693_IG04:              ;; offset=000BH
       33C0                 xor      eax, eax
                                                ;; size=2 bbWeight=0.50 PerfScore 0.12
G_M14693_IG05:              ;; offset=000DH
       4898                 cdqe
                                                ;; size=2 bbWeight=1 PerfScore 0.25
G_M14693_IG06:              ;; offset=000FH
       C3                   ret
                                                ;; size=1 bbWeight=1 PerfScore 1.00

; Total bytes of code 16, prolog size 0, PerfScore 5.35, instruction count 7, allocated bytes for code 16 (MethodHash=bc7ac69a) for method Program:sete_or_mov(bool):long

After:

G_M14693_IG01:
                                                ;; size=0 bbWeight=1 PerfScore 0.00
G_M14693_IG02:
       xor      eax, eax
       mov      edx, 4
       test     cl, cl
       cmove    eax, edx
       cdqe
                                                ;; size=14 bbWeight=1 PerfScore 1.25
G_M14693_IG03:
       ret
                                                ;; size=1 bbWeight=1 PerfScore 1.00

; Total bytes of code 15, prolog size 0, PerfScore 3.75, instruction count 6, allocated bytes for code 15 (MethodHash=bc7ac69a) for method Program:sete_or_mov(bool):long

(Not completely optimal, no sign extension is necessary, but that's a separate issue.)

~~It does not get the cmov case from the issue, but that's because if-conversion does not handle it today (cc @a74nh -- seems like a nice opportunity).~~

TODO:

Interference checks
Only require delayed free for one operand
Double check delay free logic

This adds support for contained compares in GT_SELECT nodes for xarch. As part of this, also enables if-conversion on xarch. Fix dotnet#6749

ghost · 2023-01-27T14:42:51Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak
See info in area-owners.md if you want to be subscribed.

Issue Details

This adds support for contained compares in GT_SELECT nodes for xarch. As part of this also enables if-conversion on xarch.

This partially fixes #6749. For:

[MethodImpl(MethodImplOptions.NoInlining)]
static long cmov(long longValue)
{
    long tmp1 = longValue & 0x00000000ffffffff;
    return tmp1 == 0 ? longValue : tmp1;
}

Before:

; Assembly listing for method Program:sete_or_mov(bool):long
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )    bool  ->  rcx         single-def
;# V01 OutArgs      [V01    ] (  1,  1   )  lclBlk ( 0) [rsp+00H]   "OutgoingArgSpace"
;  V02 tmp1         [V02,T01] (  3,  2   )     int  ->  rax
;
; Lcl frame size = 0

G_M14693_IG01:              ;; offset=0000H
                                                ;; size=0 bbWeight=1 PerfScore 0.00
G_M14693_IG02:              ;; offset=0000H
       84C9                 test     cl, cl
       7507                 jne      SHORT G_M14693_IG04
                                                ;; size=4 bbWeight=1 PerfScore 1.25
G_M14693_IG03:              ;; offset=0004H
       B804000000           mov      eax, 4
       EB02                 jmp      SHORT G_M14693_IG05
                                                ;; size=7 bbWeight=0.50 PerfScore 1.12
G_M14693_IG04:              ;; offset=000BH
       33C0                 xor      eax, eax
                                                ;; size=2 bbWeight=0.50 PerfScore 0.12
G_M14693_IG05:              ;; offset=000DH
       4898                 cdqe
                                                ;; size=2 bbWeight=1 PerfScore 0.25
G_M14693_IG06:              ;; offset=000FH
       C3                   ret
                                                ;; size=1 bbWeight=1 PerfScore 1.00

; Total bytes of code 16, prolog size 0, PerfScore 5.35, instruction count 7, allocated bytes for code 16 (MethodHash=bc7ac69a) for method Program:sete_or_mov(bool):long
; ===========================================================

After:

; Assembly listing for method Program:sete_or_mov(bool):long
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )    bool  ->  rcx         single-def
;# V01 OutArgs      [V01    ] (  1,  1   )  lclBlk ( 0) [rsp+00H]   "OutgoingArgSpace"
;  V02 tmp1         [V02,T01] (  2,  2   )     int  ->  rdx
;
; Lcl frame size = 0

G_M14693_IG01:              ;; offset=0000H
                                                ;; size=0 bbWeight=1 PerfScore 0.00
G_M14693_IG02:              ;; offset=0000H
       B804000000           mov      eax, 4
       33D2                 xor      edx, edx
       84C9                 test     cl, cl
       0F44D0               cmove    edx, eax
       4863C2               movsxd   rax, edx
                                                ;; size=15 bbWeight=1 PerfScore 1.25
G_M14693_IG03:              ;; offset=000FH
       C3                   ret
                                                ;; size=1 bbWeight=1 PerfScore 1.00

; Total bytes of code 16, prolog size 0, PerfScore 3.85, instruction count 6, allocated bytes for code 16 (MethodHash=bc7ac69a) for method Program:sete_or_mov(bool):long
; ============================================================

It does not get the cmov case from the issue, but that's because if-conversion does not handle it today (cc @a74nh -- seems like a nice opportunity).

Author:	jakobbotsch
Assignees:	-
Labels:	`area-CodeGen-coreclr`
Milestone:	-

jakobbotsch · 2023-01-28T10:43:57Z

It does not get the cmov case from the issue, but that's because if-conversion does not handle it today (cc @a74nh -- seems like a nice opportunity).

Disregard this -- if-conversion works fine, I just had a flipped ifdef in my code. The code produced for cmov is:

[MethodImpl(MethodImplOptions.NoInlining)]
static long cmov(long longValue) {
    long tmp1 = longValue & 0x00000000ffffffff;
    return tmp1 == 0 ? longValue : tmp1;
}

Before:

G_M64344_IG01:
                                                ;; size=0 bbWeight=1 PerfScore 0.00
G_M64344_IG02:
       mov      eax, ecx
       test     rax, rax
       je       SHORT G_M64344_IG04
                                                ;; size=7 bbWeight=1 PerfScore 1.50
G_M64344_IG03:
       ret
                                                ;; size=1 bbWeight=0.50 PerfScore 0.50
G_M64344_IG04:
       mov      rax, rcx
                                                ;; size=3 bbWeight=0.50 PerfScore 0.12
G_M64344_IG05:
       ret
                                                ;; size=1 bbWeight=0.50 PerfScore 0.50

; Total bytes of code 12, prolog size 0, PerfScore 3.83, instruction count 6, allocated bytes for code 12 (MethodHash=16c004a7) for method Program:cmov(long):long

After:

G_M64344_IG01:
                                                ;; size=0 bbWeight=1 PerfScore 0.00
G_M64344_IG02:
       mov      eax, ecx
       test     rax, rax
       cmove    rax, rcx
                                                ;; size=9 bbWeight=1 PerfScore 0.75
G_M64344_IG03:
       ret
                                                ;; size=1 bbWeight=1 PerfScore 1.00

; Total bytes of code 10, prolog size 0, PerfScore 2.75, instruction count 4, allocated bytes for code 10 (MethodHash=16c004a7) for method Program:cmov(long):long

jakobbotsch · 2023-01-28T12:54:36Z

Definitely still some things to improve:

CBoolTest:AreZero2(int,int):

@@ -12,26 +12,23 @@
 ;
 ; Lcl frame size = 0

-G_M65508_IG01:        ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, gcvars, byref, nogc <-- Prolog IG
 ;
 ; Lcl frame size = 0

-G_M65508_IG01:        ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, gcvars, byref, nogc <-- Prolog IG
+G_M65508_IG01:        ; bbWeight=1, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, nogc <-- Prolog IG
                                                ;; size=0 bbWeight=1 PerfScore 0.00
-G_M65508_IG02:        ; bbWeight=1, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref, isz
+G_M65508_IG02:        ; bbWeight=1, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref
        or       edx, ecx
-       jne      SHORT G_M65508_IG05
-                                               ;; size=4 bbWeight=1 PerfScore 1.25
-G_M65508_IG03:        ; bbWeight=0.50, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, byref
-       mov      eax, 1
-                                               ;; size=5 bbWeight=0.50 PerfScore 0.12
-G_M65508_IG04:        ; bbWeight=0.50, epilog, nogc, extend
+       setne    al
+       movzx    rax, al
+       xor      edx, edx
+       mov      ecx, 1
+       test     eax, eax
+       mov      eax, ecx
+       cmovne   eax, edx
+                                               ;; size=22 bbWeight=1 PerfScore 2.75
+G_M65508_IG03:        ; bbWeight=1, epilog, nogc, extend
        ret
-                                               ;; size=1 bbWeight=0.50 PerfScore 0.50
-G_M65508_IG05:        ; bbWeight=0.50, gcVars=0000000000000000 {}, gcrefRegs=00000000 {}, byrefRegs=00000000 {}, gcvars, byref
-       xor      eax, eax
-                                               ;; size=2 bbWeight=0.50 PerfScore 0.12
-G_M65508_IG06:        ; bbWeight=0.50, epilog, nogc, extend
-       ret

jakobbotsch · 2023-01-30T13:24:02Z

Definitely still some things to improve:

CBoolTest:AreZero2(int,int):

I turned off if-conversion for the problematic scenarios (these cases turn out to be very rare, actually).
I first tried adding the support for if-conversion handling these kinds of cases, but that was pretty clunky and hacky. I left some notes at #74867 (comment) on why that is.

src/coreclr/jit/lowerxarch.cpp

jakobbotsch · 2023-01-30T16:57:47Z

/azp run runtime-coreclr outerloop, runtime-coreclr jitstress, runtime-coreclr libraries-jitstress, Fuzzlyn

azure-pipelines · 2023-01-30T16:58:32Z

Azure Pipelines successfully started running 4 pipeline(s).

tannergooding · 2023-01-30T17:24:55Z

src/coreclr/jit/gentree.cpp

+    }
+
+#ifdef FEATURE_HW_INTRINSICS
+    if (OperIs(GT_HWINTRINSIC) && emitter::DoesWriteZeroFlag(HWIntrinsicInfo::lookupIns(AsHWIntrinsic())))


Worth noting that this isn't necessarily "correct".

lookupIns catches the most common cases, but certainly not all cases particularly when various intrinsics represent complex helpers that are lowered or when the table tracked instruction is INS_invalid due to it requiring additional lookups/handling.

Makes sense. I added another check to the heuristic in if-conversion for the rest of the hw intrinsic cases that this PR doesn't handle yet, so that we don't regress those.

tannergooding · 2023-01-30T17:25:36Z

src/coreclr/jit/gentree.cpp

+bool GenTree::CanSetZeroFlag()
+{
+#if defined(TARGET_XARCH)
+    if (OperIs(GT_AND, GT_OR, GT_XOR, GT_ADD, GT_SUB, GT_NEG))


This is missing many different cases, particularly those that leave ZF in an undefined state such as imul

This is just factoring the code that was already there. I can add a TODO if you want.

(It was factored out of Lowering::OptimizeConstCompare)

Ah, I see. I wonder if its going to cause any problematic behavior if we're depending on this to be accurate.

I could easily see someone checking !CanSetZeroFlag() and then assuming the zero flag can't be overrwritten. Inversely I could see someone checking CanSetZeroFlag() and assuming that it will write some 0/1 value and that it is strictly tied to the computed result.

Hmm, I can rename this. The specific meaning I was intending was that this is a node that supports setting the zero flag if GTF_SET_FLAGS is set. Maybe SupportsSettingZeroFlag() is better?

Not sure Supports fixes the main concern around people thinking !Supports means "does not support" (and therefore will not modify zf).

I don't have a good suggestion for a different name, however.

It might just be something we'll have to cover with docs and code review.

I renamed it to SupportsSettingZeroFlag and added some more docs on it

tannergooding · 2023-01-30T17:28:20Z

src/coreclr/jit/gentree.cpp

+    }
+#endif
+#elif defined(TARGET_ARM64)
+    if (OperIs(GT_AND, GT_ADD, GT_SUB))


Is this going to be problematic on Arm64 since we have two different instructions for each of these?

That is one that sets the flags (adds) and one that does not (add)

No, I think codegen just handles this via gtSetFlags(). Anyway there shouldn't be any behavior differences for arm64 here, this is just factoring this.

TIHan

This looks good to me, the LSRA changes seem about right based on my limited experience with it.

The diffs do look good, though there are a lot of regressions; wondering if they are actual regressions or not.

jakobbotsch · 2023-02-01T17:08:13Z

This does not improve many of the IfStatements micro benchmarks, here is the full table of results:

Method	Job	Toolchain	Mean	Error	StdDev	Median	Min	Max	Ratio	Allocated	Alloc Ratio
Single	Job-MLIHKJ	\Core_Root_base\corerun.exe	23.93 us	0.048 us	0.040 us	23.93 us	23.85 us	23.99 us	1.00	-	NA
Single	Job-FCEXGF	\Core_Root_diff\corerun.exe	19.26 us	0.144 us	0.135 us	19.29 us	19.07 us	19.48 us	0.80	-	NA

And	Job-MLIHKJ	\Core_Root_base\corerun.exe	25.06 us	0.199 us	0.186 us	25.11 us	24.81 us	25.32 us	1.00	-	NA
And	Job-FCEXGF	\Core_Root_diff\corerun.exe	25.03 us	0.155 us	0.145 us	25.07 us	24.68 us	25.21 us	1.00	-	NA

AndAnd	Job-MLIHKJ	\Core_Root_base\corerun.exe	27.33 us	0.243 us	0.227 us	27.35 us	27.01 us	27.64 us	1.00	-	NA
AndAnd	Job-FCEXGF	\Core_Root_diff\corerun.exe	27.24 us	0.106 us	0.094 us	27.26 us	27.09 us	27.35 us	1.00	-	NA

AndAndAnd	Job-MLIHKJ	\Core_Root_base\corerun.exe	27.42 us	0.220 us	0.206 us	27.33 us	27.20 us	27.75 us	1.00	-	NA
AndAndAnd	Job-FCEXGF	\Core_Root_diff\corerun.exe	27.48 us	0.175 us	0.164 us	27.50 us	27.22 us	27.75 us	1.00	-	NA

Or	Job-MLIHKJ	\Core_Root_base\corerun.exe	25.93 us	0.029 us	0.026 us	25.93 us	25.90 us	25.99 us	1.00	-	NA
Or	Job-FCEXGF	\Core_Root_diff\corerun.exe	26.31 us	0.150 us	0.140 us	26.33 us	25.98 us	26.50 us	1.01	-	NA

OrOr	Job-MLIHKJ	\Core_Root_base\corerun.exe	26.39 us	0.040 us	0.038 us	26.40 us	26.32 us	26.44 us	1.00	-	NA
OrOr	Job-FCEXGF	\Core_Root_diff\corerun.exe	26.43 us	0.104 us	0.086 us	26.40 us	26.35 us	26.66 us	1.00	-	NA

AndOr	Job-MLIHKJ	\Core_Root_base\corerun.exe	28.69 us	0.319 us	0.283 us	28.76 us	28.32 us	29.24 us	1.00	-	NA
AndOr	Job-FCEXGF	\Core_Root_diff\corerun.exe	28.35 us	0.035 us	0.029 us	28.35 us	28.31 us	28.40 us	0.99	-	NA

SingleArray	Job-MLIHKJ	\Core_Root_base\corerun.exe	25.70 us	0.268 us	0.251 us	25.75 us	25.37 us	26.04 us	1.00	-	NA
SingleArray	Job-FCEXGF	\Core_Root_diff\corerun.exe	21.17 us	0.061 us	0.057 us	21.16 us	21.10 us	21.27 us	0.82	-	NA

AndArray	Job-MLIHKJ	\Core_Root_base\corerun.exe	26.07 us	0.093 us	0.083 us	26.05 us	25.97 us	26.22 us	1.00	-	NA
AndArray	Job-FCEXGF	\Core_Root_diff\corerun.exe	26.00 us	0.130 us	0.122 us	25.96 us	25.87 us	26.24 us	1.00	-	NA

OrArray	Job-MLIHKJ	\Core_Root_base\corerun.exe	26.00 us	0.085 us	0.075 us	25.97 us	25.93 us	26.18 us	1.00	-	NA
OrArray	Job-FCEXGF	\Core_Root_diff\corerun.exe	25.98 us	0.069 us	0.065 us	25.98 us	25.90 us	26.10 us	1.00	-	NA

Single is converted ok and shows good benefit
And/AndAnd/AndAndAnd are filtered out during if-conversion for xarch because they end up testing the result of a GT_OR against zero, which the xarch backend does not handle efficiently. I.e.:

; Assembly listing for method IfStatements.IfStatements:AndAndInner(int,int,int)
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  5,  4.50)     int  ->  rcx        
;  V01 arg1         [V01,T01] (  4,  4   )     int  ->  rdx         single-def
;  V02 arg2         [V02,T02] (  4,  4   )     int  ->   r8         single-def
;  V03 OutArgs      [V03    ] (  1,  1   )  lclBlk (32) [rsp+00H]   "OutgoingArgSpace"
;
; Lcl frame size = 40
G_M12418_IG01:
       sub      rsp, 40
						;; size=4 bbWeight=1 PerfScore 0.25
G_M12418_IG02:
       mov      r9d, ecx
       and      r9d, 1
       mov      eax, edx
       and      eax, 1
       or       r9d, eax
       mov      eax, r8d
       and      eax, 1
       or       r9d, eax
       jne      SHORT G_M12418_IG04
						;; size=26 bbWeight=1 PerfScore 3.00
G_M12418_IG03:
       mov      ecx, 5
						;; size=5 bbWeight=0.50 PerfScore 0.12
G_M12418_IG04:
       xor      r9d, r9d
       call     [IfStatements.IfStatements:Consume(int,int,int,int)]
       nop      
						;; size=10 bbWeight=1 PerfScore 3.50
G_M12418_IG05:
       add      rsp, 40
       ret      
						;; size=5 bbWeight=1 PerfScore 1.25
; Total bytes of code 50, prolog size 4, PerfScore 13.13, instruction count 16, allocated bytes for code 50 (MethodHash=2b41cf7d) for method IfStatements.IfStatements:AndAndInner(int,int,int)
; ============================================================

Hopefully we can get these cases in the future.
3. Or/OrOr/AndOr cannot be if-converted, so there are no diffs
4. SingleArray is converted just fine and shows good benefit
5. AndArray is converted too, but oddly shows no benefit. The diff is:

@@ -1,55 +1,55 @@
 ; Assembly listing for method IfStatements.IfStatements:AndArrayInner(int,int)
 ; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T00] (  6,  5.50)     int  ->  rcx        
+;  V00 arg0         [V00,T00] (  7,  6   )     int  ->  rcx        
 ;  V01 arg1         [V01,T01] (  5,  4   )     int  ->  rdx         single-def
 ;  V02 OutArgs      [V02    ] (  1,  1   )  lclBlk (32) [rsp+00H]   "OutgoingArgSpace"
 ;  V03 tmp1         [V03,T02] (  3,  6   )     ref  ->   r9         single-def "arr expr"
 ;  V04 tmp2         [V04,T05] (  2,  2   )     ref  ->   r8         single-def "arr expr"
 ;  V05 cse0         [V05,T03] (  3,  2.50)     ref  ->   r8         "CSE - aggressive"
 ;  V06 cse1         [V06,T04] (  3,  2.50)     int  ->  rax         "CSE - aggressive"
 ;
 ; Lcl frame size = 40
 G_M49807_IG01:
        sub      rsp, 40
 						;; size=4 bbWeight=1 PerfScore 0.25
 G_M49807_IG02:
        mov      r8, 0xD1FFAB1E      ; data for IfStatements.IfStatements:inputs
        mov      r8, gword ptr [r8]
        mov      r9, r8
        mov      eax, dword ptr [r9+08H]
        cmp      ecx, eax
        jae      SHORT G_M49807_IG06
        mov      r10d, ecx
        test     byte  ptr [r9+4*r10+10H], 1
        jne      SHORT G_M49807_IG04
 						;; size=35 bbWeight=1 PerfScore 10.00
 G_M49807_IG03:
        cmp      edx, eax
        jae      SHORT G_M49807_IG06
        mov      r9d, edx
+       mov      eax, 5
        test     byte  ptr [r8+4*r9+10H], 1
-       jne      SHORT G_M49807_IG04
-       mov      ecx, 5
-						;; size=20 bbWeight=0.50 PerfScore 2.88
+       cmove    ecx, eax
+						;; size=21 bbWeight=0.50 PerfScore 2.50
 G_M49807_IG04:
        xor      r8d, r8d
        xor      r9d, r9d
        call     [IfStatements.IfStatements:Consume(int,int,int,int)]
        nop      
 						;; size=13 bbWeight=1 PerfScore 3.75
 G_M49807_IG05:
        add      rsp, 40
        ret      
 						;; size=5 bbWeight=1 PerfScore 1.25
 G_M49807_IG06:
        call     CORINFO_HELP_RNGCHKFAIL
        int3     
 						;; size=6 bbWeight=0 PerfScore 0.00

This one is pretty strange to me considering that the input values should be random.
7. OrArray is not converted by if-conversion, so no diffs.

jakobbotsch · 2023-02-01T17:20:31Z

This one is pretty strange to me considering that the input values should be random.

Interestingly, if I change the implementation of AndArray from

[Benchmark]
public void AndArray() {
    for (int i = 0; i < Iterations; i++) {
        AndArrayInner(i, i + 1);
    }
}

to

[Benchmark]
public void AndArray() {
    int other = Iterations - 1;
    for (int i = 0; i < Iterations; i++) {
        AndArrayInner(i, other);
        other--;
    }
}

then I do start to see a benefit from cmov. Note that the code is:

[MethodImpl(MethodImplOptions.NoInlining)]
public static void AndArrayInner(int op1, int op2) {
    if (inputs[op1] % 2 == 0 && inputs[op2] % 2 == 0) {
        op1 = 5;
    }
    Consume(op1, op2, 0, 0);
    }

In the former case, the inputs[op1] comparison is going to have the same result as the inputs[op2] comparison from the previous loop iteration, seems like the branch predictor is smart enough to realize that.

jakobbotsch · 2023-02-01T18:42:20Z

/azp run runtime-coreclr jitstressregs, runtime-coreclr libraries-jitstressregs

azure-pipelines · 2023-02-01T18:42:44Z

Azure Pipelines successfully started running 2 pipeline(s).

jakobbotsch · 2023-02-01T22:44:07Z

cc @dotnet/jit-contrib PTAL @kunalspathak @BruceForstall (if you are available)

Diffs.
Practically all the regressions I have spot checked have been related to some more register moves fundamentally being required due to the removal of the control flow (e.g. see most examples posted above).

ARM64 diffs are due to the removal of some conditional reversing in if-conversion. Removing this benefits LSRA significantly on xarch and has an insignificant impact on arm64. Some more details in #81267 (comment)

kunalspathak · 2023-02-02T07:45:41Z

src/coreclr/jit/lsraxarch.cpp

+    // guarantee that the dstReg is used only in one of falseVal/trueVal, then
+    // we are good.
+    //
+    // To ensure the above we have some bespoke interference logic here on


I wish there was an easy way to handle this particular scenario.

Me too, but luckily this check didn't turn out to be too ugly.
I do wonder if we could use a similar trick in the interval building for RMW opers to avoid delay free in most cases, perhaps something to look at in a follow up.

Well, we could but it would defeat the "linear" nature of the linear scan provided that it doesn't hurt much in common scenarios.

Just to understand a bit more, BuildDelayFreeUses() doesn't achieve what you want? It doesn't delay free unconditionally, but does a check about matching interval, etc.?

it would defeat the "linear" nature of the linear scan

This does not affect the complexity since the number of ref positions created for op1/op2 is bounded by a constant (at most 2, for an indir with a contained LEA).

Just to understand a bit more, BuildDelayFreeUses() doesn't achieve what you want? It doesn't delay free unconditionally, but does a check about matching interval, etc.?

I did start out with BuildDelayFreeUses, but the extra flexibility we have in codegen (by swapping the trueVal/falseVal and reversing the condition) means that we don't need to mark something delay-free in practically all situations. BuildDelayFreeUses handles only situations where the other operand is a local, but for this particular node it could also be another contained indirection.

I looked at the RMW logic and I don't think it benefits from doing this since for the RMW opers we don't have the capability to contain two indirections anyway.

kunalspathak · 2023-02-02T07:46:37Z

src/tests/JIT/opt/Compares/compares.cs

@@ -82,7 +82,7 @@ public class SideEffects
    [MethodImpl(MethodImplOptions.NoInlining)]
    public static void Eq_byte_consume(byte a1, byte a2) {
        //ARM64-FULL-LINE: cmp {{w[0-9]+}}, {{w[0-9]+}}
-        //ARM64-FULL-LINE-NEXT: csel {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, eq
+        //ARM64-FULL-LINE-NEXT: csel {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{eq|ne}}


how does the x64 validation works for these examples?

It didn't do any disasm checks on them for x64. Added that now.

jakobbotsch · 2023-02-08T22:40:13Z

Ping @kunalspathak @BruceForstall

BruceForstall

Looks good!

src/coreclr/jit/lowerxarch.cpp

Co-authored-by: Bruce Forstall <brucefo@microsoft.com>

EgorBo · 2023-03-28T16:48:05Z

Lots of improvements on Win-x64:

[Perf] Windows/x64: 60 Improvements on 2/9/2023 12:40:40 PM perf-autofiling-issues#12985

Also

[Perf] Linux/x64: 35 Improvements on 2/9/2023 12:40:40 PM perf-autofiling-issues#12928
[Perf] Alpine/x64: 58 Improvements on 2/9/2023 12:40:40 PM perf-autofiling-issues#12943

JIT: Support containing compares in GT_SELECT for xarch

2963b54

This adds support for contained compares in GT_SELECT nodes for xarch. As part of this, also enables if-conversion on xarch. Fix dotnet#6749

dotnet-issue-labeler bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jan 27, 2023

ghost assigned jakobbotsch Jan 27, 2023

jakobbotsch mentioned this pull request Jan 27, 2023

[JIT] X64 - Using cmovns for faster division #81252

Closed

Flip ifdef

2137d36

jakobbotsch added 2 commits January 28, 2023 11:54

Run jit-format

b25d4b0

Improve LSRA building

2f2942c

jakobbotsch added 7 commits January 28, 2023 20:26

Move some code up

4da299f

Fix compares that were swapped

4ca94e2

Fix GT_SELECT_HI handling

dfa5617

Handle constraints for compare operands correctly on x86

b6b1f41

Disallow 64-bit GT_SELECT on x86 in another case

640e59a

Nit

a91e977

Nit

9499a66

build-analysis bot mentioned this pull request Jan 29, 2023

Test failure Loader\\classloader\\DictionaryExpansion\\DictionaryExpansion\\DictionaryExpansion.cmd #75244

Closed

jakobbotsch added 3 commits January 29, 2023 01:52

Revise delay free logic, fix som FP conditionals

ac1c27b

Fix bad copy paste

934f719

Avoid if-conversion for some badly handled cases

3fb62ea

jakobbotsch commented Jan 30, 2023

View reviewed changes

src/coreclr/jit/lowerxarch.cpp Outdated Show resolved Hide resolved

Remove leftover code

5a95fd4

tannergooding reviewed Jan 30, 2023

View reviewed changes

TIHan reviewed Jan 30, 2023

View reviewed changes

EgorBo mentioned this pull request Feb 1, 2023

Difference of Codegen in JIT ASM when using Negation / if-else block code #81479

Closed

jakobbotsch marked this pull request as ready for review February 1, 2023 17:33

Fix linux-x86 build

3f56930

dotnet deleted a comment from azure-pipelines bot Feb 1, 2023

jakobbotsch requested review from kunalspathak and BruceForstall February 1, 2023 22:44

TIHan mentioned this pull request Feb 1, 2023

Utilizing flags set by various instructions, not just cmp and test. #76502

Open

kunalspathak reviewed Feb 2, 2023

View reviewed changes

jakobbotsch added 2 commits February 2, 2023 11:51

Add x64 disasm tests

3845b74

Fix test case

9da5486

jakobbotsch mentioned this pull request Feb 2, 2023

JIT/opt/Compares/compares should seemingly not be passing file-check on arm64 #81531

Closed

Actually fix test case

93ae093

jakobbotsch mentioned this pull request Feb 6, 2023

JIT: Relax reg-optionality validity checks #81614

Merged

BruceForstall approved these changes Feb 8, 2023

View reviewed changes

src/coreclr/jit/lowerxarch.cpp Outdated Show resolved Hide resolved

Update src/coreclr/jit/lowerxarch.cpp

44a89aa

Co-authored-by: Bruce Forstall <brucefo@microsoft.com>

jakobbotsch merged commit 430d6a4 into dotnet:main Feb 9, 2023

jakobbotsch deleted the xarch-select-contain-conditions branch February 9, 2023 10:16

jakobbotsch mentioned this pull request Feb 9, 2023

JIT: Transform SELECT(relop, 1/0, 0/1) to relop #81880

Merged

This was referenced Feb 14, 2023

Regressions in System.Text.RegularExpressions.Tests.Perf_Regex_Industry_RustLang_Sherlock #82106

Closed

[Perf] Windows/x64: 23 Regressions on 2/9/2023 12:40:40 PM #82111

Closed

ghost locked as resolved and limited conversation to collaborators Mar 11, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

JIT: Support containing compares in GT_SELECT for xarch (i.e. start emitting cmov instructions) #81267

JIT: Support containing compares in GT_SELECT for xarch (i.e. start emitting cmov instructions) #81267

jakobbotsch commented Jan 27, 2023 •

edited

Loading

ghost commented Jan 27, 2023

jakobbotsch commented Jan 28, 2023 •

edited

Loading

jakobbotsch commented Jan 28, 2023 •

edited

Loading

jakobbotsch commented Jan 30, 2023

jakobbotsch commented Jan 30, 2023

azure-pipelines bot commented Jan 30, 2023

tannergooding Jan 30, 2023

jakobbotsch Feb 1, 2023

tannergooding Jan 30, 2023

jakobbotsch Jan 30, 2023

jakobbotsch Jan 30, 2023

tannergooding Jan 30, 2023

jakobbotsch Jan 30, 2023

tannergooding Jan 30, 2023

tannergooding Jan 30, 2023

jakobbotsch Jan 30, 2023

tannergooding Jan 30, 2023

jakobbotsch Jan 30, 2023 •

edited

Loading

TIHan left a comment

jakobbotsch commented Feb 1, 2023 •

edited

Loading

jakobbotsch commented Feb 1, 2023 •

edited

Loading

jakobbotsch commented Feb 1, 2023

azure-pipelines bot commented Feb 1, 2023

jakobbotsch commented Feb 1, 2023 •

edited

Loading

kunalspathak Feb 2, 2023

jakobbotsch Feb 3, 2023

kunalspathak Feb 9, 2023

kunalspathak Feb 9, 2023

jakobbotsch Feb 9, 2023

kunalspathak Feb 2, 2023

jakobbotsch Feb 2, 2023

jakobbotsch commented Feb 8, 2023

BruceForstall left a comment

EgorBo commented Mar 28, 2023 •

edited by AndyAyersMS

Loading

JIT: Support containing compares in GT_SELECT for xarch (i.e. start emitting cmov instructions) #81267

JIT: Support containing compares in GT_SELECT for xarch (i.e. start emitting cmov instructions) #81267

Conversation

jakobbotsch commented Jan 27, 2023 • edited Loading

ghost commented Jan 27, 2023

jakobbotsch commented Jan 28, 2023 • edited Loading

jakobbotsch commented Jan 28, 2023 • edited Loading

jakobbotsch commented Jan 30, 2023

jakobbotsch commented Jan 30, 2023

azure-pipelines bot commented Jan 30, 2023

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

jakobbotsch Jan 30, 2023 • edited Loading

Choose a reason for hiding this comment

TIHan left a comment

Choose a reason for hiding this comment

jakobbotsch commented Feb 1, 2023 • edited Loading

jakobbotsch commented Feb 1, 2023 • edited Loading

jakobbotsch commented Feb 1, 2023

azure-pipelines bot commented Feb 1, 2023

jakobbotsch commented Feb 1, 2023 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

jakobbotsch commented Feb 8, 2023

BruceForstall left a comment

Choose a reason for hiding this comment

EgorBo commented Mar 28, 2023 • edited by AndyAyersMS Loading

jakobbotsch commented Jan 27, 2023 •

edited

Loading

jakobbotsch commented Jan 28, 2023 •

edited

Loading

jakobbotsch commented Jan 28, 2023 •

edited

Loading

jakobbotsch Jan 30, 2023 •

edited

Loading

jakobbotsch commented Feb 1, 2023 •

edited

Loading

jakobbotsch commented Feb 1, 2023 •

edited

Loading

jakobbotsch commented Feb 1, 2023 •

edited

Loading

EgorBo commented Mar 28, 2023 •

edited by AndyAyersMS

Loading