[Isel Aarch64] extra instruction (i256) or 2 instructions (i320) when chaining icmp and select based on underflow #103855

mratsim · 2024-08-14T12:14:48Z

Same IR as #103841 but applied to Aarch64 as an alternative to #103717

Unlike x86, there is always an extra instruction even for i256, and there are 2 unnecessary instructions for i320 or i384.

https://alive2.llvm.org/ce/z/-bGiUs

Full code

Original IR

; ModuleID = 'x86_poc'
target triple = "arm64"
; target triple = "x86_64"

@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64

; Function Attrs: hot
; Addition-with-conditional-subtraction helper on 256-bit integers
; (the "u64x4" suffix suggests 4 x 64-bit limbs).
;   %0 - destination pointer; receives the i256 result
;   %1 - pointer to operand a
;   %2 - pointer to operand b
;   %3 - pointer to the modulus M
; Stores a+b if bit 255 of (a+b)-M is set, otherwise stores (a+b)-M.
; NOTE(review): using the top bit as the "went negative" indicator presumably
; assumes a+b does not wrap and that M leaves bit 255 clear ("noo" may stand
; for "no overflow") - confirm against the callers.
define internal fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  ; Wrapping sum, then tentative reduction by M.
  %a_plus_b = add i256 %a, %b
  %5 = sub i256 %a_plus_b, %M
  ; Extract the most significant bit of the difference as an i1.
  %6 = lshr i256 %5, 255
  %7 = trunc i256 %6 to i1
  ; Top bit set -> keep the unreduced sum; otherwise keep the difference.
  %8 = select i1 %7, i256 %a_plus_b, i256 %5
  store i256 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; Addition-with-conditional-subtraction helper on 320-bit integers
; (the "u64x5" suffix suggests 5 x 64-bit limbs).
;   %0 - destination pointer; receives the i320 result
;   %1 - pointer to operand a
;   %2 - pointer to operand b
;   %3 - pointer to the modulus M
; Stores a+b if bit 319 of (a+b)-M is set, otherwise stores (a+b)-M.
; NOTE(review): using the top bit as the "went negative" indicator presumably
; assumes a+b does not wrap and that M leaves bit 319 clear - confirm against
; the callers.
define internal fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i320, ptr %1, align 4
  %b = load i320, ptr %2, align 4
  %M = load i320, ptr %3, align 4
  ; Wrapping sum, then tentative reduction by M.
  %a_plus_b = add i320 %a, %b
  %5 = sub i320 %a_plus_b, %M
  ; Extract the most significant bit of the difference as an i1.
  %6 = lshr i320 %5, 319
  %7 = trunc i320 %6 to i1
  ; Top bit set -> keep the unreduced sum; otherwise keep the difference.
  %8 = select i1 %7, i320 %a_plus_b, i320 %5
  store i320 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; Addition-with-conditional-subtraction helper on 384-bit integers
; (the "u64x6" suffix suggests 6 x 64-bit limbs).
;   %0 - destination pointer; receives the i384 result
;   %1 - pointer to operand a
;   %2 - pointer to operand b
;   %3 - pointer to the modulus M
; Stores a+b if bit 383 of (a+b)-M is set, otherwise stores (a+b)-M.
; NOTE(review): using the top bit as the "went negative" indicator presumably
; assumes a+b does not wrap and that M leaves bit 383 clear - confirm against
; the callers.
define internal fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i384, ptr %1, align 4
  %b = load i384, ptr %2, align 4
  %M = load i384, ptr %3, align 4
  ; Wrapping sum, then tentative reduction by M.
  %a_plus_b = add i384 %a, %b
  %5 = sub i384 %a_plus_b, %M
  ; Extract the most significant bit of the difference as an i1.
  %6 = lshr i384 %5, 383
  %7 = trunc i384 %6 to i1
  ; Top bit set -> keep the unreduced sum; otherwise keep the difference.
  %8 = select i1 %7, i384 %a_plus_b, i384 %5
  store i384 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; Externally visible entry point: forwards its three pointer arguments
; (%0 destination, %1 and %2 operands) to the 256-bit helper and supplies the
; address of the constant @bn254_snarks_fp_mod as the modulus pointer.
define void @bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bn254_snarks_fp" {
  call fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fp_mod)
  ret void
}

; Function Attrs: hot
; Externally visible entry point: forwards its three pointer arguments
; (%0 destination, %1 and %2 operands) to the 320-bit helper and supplies the
; address of the constant @bls24_317_fp_mod as the modulus pointer.
define void @bls24_317_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls24_317_fp" {
  call fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr @bls24_317_fp_mod)
  ret void
}


; Function Attrs: hot
; Externally visible entry point: forwards its three pointer arguments
; (%0 destination, %1 and %2 operands) to the 384-bit helper and supplies the
; address of the constant @bls12_381_fp_mod as the modulus pointer.
define void @bls12_381_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls12_381_fp" {
  call fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr @bls12_381_fp_mod)
  ret void
}

attributes #2 = { hot }

After opt -O3

target triple = "arm64"
; target triple = "x86_64"

; bn254_snarks_fp_add after `opt -O3`: the helper has been inlined, the load
; of the modulus has been folded into an immediate, `sub M` has become
; `add -M`, and the lshr/trunc sign-bit extraction has become `icmp slt ..., 0`.
define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bn254_snarks_fp" {
  %.val = load i256, ptr %1, align 4
  %.val1 = load i256, ptr %2, align 4
  %a_plus_b.i = add i256 %.val1, %.val
  ; (a + b) - M, expressed as an addition of the negated modulus constant.
  %4 = add i256 %a_plus_b.i, -21888242871839275222246405745257275088696311157297823662689037894645226208583
  ; True when the difference is negative as a signed i256 (bit 255 set).
  %.not1.i = icmp slt i256 %4, 0
  ; Negative difference -> keep a + b; otherwise keep the difference.
  %5 = select i1 %.not1.i, i256 %a_plus_b.i, i256 %4
  store i256 %5, ptr %0, align 4
  ret void
}

; bls24_317_fp_add after `opt -O3`: the helper has been inlined, the load of
; the modulus has been folded into an immediate, `sub M` has become `add -M`,
; and the lshr/trunc sign-bit extraction has become `icmp slt ..., 0`.
define void @bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls24_317_fp" {
  %.val = load i320, ptr %1, align 4
  %.val1 = load i320, ptr %2, align 4
  %a_plus_b.i = add i320 %.val1, %.val
  ; (a + b) - M, expressed as an addition of the negated modulus constant.
  %4 = add i320 %a_plus_b.i, -136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051
  ; True when the difference is negative as a signed i320 (bit 319 set).
  %.not1.i = icmp slt i320 %4, 0
  ; Negative difference -> keep a + b; otherwise keep the difference.
  %5 = select i1 %.not1.i, i320 %a_plus_b.i, i320 %4
  store i320 %5, ptr %0, align 4
  ret void
}

; bls12_381_fp_add after `opt -O3`: the helper has been inlined, the load of
; the modulus has been folded into an immediate, `sub M` has become `add -M`,
; and the lshr/trunc sign-bit extraction has become `icmp slt ..., 0`.
define void @bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls12_381_fp" {
  %.val = load i384, ptr %1, align 4
  %.val1 = load i384, ptr %2, align 4
  %a_plus_b.i = add i384 %.val1, %.val
  ; (a + b) - M, expressed as an addition of the negated modulus constant.
  %4 = add i384 %a_plus_b.i, -4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
  ; True when the difference is negative as a signed i384 (bit 383 set).
  %.not1.i = icmp slt i384 %4, 0
  ; Negative difference -> keep a + b; otherwise keep the difference.
  %5 = select i1 %.not1.i, i384 %a_plus_b.i, i384 %4
  store i384 %5, ptr %0, align 4
  ret void
}

attributes #0 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }

Assembly

// ---------------------------------------------------------------------------
// AArch64 code generated for the i256 variant (4 x 64-bit limbs).
//   x0 = destination, x1 = pointer to a, x2 = pointer to b
// Phase 1: sum = a + b        (adds/adcs/adcs/adc chain into x8, x10, x12, x9)
// Phase 2: diff = sum + K     (adds/adcs/adcs/adc chain into x11, x13, x14, x15)
//          K is materialised limb by limb with mov/movk; per the optimized IR
//          it is the negated modulus constant.
// Phase 3: per-limb csel on "lt" chooses sum (diff negative) or diff.
// The constant-building mov/movk instructions are interleaved with the
// arithmetic by the scheduler; mov/movk do not modify the flags.
// ---------------------------------------------------------------------------
bn254_snarks_fp_add:                    // @bn254_snarks_fp_add
        ldp     x8, x10, [x1]                   // a[0], a[1]
        mov     x15, #24534                     // =0x5fd6
        ldp     x9, x11, [x2]                   // b[0], b[1]
        movk    x15, #7886, lsl #16
        ldp     x13, x14, [x2, #16]             // b[2], b[3]
        movk    x15, #45453, lsl #32
        movk    x15, #53147, lsl #48            // x15 = K limb 3
        adds    x8, x9, x8                      // sum[0], sets carry
        ldp     x12, x9, [x1, #16]              // a[2], a[3]
        adcs    x10, x11, x10                   // sum[1]
        mov     x11, #697                       // =0x2b9
        movk    x11, #10115, lsl #16
        adcs    x12, x13, x12                   // sum[2]
        mov     x13, #13682                     // =0x3572
        movk    x11, #29673, lsl #32
        movk    x13, #38798, lsl #16
        adc     x9, x14, x9                     // sum[3]; carry-out is not kept
        mov     x14, #42914                     // =0xa7a2
        movk    x11, #50143, lsl #48            // x11 = K limb 0
        movk    x13, #38254, lsl #32
        movk    x14, #32382, lsl #16
        movk    x13, #26750, lsl #48            // x13 = K limb 1
        adds    x11, x8, x11                    // diff[0]
        movk    x14, #47689, lsl #32
        movk    x14, #18351, lsl #48            // x14 = K limb 2
        adcs    x13, x10, x13                   // diff[1]
        adcs    x14, x12, x14                   // diff[2]
        adc     x15, x9, x15                    // diff[3]; adc leaves the flags untouched
        // The sign of the top limb is re-derived with a separate compare; this
        // is the instruction the report calls out as redundant.
        cmp     x15, #0
        csel    x12, x12, x14, lt               // limb 2: sum if diff < 0, else diff
        csel    x9, x9, x15, lt                 // limb 3
        csel    x8, x8, x11, lt                 // limb 0
        stp     x12, x9, [x0, #16]              // store limbs 2 and 3
        csel    x9, x10, x13, lt                // limb 1 (x9 reused after the store)
        stp     x8, x9, [x0]                    // store limbs 0 and 1
        ret
// ---------------------------------------------------------------------------
// AArch64 code generated for the i320 variant (5 x 64-bit limbs).
//   x0 = destination, x1 = pointer to a, x2 = pointer to b
// Phase 1: sum = a + b        (into x8, x10, x12, x9, x11)
// Phase 2: diff = sum + K     (into x13, x14, x15, x16, x17)
//          K is materialised limb by limb with mov/movk; per the optimized IR
//          it is the negated modulus constant.
// Phase 3: asr + cmp derive the sign of the top limb, then per-limb csel on
//          "lt" chooses sum (diff negative) or diff.
// NOTE(review): x18 is used as a scratch register here; it is
// platform-reserved on some AArch64 targets - presumably acceptable for the
// bare "arm64" triple used in the report.
// ---------------------------------------------------------------------------
bls24_317_fp_add:                       // @bls24_317_fp_add
        ldp     x8, x10, [x1]                   // a[0], a[1]
        mov     x16, #12230                     // =0x2fc6
        ldp     x9, x11, [x2]                   // b[0], b[1]
        movk    x16, #18438, lsl #16
        ldp     x13, x14, [x2, #16]             // b[2], b[3]
        mov     x17, #30419                     // =0x76d3
        ldr     x15, [x2, #32]                  // b[4]
        movk    x16, #14943, lsl #32
        movk    x17, #37023, lsl #16
        adds    x8, x9, x8                      // sum[0], sets carry
        ldp     x12, x9, [x1, #16]              // a[2], a[3]
        adcs    x10, x11, x10                   // sum[1]
        ldr     x11, [x1, #32]                  // a[4]
        movk    x16, #3440, lsl #48             // x16 = K limb 3
        movk    x17, #13789, lsl #32
        adcs    x12, x13, x12                   // sum[2]
        mov     x13, #54613                     // =0xd555
        movk    x17, #61351, lsl #48            // x17 = K limb 4
        movk    x13, #41556, lsl #16
        adcs    x9, x14, x9                     // sum[3]
        mov     x14, #16513                     // =0x4081
        movk    x13, #53673, lsl #32
        movk    x14, #52187, lsl #16
        adc     x11, x15, x11                   // sum[4]; carry-out is not kept
        mov     x15, #52153                     // =0xcbb9
        movk    x13, #29358, lsl #48            // x13 = K limb 0
        movk    x14, #50715, lsl #32
        movk    x15, #31544, lsl #16
        movk    x14, #10508, lsl #48            // x14 = K limb 1
        adds    x13, x8, x13                    // diff[0]
        movk    x15, #40473, lsl #32
        adcs    x14, x10, x14                   // diff[1]
        movk    x15, #59749, lsl #48            // x15 = K limb 2
        adcs    x15, x12, x15                   // diff[2]
        adcs    x16, x9, x16                    // diff[3]
        adc     x17, x11, x17                   // diff[4]; adc leaves the flags untouched
        // Sign of the top limb is smeared across x18 and then compared with
        // zero; these are the two instructions the report calls out as redundant.
        asr     x18, x17, #63
        cmp     x18, #0
        csel    x11, x11, x17, lt               // limb 4: sum if diff < 0, else diff
        csel    x9, x9, x16, lt                 // limb 3
        csel    x12, x12, x15, lt               // limb 2
        csel    x8, x8, x13, lt                 // limb 0
        stp     x9, x11, [x0, #24]              // store limbs 3 and 4
        csel    x9, x10, x14, lt                // limb 1 (x9 reused after the store)
        str     x8, [x0]                        // store limb 0
        stp     x9, x12, [x0, #8]               // store limbs 1 and 2
        ret
// ---------------------------------------------------------------------------
// AArch64 code generated for the i384 variant (6 x 64-bit limbs).
//   x0 = destination, x1 = pointer to a, x2 = pointer to b
// Phase 1: sum = a + b        (into x8, x10, x12, x9, x11, x13)
// Phase 2: diff = sum + K     (into x14, x15, x16, x17, x18, x1)
//          K is materialised limb by limb with mov/movk; per the optimized IR
//          it is the negated modulus constant.
// Phase 3: asr + cmp derive the sign of the top limb, then per-limb csel on
//          "lt" chooses sum (diff negative) or diff.
// x1 and x2 are recycled as scratch once the last loads through them are done.
// NOTE(review): x18 is used as a scratch register here; it is
// platform-reserved on some AArch64 targets - presumably acceptable for the
// bare "arm64" triple used in the report.
// ---------------------------------------------------------------------------
bls12_381_fp_add:                       // @bls12_381_fp_add
        ldp     x8, x10, [x1]                   // a[0], a[1]
        mov     x17, #60736                     // =0xed40
        ldp     x9, x11, [x2]                   // b[0], b[1]
        movk    x17, #3194, lsl #16
        ldp     x13, x14, [x2, #16]             // b[2], b[3]
        mov     x18, #21288                     // =0x5328
        ldp     x15, x16, [x2, #32]             // b[4], b[5]
        movk    x17, #46203, lsl #32
        adds    x8, x9, x8                      // sum[0], sets carry
        ldp     x12, x9, [x1, #16]              // a[2], a[3]
        adcs    x10, x11, x10                   // sum[1]
        movk    x18, #48308, lsl #16
        movk    x17, #39816, lsl #48            // x17 = K limb 3
        movk    x18, #22601, lsl #32
        adcs    x12, x13, x12                   // sum[2]
        ldp     x11, x13, [x1, #32]             // a[4], a[5] (last load through x1)
        adcs    x9, x14, x9                     // sum[3]
        mov     x14, #21845                     // =0x5555
        mov     x1, #6501                       // =0x1965
        movk    x14, #17921, lsl #48            // x14 = K limb 0
        movk    x1, #50816, lsl #16
        movk    x18, #46308, lsl #48            // x18 = K limb 4
        adcs    x11, x15, x11                   // sum[4]
        mov     x15, #1319895040                // =0x4eac0000
        movk    x1, #60949, lsl #32
        adc     x13, x16, x13                   // sum[5]; carry-out is not kept
        mov     x16, #2523                      // =0x9db
        movk    x15, #1, lsl #32
        movk    x16, #2383, lsl #16
        movk    x15, #57684, lsl #48            // x15 = K limb 1
        adds    x14, x8, x14                    // diff[0]
        movk    x16, #11615, lsl #32
        adcs    x15, x10, x15                   // diff[1]
        movk    x1, #58878, lsl #48             // x1 = K limb 5
        movk    x16, #39119, lsl #48            // x16 = K limb 2
        adcs    x16, x12, x16                   // diff[2]
        adcs    x17, x9, x17                    // diff[3]
        adcs    x18, x11, x18                   // diff[4]
        adc     x1, x13, x1                     // diff[5]; adc leaves the flags untouched
        // Same asr + cmp pair that the report flags as redundant for i320;
        // the sign of the top limb is smeared across x2 and compared with zero.
        asr     x2, x1, #63
        cmp     x2, #0
        csel    x11, x11, x18, lt               // limb 4: sum if diff < 0, else diff
        csel    x13, x13, x1, lt                // limb 5
        csel    x9, x9, x17, lt                 // limb 3
        stp     x11, x13, [x0, #32]             // store limbs 4 and 5
        csel    x11, x12, x16, lt               // limb 2 (x11 reused after the store)
        csel    x8, x8, x14, lt                 // limb 0
        stp     x11, x9, [x0, #16]              // store limbs 2 and 3
        csel    x9, x10, x15, lt                // limb 1 (x9 reused after the store)
        stp     x8, x9, [x0]                    // store limbs 0 and 1
        ret

Analysis

With i256, the cmp is useless in this sequence

        movk    x11, #50143, lsl #48
        movk    x13, #38254, lsl #32
        movk    x14, #32382, lsl #16
        movk    x13, #26750, lsl #48
        adds    x11, x8, x11
        movk    x14, #47689, lsl #32
        movk    x14, #18351, lsl #48
        adcs    x13, x10, x13
        adcs    x14, x12, x14
        adc     x15, x9, x15
        cmp     x15, #0 // <----- unnecessary
        csel    x12, x12, x14, lt
        csel    x9, x9, x15, lt
        csel    x8, x8, x11, lt
        stp     x12, x9, [x0, #16]
        csel    x9, x10, x13, lt
        stp     x8, x9, [x0]
        ret

as demonstrated by #103717

With i320, similar to x86 #103841, there is an additional asr instruction on top of the cmp:

        movk    x13, #29358, lsl #48
        movk    x14, #50715, lsl #32
        movk    x15, #31544, lsl #16
        movk    x14, #10508, lsl #48
        adds    x13, x8, x13
        movk    x15, #40473, lsl #32
        adcs    x14, x10, x14
        movk    x15, #59749, lsl #48
        adcs    x15, x12, x15
        adcs    x16, x9, x16
        adc     x17, x11, x17
        asr     x18, x17, #63  // <----- unnecessary
        cmp     x18, #0  // <----- unnecessary
        csel    x11, x11, x17, lt
        csel    x9, x9, x16, lt
        csel    x12, x12, x15, lt
        csel    x8, x8, x13, lt
        stp     x9, x11, [x0, #24]
        csel    x9, x10, x14, lt
        str     x8, [x0]
        stp     x9, x12, [x0, #8]
        ret

The text was updated successfully, but these errors were encountered:

llvmbot · 2024-08-14T13:58:13Z

@llvm/issue-subscribers-backend-aarch64

Author: Mamy Ratsimbazafy (mratsim)

Same IR as https://github.com/llvm/llvm-project/issues/103841 but applied to Aarch64 as an alternative to https://github.com/llvm/llvm-project/issues/103717

Unlike x86, there is always an extra instruction even for i256, and there are 2 unnecessary instructions for i320 or i384.

https://alive2.llvm.org/ce/z/-bGiUs

Full code

Original IR

; ModuleID = 'x86_poc'
target triple = "arm64"
; target triple = "x86_64"

@<!-- -->bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@<!-- -->bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@<!-- -->bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64

; Function Attrs: hot
define internal fastcc void @<!-- -->_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) #<!-- -->2 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %a_plus_b = add i256 %a, %b
  %5 = sub i256 %a_plus_b, %M
  %6 = lshr i256 %5, 255
  %7 = trunc i256 %6 to i1
  %8 = select i1 %7, i256 %a_plus_b, i256 %5
  store i256 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
define internal fastcc void @<!-- -->_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr %3) #<!-- -->2 section "ctt.fields" {
  %a = load i320, ptr %1, align 4
  %b = load i320, ptr %2, align 4
  %M = load i320, ptr %3, align 4
  %a_plus_b = add i320 %a, %b
  %5 = sub i320 %a_plus_b, %M
  %6 = lshr i320 %5, 319
  %7 = trunc i320 %6 to i1
  %8 = select i1 %7, i320 %a_plus_b, i320 %5
  store i320 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
define internal fastcc void @<!-- -->_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr %3) #<!-- -->2 section "ctt.fields" {
  %a = load i384, ptr %1, align 4
  %b = load i384, ptr %2, align 4
  %M = load i384, ptr %3, align 4
  %a_plus_b = add i384 %a, %b
  %5 = sub i384 %a_plus_b, %M
  %6 = lshr i384 %5, 383
  %7 = trunc i384 %6 to i1
  %8 = select i1 %7, i384 %a_plus_b, i384 %5
  store i384 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
define void @<!-- -->bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) #<!-- -->2 section "ctt.bn254_snarks_fp" {
  call fastcc void @<!-- -->_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr @<!-- -->bn254_snarks_fp_mod)
  ret void
}

; Function Attrs: hot
define void @<!-- -->bls24_317_fp_add(ptr %0, ptr %1, ptr %2) #<!-- -->2 section "ctt.bls24_317_fp" {
  call fastcc void @<!-- -->_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr @<!-- -->bls24_317_fp_mod)
  ret void
}


; Function Attrs: hot
define void @<!-- -->bls12_381_fp_add(ptr %0, ptr %1, ptr %2) #<!-- -->2 section "ctt.bls12_381_fp" {
  call fastcc void @<!-- -->_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr @<!-- -->bls12_381_fp_mod)
  ret void
}

attributes #<!-- -->2 = { hot }

After opt -O3

target triple = "arm64"
; target triple = "x86_64"

define void @<!-- -->bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #<!-- -->0 section "ctt.bn254_snarks_fp" {
  %.val = load i256, ptr %1, align 4
  %.val1 = load i256, ptr %2, align 4
  %a_plus_b.i = add i256 %.val1, %.val
  %4 = add i256 %a_plus_b.i, -21888242871839275222246405745257275088696311157297823662689037894645226208583
  %.not1.i = icmp slt i256 %4, 0
  %5 = select i1 %.not1.i, i256 %a_plus_b.i, i256 %4
  store i256 %5, ptr %0, align 4
  ret void
}

define void @<!-- -->bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #<!-- -->0 section "ctt.bls24_317_fp" {
  %.val = load i320, ptr %1, align 4
  %.val1 = load i320, ptr %2, align 4
  %a_plus_b.i = add i320 %.val1, %.val
  %4 = add i320 %a_plus_b.i, -136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051
  %.not1.i = icmp slt i320 %4, 0
  %5 = select i1 %.not1.i, i320 %a_plus_b.i, i320 %4
  store i320 %5, ptr %0, align 4
  ret void
}

define void @<!-- -->bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #<!-- -->0 section "ctt.bls12_381_fp" {
  %.val = load i384, ptr %1, align 4
  %.val1 = load i384, ptr %2, align 4
  %a_plus_b.i = add i384 %.val1, %.val
  %4 = add i384 %a_plus_b.i, -4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
  %.not1.i = icmp slt i384 %4, 0
  %5 = select i1 %.not1.i, i384 %a_plus_b.i, i384 %4
  store i384 %5, ptr %0, align 4
  ret void
}

attributes #<!-- -->0 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }

Assembly

bn254_snarks_fp_add:                    // @<!-- -->bn254_snarks_fp_add
        ldp     x8, x10, [x1]
        mov     x15, #<!-- -->24534                     // =0x5fd6
        ldp     x9, x11, [x2]
        movk    x15, #<!-- -->7886, lsl #<!-- -->16
        ldp     x13, x14, [x2, #<!-- -->16]
        movk    x15, #<!-- -->45453, lsl #<!-- -->32
        movk    x15, #<!-- -->53147, lsl #<!-- -->48
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #<!-- -->16]
        adcs    x10, x11, x10
        mov     x11, #<!-- -->697                       // =0x2b9
        movk    x11, #<!-- -->10115, lsl #<!-- -->16
        adcs    x12, x13, x12
        mov     x13, #<!-- -->13682                     // =0x3572
        movk    x11, #<!-- -->29673, lsl #<!-- -->32
        movk    x13, #<!-- -->38798, lsl #<!-- -->16
        adc     x9, x14, x9
        mov     x14, #<!-- -->42914                     // =0xa7a2
        movk    x11, #<!-- -->50143, lsl #<!-- -->48
        movk    x13, #<!-- -->38254, lsl #<!-- -->32
        movk    x14, #<!-- -->32382, lsl #<!-- -->16
        movk    x13, #<!-- -->26750, lsl #<!-- -->48
        adds    x11, x8, x11
        movk    x14, #<!-- -->47689, lsl #<!-- -->32
        movk    x14, #<!-- -->18351, lsl #<!-- -->48
        adcs    x13, x10, x13
        adcs    x14, x12, x14
        adc     x15, x9, x15
        cmp     x15, #<!-- -->0
        csel    x12, x12, x14, lt
        csel    x9, x9, x15, lt
        csel    x8, x8, x11, lt
        stp     x12, x9, [x0, #<!-- -->16]
        csel    x9, x10, x13, lt
        stp     x8, x9, [x0]
        ret
bls24_317_fp_add:                       // @<!-- -->bls24_317_fp_add
        ldp     x8, x10, [x1]
        mov     x16, #<!-- -->12230                     // =0x2fc6
        ldp     x9, x11, [x2]
        movk    x16, #<!-- -->18438, lsl #<!-- -->16
        ldp     x13, x14, [x2, #<!-- -->16]
        mov     x17, #<!-- -->30419                     // =0x76d3
        ldr     x15, [x2, #<!-- -->32]
        movk    x16, #<!-- -->14943, lsl #<!-- -->32
        movk    x17, #<!-- -->37023, lsl #<!-- -->16
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #<!-- -->16]
        adcs    x10, x11, x10
        ldr     x11, [x1, #<!-- -->32]
        movk    x16, #<!-- -->3440, lsl #<!-- -->48
        movk    x17, #<!-- -->13789, lsl #<!-- -->32
        adcs    x12, x13, x12
        mov     x13, #<!-- -->54613                     // =0xd555
        movk    x17, #<!-- -->61351, lsl #<!-- -->48
        movk    x13, #<!-- -->41556, lsl #<!-- -->16
        adcs    x9, x14, x9
        mov     x14, #<!-- -->16513                     // =0x4081
        movk    x13, #<!-- -->53673, lsl #<!-- -->32
        movk    x14, #<!-- -->52187, lsl #<!-- -->16
        adc     x11, x15, x11
        mov     x15, #<!-- -->52153                     // =0xcbb9
        movk    x13, #<!-- -->29358, lsl #<!-- -->48
        movk    x14, #<!-- -->50715, lsl #<!-- -->32
        movk    x15, #<!-- -->31544, lsl #<!-- -->16
        movk    x14, #<!-- -->10508, lsl #<!-- -->48
        adds    x13, x8, x13
        movk    x15, #<!-- -->40473, lsl #<!-- -->32
        adcs    x14, x10, x14
        movk    x15, #<!-- -->59749, lsl #<!-- -->48
        adcs    x15, x12, x15
        adcs    x16, x9, x16
        adc     x17, x11, x17
        asr     x18, x17, #<!-- -->63
        cmp     x18, #<!-- -->0
        csel    x11, x11, x17, lt
        csel    x9, x9, x16, lt
        csel    x12, x12, x15, lt
        csel    x8, x8, x13, lt
        stp     x9, x11, [x0, #<!-- -->24]
        csel    x9, x10, x14, lt
        str     x8, [x0]
        stp     x9, x12, [x0, #<!-- -->8]
        ret
bls12_381_fp_add:                       // @<!-- -->bls12_381_fp_add
        ldp     x8, x10, [x1]
        mov     x17, #<!-- -->60736                     // =0xed40
        ldp     x9, x11, [x2]
        movk    x17, #<!-- -->3194, lsl #<!-- -->16
        ldp     x13, x14, [x2, #<!-- -->16]
        mov     x18, #<!-- -->21288                     // =0x5328
        ldp     x15, x16, [x2, #<!-- -->32]
        movk    x17, #<!-- -->46203, lsl #<!-- -->32
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #<!-- -->16]
        adcs    x10, x11, x10
        movk    x18, #<!-- -->48308, lsl #<!-- -->16
        movk    x17, #<!-- -->39816, lsl #<!-- -->48
        movk    x18, #<!-- -->22601, lsl #<!-- -->32
        adcs    x12, x13, x12
        ldp     x11, x13, [x1, #<!-- -->32]
        adcs    x9, x14, x9
        mov     x14, #<!-- -->21845                     // =0x5555
        mov     x1, #<!-- -->6501                       // =0x1965
        movk    x14, #<!-- -->17921, lsl #<!-- -->48
        movk    x1, #<!-- -->50816, lsl #<!-- -->16
        movk    x18, #<!-- -->46308, lsl #<!-- -->48
        adcs    x11, x15, x11
        mov     x15, #<!-- -->1319895040                // =0x4eac0000
        movk    x1, #<!-- -->60949, lsl #<!-- -->32
        adc     x13, x16, x13
        mov     x16, #<!-- -->2523                      // =0x9db
        movk    x15, #<!-- -->1, lsl #<!-- -->32
        movk    x16, #<!-- -->2383, lsl #<!-- -->16
        movk    x15, #<!-- -->57684, lsl #<!-- -->48
        adds    x14, x8, x14
        movk    x16, #<!-- -->11615, lsl #<!-- -->32
        adcs    x15, x10, x15
        movk    x1, #<!-- -->58878, lsl #<!-- -->48
        movk    x16, #<!-- -->39119, lsl #<!-- -->48
        adcs    x16, x12, x16
        adcs    x17, x9, x17
        adcs    x18, x11, x18
        adc     x1, x13, x1
        asr     x2, x1, #<!-- -->63
        cmp     x2, #<!-- -->0
        csel    x11, x11, x18, lt
        csel    x13, x13, x1, lt
        csel    x9, x9, x17, lt
        stp     x11, x13, [x0, #<!-- -->32]
        csel    x11, x12, x16, lt
        csel    x8, x8, x14, lt
        stp     x11, x9, [x0, #<!-- -->16]
        csel    x9, x10, x15, lt
        stp     x8, x9, [x0]
        ret

Analysis

With i256, the cmp is useless in this sequence

        movk    x11, #<!-- -->50143, lsl #<!-- -->48
        movk    x13, #<!-- -->38254, lsl #<!-- -->32
        movk    x14, #<!-- -->32382, lsl #<!-- -->16
        movk    x13, #<!-- -->26750, lsl #<!-- -->48
        adds    x11, x8, x11
        movk    x14, #<!-- -->47689, lsl #<!-- -->32
        movk    x14, #<!-- -->18351, lsl #<!-- -->48
        adcs    x13, x10, x13
        adcs    x14, x12, x14
        adc     x15, x9, x15
        cmp     x15, #0 // <----- unnecessary
        csel    x12, x12, x14, lt
        csel    x9, x9, x15, lt
        csel    x8, x8, x11, lt
        stp     x12, x9, [x0, #<!-- -->16]
        csel    x9, x10, x13, lt
        stp     x8, x9, [x0]
        ret

as demonstrated by #103717

With i320, similar to x86 #103841, there is an additional asr instruction on top of the cmp:

        movk    x13, #<!-- -->29358, lsl #<!-- -->48
        movk    x14, #<!-- -->50715, lsl #<!-- -->32
        movk    x15, #<!-- -->31544, lsl #<!-- -->16
        movk    x14, #<!-- -->10508, lsl #<!-- -->48
        adds    x13, x8, x13
        movk    x15, #<!-- -->40473, lsl #<!-- -->32
        adcs    x14, x10, x14
        movk    x15, #<!-- -->59749, lsl #<!-- -->48
        adcs    x15, x12, x15
        adcs    x16, x9, x16
        adc     x17, x11, x17
        asr     x18, x17, #63  // <----- unnecessary
        cmp     x18, #0  // <----- unnecessary
        csel    x11, x11, x17, lt
        csel    x9, x9, x16, lt
        csel    x12, x12, x15, lt
        csel    x8, x8, x13, lt
        stp     x9, x11, [x0, #<!-- -->24]
        csel    x9, x10, x14, lt
        str     x8, [x0]
        stp     x9, x12, [x0, #<!-- -->8]
        ret

github-actions bot added the new issue label Aug 14, 2024

This was referenced Aug 14, 2024

Tracking compiler inefficiencies mratsim/constantine#357

Open

llvm: more tentatives at optimal field addition with pure LLVM IR mratsim/constantine#457

Draft

EugeneZelenko added backend:AArch64 missed-optimization and removed new issue labels Aug 14, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Isel Aarch64] extra instruction (i256) or 2 instructions (i320) when chaining icmp and select based on underflow #103855

[Isel Aarch64] extra instruction (i256) or 2 instructions (i320) when chaining icmp and select based on underflow #103855

mratsim commented Aug 14, 2024

llvmbot commented Aug 14, 2024

Full code

Original IR

After opt -O3

Assembly

Analysis

[Isel Aarch64] extra instruction (i256) or 2 instructions (i320) when chaining icmp and select based on underflow #103855

[Isel Aarch64] extra instruction (i256) or 2 instructions (i320) when chaining icmp and select based on underflow #103855

Comments

mratsim commented Aug 14, 2024

Full code

Original IR

After opt -O3

Assembly

Analysis

llvmbot commented Aug 14, 2024

Full code

Original IR

After opt -O3

Assembly

Analysis