Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Isel Aarch64] extra instruction (i256) or 2 instructions (i320) when chaining icmp and select based on underflow #103855

Open
mratsim opened this issue Aug 14, 2024 · 1 comment

Comments

@mratsim
Copy link

mratsim commented Aug 14, 2024

Same IR as #103841, but compiled for AArch64, as an alternative to #103717.

Unlike on x86, there is always an extra instruction, even for i256, and there are two unnecessary instructions for i320 or i384.

https://alive2.llvm.org/ce/z/-bGiUs

Full code

Original IR

; ModuleID = 'x86_poc'
; The module is built for arm64 here; the x86_64 triple below is commented out.
target triple = "arm64"
; target triple = "x86_64"

; Moduli, one per integer width (i256, i384, i320). The public *_fp_add
; wrappers pass the address of the matching constant as the last argument of
; the _modadd_noo.* helpers. Each constant has its own section and 64-byte
; alignment.
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64

; Function Attrs: hot
; i256 helper: stores to %0 either (a + b) or (a + b) - M, where a, b and M
; are the i256 values loaded from %1, %2 and %3.
; The choice is made on bit 255 (the most significant bit) of the difference:
; when it is set the plain sum is kept, otherwise the difference is kept.
define internal fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %a_plus_b = add i256 %a, %b
  ; %5 = (a + b) - M
  %5 = sub i256 %a_plus_b, %M
  ; %7 = top bit of %5, obtained by shifting it down to bit 0 and truncating
  %6 = lshr i256 %5, 255
  %7 = trunc i256 %6 to i1
  ; top bit set -> keep a + b; clear -> keep (a + b) - M
  %8 = select i1 %7, i256 %a_plus_b, i256 %5
  store i256 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; i320 helper: stores to %0 either (a + b) or (a + b) - M, where a, b and M
; are the i320 values loaded from %1, %2 and %3.
; The choice is made on bit 319 (the most significant bit) of the difference:
; when it is set the plain sum is kept, otherwise the difference is kept.
define internal fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i320, ptr %1, align 4
  %b = load i320, ptr %2, align 4
  %M = load i320, ptr %3, align 4
  %a_plus_b = add i320 %a, %b
  ; %5 = (a + b) - M
  %5 = sub i320 %a_plus_b, %M
  ; %7 = top bit of %5, obtained by shifting it down to bit 0 and truncating
  %6 = lshr i320 %5, 319
  %7 = trunc i320 %6 to i1
  ; top bit set -> keep a + b; clear -> keep (a + b) - M
  %8 = select i1 %7, i320 %a_plus_b, i320 %5
  store i320 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; i384 helper: stores to %0 either (a + b) or (a + b) - M, where a, b and M
; are the i384 values loaded from %1, %2 and %3.
; The choice is made on bit 383 (the most significant bit) of the difference:
; when it is set the plain sum is kept, otherwise the difference is kept.
define internal fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i384, ptr %1, align 4
  %b = load i384, ptr %2, align 4
  %M = load i384, ptr %3, align 4
  %a_plus_b = add i384 %a, %b
  ; %5 = (a + b) - M
  %5 = sub i384 %a_plus_b, %M
  ; %7 = top bit of %5, obtained by shifting it down to bit 0 and truncating
  %6 = lshr i384 %5, 383
  %7 = trunc i384 %6 to i1
  ; top bit set -> keep a + b; clear -> keep (a + b) - M
  %8 = select i1 %7, i384 %a_plus_b, i384 %5
  store i384 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; Externally visible entry point: forwards its three pointers to the i256
; helper and supplies @bn254_snarks_fp_mod as the modulus pointer.
define void @bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bn254_snarks_fp" {
  call fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fp_mod)
  ret void
}

; Function Attrs: hot
; Externally visible entry point: forwards its three pointers to the i320
; helper and supplies @bls24_317_fp_mod as the modulus pointer.
define void @bls24_317_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls24_317_fp" {
  call fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr @bls24_317_fp_mod)
  ret void
}


; Function Attrs: hot
; Externally visible entry point: forwards its three pointers to the i384
; helper and supplies @bls12_381_fp_mod as the modulus pointer.
define void @bls12_381_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls12_381_fp" {
  call fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr @bls12_381_fp_mod)
  ret void
}

; Attribute group #2: referenced by every function definition in this module.
attributes #2 = { hot }

After opt -O3

target triple = "arm64"
; target triple = "x86_64"

; bn254_snarks_fp_add after opt -O3. The helper call no longer appears; its
; body is inline. Instead of loading the modulus and subtracting it, the code
; adds the negated modulus as an immediate, and the top-bit test has become a
; signed comparison against zero.
define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bn254_snarks_fp" {
  %.val = load i256, ptr %1, align 4
  %.val1 = load i256, ptr %2, align 4
  %a_plus_b.i = add i256 %.val1, %.val
  ; %4 = (a + b) - M, written as an add of -M
  %4 = add i256 %a_plus_b.i, -21888242871839275222246405745257275088696311157297823662689037894645226208583
  ; negative difference -> keep a + b; otherwise keep the difference
  %.not1.i = icmp slt i256 %4, 0
  %5 = select i1 %.not1.i, i256 %a_plus_b.i, i256 %4
  store i256 %5, ptr %0, align 4
  ret void
}

; bls24_317_fp_add after opt -O3. The helper call no longer appears; its body
; is inline. Instead of loading the modulus and subtracting it, the code adds
; the negated modulus as an immediate, and the top-bit test has become a
; signed comparison against zero.
define void @bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls24_317_fp" {
  %.val = load i320, ptr %1, align 4
  %.val1 = load i320, ptr %2, align 4
  %a_plus_b.i = add i320 %.val1, %.val
  ; %4 = (a + b) - M, written as an add of -M
  %4 = add i320 %a_plus_b.i, -136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051
  ; negative difference -> keep a + b; otherwise keep the difference
  %.not1.i = icmp slt i320 %4, 0
  %5 = select i1 %.not1.i, i320 %a_plus_b.i, i320 %4
  store i320 %5, ptr %0, align 4
  ret void
}

; bls12_381_fp_add after opt -O3. The helper call no longer appears; its body
; is inline. Instead of loading the modulus and subtracting it, the code adds
; the negated modulus as an immediate, and the top-bit test has become a
; signed comparison against zero.
define void @bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls12_381_fp" {
  %.val = load i384, ptr %1, align 4
  %.val1 = load i384, ptr %2, align 4
  %a_plus_b.i = add i384 %.val1, %.val
  ; %4 = (a + b) - M, written as an add of -M
  %4 = add i384 %a_plus_b.i, -4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
  ; negative difference -> keep a + b; otherwise keep the difference
  %.not1.i = icmp slt i384 %4, 0
  %5 = select i1 %.not1.i, i384 %a_plus_b.i, i384 %4
  store i384 %5, ptr %0, align 4
  ret void
}

; Attribute group #0: referenced by the three optimized functions above.
attributes #0 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }

Assembly

// bn254_snarks_fp_add -- AArch64 code generated for the i256 function.
//   x0 = destination pointer; x1, x2 = operand pointers, each read as four
//   64-bit limbs.
// Register roles:
//   x8, x10, x12, x9   = limbs 0..3 of a + b (adds/adcs/adcs/adc chain)
//   x11, x13, x14, x15 = limbs 0..3 of (a + b) plus the constants built with
//                        mov/movk; presumably these are the limbs of the
//                        negated modulus from the optimized IR (not
//                        re-derived here).
// The sign of the top limb x15 then drives one csel per limb: the plain sum is
// kept when x15 is negative, the adjusted value otherwise.
bn254_snarks_fp_add:                    // @bn254_snarks_fp_add
        ldp     x8, x10, [x1]
        mov     x15, #24534                     // =0x5fd6
        ldp     x9, x11, [x2]
        movk    x15, #7886, lsl #16
        ldp     x13, x14, [x2, #16]
        movk    x15, #45453, lsl #32
        movk    x15, #53147, lsl #48
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #16]
        adcs    x10, x11, x10
        mov     x11, #697                       // =0x2b9
        movk    x11, #10115, lsl #16
        adcs    x12, x13, x12
        mov     x13, #13682                     // =0x3572
        movk    x11, #29673, lsl #32
        movk    x13, #38798, lsl #16
        adc     x9, x14, x9
        mov     x14, #42914                     // =0xa7a2
        movk    x11, #50143, lsl #48
        movk    x13, #38254, lsl #32
        movk    x14, #32382, lsl #16
        movk    x13, #26750, lsl #48
        // Second carry chain: add the constants to the a + b limbs.
        adds    x11, x8, x11
        movk    x14, #47689, lsl #32
        movk    x14, #18351, lsl #48
        adcs    x13, x10, x13
        adcs    x14, x12, x14
        adc     x15, x9, x15
        // The final adc does not set flags, so the sign of the top limb is
        // re-derived with an explicit compare; this is the instruction the
        // report calls unnecessary.
        cmp     x15, #0
        csel    x12, x12, x14, lt
        csel    x9, x9, x15, lt
        csel    x8, x8, x11, lt
        stp     x12, x9, [x0, #16]
        csel    x9, x10, x13, lt
        stp     x8, x9, [x0]
        ret
// bls24_317_fp_add -- AArch64 code generated for the i320 function.
//   x0 = destination pointer; x1, x2 = operand pointers, each read as five
//   64-bit limbs.
// Register roles:
//   x8, x10, x12, x9, x11   = limbs 0..4 of a + b
//   x13, x14, x15, x16, x17 = limbs 0..4 of (a + b) plus the constants built
//                             with mov/movk; presumably these are the limbs of
//                             the negated modulus from the optimized IR (not
//                             re-derived here).
// The sign of the top limb x17 is spread into x18 with asr, compared against
// zero, and then drives one csel per limb: the plain sum is kept when
// negative, the adjusted value otherwise.
// NOTE(review): x18 is used as a scratch register; it is platform-reserved on
// some targets -- confirm this is acceptable for the intended platform.
bls24_317_fp_add:                       // @bls24_317_fp_add
        ldp     x8, x10, [x1]
        mov     x16, #12230                     // =0x2fc6
        ldp     x9, x11, [x2]
        movk    x16, #18438, lsl #16
        ldp     x13, x14, [x2, #16]
        mov     x17, #30419                     // =0x76d3
        ldr     x15, [x2, #32]
        movk    x16, #14943, lsl #32
        movk    x17, #37023, lsl #16
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #16]
        adcs    x10, x11, x10
        ldr     x11, [x1, #32]
        movk    x16, #3440, lsl #48
        movk    x17, #13789, lsl #32
        adcs    x12, x13, x12
        mov     x13, #54613                     // =0xd555
        movk    x17, #61351, lsl #48
        movk    x13, #41556, lsl #16
        adcs    x9, x14, x9
        mov     x14, #16513                     // =0x4081
        movk    x13, #53673, lsl #32
        movk    x14, #52187, lsl #16
        adc     x11, x15, x11
        mov     x15, #52153                     // =0xcbb9
        movk    x13, #29358, lsl #48
        movk    x14, #50715, lsl #32
        movk    x15, #31544, lsl #16
        movk    x14, #10508, lsl #48
        // Second carry chain: add the constants to the a + b limbs.
        adds    x13, x8, x13
        movk    x15, #40473, lsl #32
        adcs    x14, x10, x14
        movk    x15, #59749, lsl #48
        adcs    x15, x12, x15
        adcs    x16, x9, x16
        adc     x17, x11, x17
        // x18 = x17 >> 63 (arithmetic): all ones when x17 is negative, else 0.
        // The compare on x18 yields the same `lt` outcome as a compare on x17
        // would; the report flags both instructions as unnecessary.
        asr     x18, x17, #63
        cmp     x18, #0
        csel    x11, x11, x17, lt
        csel    x9, x9, x16, lt
        csel    x12, x12, x15, lt
        csel    x8, x8, x13, lt
        stp     x9, x11, [x0, #24]
        csel    x9, x10, x14, lt
        str     x8, [x0]
        stp     x9, x12, [x0, #8]
        ret
// bls12_381_fp_add -- AArch64 code generated for the i384 function.
//   x0 = destination pointer; x1, x2 = operand pointers, each read as six
//   64-bit limbs. x1 and x2 are reused as scratch once their loads are done.
// Register roles:
//   x8, x10, x12, x9, x11, x13  = limbs 0..5 of a + b
//   x14, x15, x16, x17, x18, x1 = limbs 0..5 of (a + b) plus the constants
//                                 built with mov/movk; presumably these are
//                                 the limbs of the negated modulus from the
//                                 optimized IR (not re-derived here).
// The sign of the top limb x1 is spread into x2 with asr, compared against
// zero, and then drives one csel per limb: the plain sum is kept when
// negative, the adjusted value otherwise.
// NOTE(review): x18 is used as a scratch register; it is platform-reserved on
// some targets -- confirm this is acceptable for the intended platform.
bls12_381_fp_add:                       // @bls12_381_fp_add
        ldp     x8, x10, [x1]
        mov     x17, #60736                     // =0xed40
        ldp     x9, x11, [x2]
        movk    x17, #3194, lsl #16
        ldp     x13, x14, [x2, #16]
        mov     x18, #21288                     // =0x5328
        ldp     x15, x16, [x2, #32]
        movk    x17, #46203, lsl #32
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #16]
        adcs    x10, x11, x10
        movk    x18, #48308, lsl #16
        movk    x17, #39816, lsl #48
        movk    x18, #22601, lsl #32
        adcs    x12, x13, x12
        ldp     x11, x13, [x1, #32]
        adcs    x9, x14, x9
        mov     x14, #21845                     // =0x5555
        // Last load through x1 was just above; x1 now holds a constant limb.
        mov     x1, #6501                       // =0x1965
        movk    x14, #17921, lsl #48
        movk    x1, #50816, lsl #16
        movk    x18, #46308, lsl #48
        adcs    x11, x15, x11
        mov     x15, #1319895040                // =0x4eac0000
        movk    x1, #60949, lsl #32
        adc     x13, x16, x13
        mov     x16, #2523                      // =0x9db
        movk    x15, #1, lsl #32
        movk    x16, #2383, lsl #16
        movk    x15, #57684, lsl #48
        // Second carry chain: add the constants to the a + b limbs.
        adds    x14, x8, x14
        movk    x16, #11615, lsl #32
        adcs    x15, x10, x15
        movk    x1, #58878, lsl #48
        movk    x16, #39119, lsl #48
        adcs    x16, x12, x16
        adcs    x17, x9, x17
        adcs    x18, x11, x18
        adc     x1, x13, x1
        // x2 = x1 >> 63 (arithmetic): all ones when x1 is negative, else 0.
        // The compare on x2 yields the same `lt` outcome as a compare on x1
        // would.
        asr     x2, x1, #63
        cmp     x2, #0
        csel    x11, x11, x18, lt
        csel    x13, x13, x1, lt
        csel    x9, x9, x17, lt
        stp     x11, x13, [x0, #32]
        csel    x11, x12, x16, lt
        csel    x8, x8, x14, lt
        stp     x11, x9, [x0, #16]
        csel    x9, x10, x15, lt
        stp     x8, x9, [x0]
        ret

Analysis

With i256, the cmp is unnecessary in this sequence:

        // Tail of bn254_snarks_fp_add (four limbs): the last constant pieces
        // are filled in, the constants are added to the a + b limbs
        // (x8, x10, x12, x9), and each limb is selected on the sign of x15.
        movk    x11, #50143, lsl #48
        movk    x13, #38254, lsl #32
        movk    x14, #32382, lsl #16
        movk    x13, #26750, lsl #48
        adds    x11, x8, x11
        movk    x14, #47689, lsl #32
        movk    x14, #18351, lsl #48
        adcs    x13, x10, x13
        adcs    x14, x12, x14
        adc     x15, x9, x15
        // NOTE(review): adc leaves the flags untouched, so dropping the cmp
        // would also need adcs above and a condition that reads N alone (mi)
        // instead of lt -- confirm against #103717.
        cmp     x15, #0 // <----- unnecessary
        csel    x12, x12, x14, lt
        csel    x9, x9, x15, lt
        csel    x8, x8, x11, lt
        stp     x12, x9, [x0, #16]
        csel    x9, x10, x13, lt
        stp     x8, x9, [x0]
        ret

as demonstrated by #103717

With i320, as on x86 (#103841), there is an additional asr instruction on top of the cmp:

        // Tail of bls24_317_fp_add (five limbs): the last constant pieces are
        // filled in, the constants are added to the a + b limbs
        // (x8, x10, x12, x9, x11), and each limb is selected on the sign of
        // x17.
        movk    x13, #29358, lsl #48
        movk    x14, #50715, lsl #32
        movk    x15, #31544, lsl #16
        movk    x14, #10508, lsl #48
        adds    x13, x8, x13
        movk    x15, #40473, lsl #32
        adcs    x14, x10, x14
        movk    x15, #59749, lsl #48
        adcs    x15, x12, x15
        adcs    x16, x9, x16
        adc     x17, x11, x17
        // x18 is x17's sign bit replicated across the register, so comparing
        // x18 with zero gives the same `lt` result as comparing x17 would.
        // NOTE(review): adc leaves the flags untouched, so dropping the cmp
        // as well would need adcs above and a condition that reads N alone
        // (mi) instead of lt -- confirm.
        asr     x18, x17, #63  // <----- unnecessary
        cmp     x18, #0  // <----- unnecessary
        csel    x11, x11, x17, lt
        csel    x9, x9, x16, lt
        csel    x12, x12, x15, lt
        csel    x8, x8, x13, lt
        stp     x9, x11, [x0, #24]
        csel    x9, x10, x14, lt
        str     x8, [x0]
        stp     x9, x12, [x0, #8]
        ret
@llvmbot
Copy link

llvmbot commented Aug 14, 2024

@llvm/issue-subscribers-backend-aarch64

Author: Mamy Ratsimbazafy (mratsim)

Same IR as https://github.com/llvm/llvm-project/issues/103841 but compiled for AArch64, as an alternative to https://github.com/llvm/llvm-project/issues/103717

Unlike on x86, there is always an extra instruction, even for i256, and there are two unnecessary instructions for i320 or i384.

https://alive2.llvm.org/ce/z/-bGiUs

Full code

Original IR

; ModuleID = 'x86_poc'
; The module is built for arm64 here; the x86_64 triple below is commented out.
target triple = "arm64"
; target triple = "x86_64"

; Moduli, one per integer width (i256, i384, i320). The public *_fp_add
; wrappers pass the address of the matching constant as the last argument of
; the _modadd_noo.* helpers. Each constant has its own section and 64-byte
; alignment.
@bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64

; Function Attrs: hot
; i256 helper: stores to %0 either (a + b) or (a + b) - M, where a, b and M
; are the i256 values loaded from %1, %2 and %3.
; The choice is made on bit 255 (the most significant bit) of the difference:
; when it is set the plain sum is kept, otherwise the difference is kept.
define internal fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %a_plus_b = add i256 %a, %b
  ; %5 = (a + b) - M
  %5 = sub i256 %a_plus_b, %M
  ; %7 = top bit of %5, obtained by shifting it down to bit 0 and truncating
  %6 = lshr i256 %5, 255
  %7 = trunc i256 %6 to i1
  ; top bit set -> keep a + b; clear -> keep (a + b) - M
  %8 = select i1 %7, i256 %a_plus_b, i256 %5
  store i256 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; i320 helper: stores to %0 either (a + b) or (a + b) - M, where a, b and M
; are the i320 values loaded from %1, %2 and %3.
; The choice is made on bit 319 (the most significant bit) of the difference:
; when it is set the plain sum is kept, otherwise the difference is kept.
define internal fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i320, ptr %1, align 4
  %b = load i320, ptr %2, align 4
  %M = load i320, ptr %3, align 4
  %a_plus_b = add i320 %a, %b
  ; %5 = (a + b) - M
  %5 = sub i320 %a_plus_b, %M
  ; %7 = top bit of %5, obtained by shifting it down to bit 0 and truncating
  %6 = lshr i320 %5, 319
  %7 = trunc i320 %6 to i1
  ; top bit set -> keep a + b; clear -> keep (a + b) - M
  %8 = select i1 %7, i320 %a_plus_b, i320 %5
  store i320 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; i384 helper: stores to %0 either (a + b) or (a + b) - M, where a, b and M
; are the i384 values loaded from %1, %2 and %3.
; The choice is made on bit 383 (the most significant bit) of the difference:
; when it is set the plain sum is kept, otherwise the difference is kept.
define internal fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i384, ptr %1, align 4
  %b = load i384, ptr %2, align 4
  %M = load i384, ptr %3, align 4
  %a_plus_b = add i384 %a, %b
  ; %5 = (a + b) - M
  %5 = sub i384 %a_plus_b, %M
  ; %7 = top bit of %5, obtained by shifting it down to bit 0 and truncating
  %6 = lshr i384 %5, 383
  %7 = trunc i384 %6 to i1
  ; top bit set -> keep a + b; clear -> keep (a + b) - M
  %8 = select i1 %7, i384 %a_plus_b, i384 %5
  store i384 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
; Externally visible entry point: forwards its three pointers to the i256
; helper and supplies @bn254_snarks_fp_mod as the modulus pointer.
define void @bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bn254_snarks_fp" {
  call fastcc void @_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr @bn254_snarks_fp_mod)
  ret void
}

; Function Attrs: hot
; Externally visible entry point: forwards its three pointers to the i320
; helper and supplies @bls24_317_fp_mod as the modulus pointer.
define void @bls24_317_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls24_317_fp" {
  call fastcc void @_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr @bls24_317_fp_mod)
  ret void
}


; Function Attrs: hot
; Externally visible entry point: forwards its three pointers to the i384
; helper and supplies @bls12_381_fp_mod as the modulus pointer.
define void @bls12_381_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.bls12_381_fp" {
  call fastcc void @_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr @bls12_381_fp_mod)
  ret void
}

; Attribute group #2: referenced by every function definition in this module.
attributes #2 = { hot }

After opt -O3

target triple = "arm64"
; target triple = "x86_64"

; bn254_snarks_fp_add after opt -O3. The helper call no longer appears; its
; body is inline. Instead of loading the modulus and subtracting it, the code
; adds the negated modulus as an immediate, and the top-bit test has become a
; signed comparison against zero.
define void @bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bn254_snarks_fp" {
  %.val = load i256, ptr %1, align 4
  %.val1 = load i256, ptr %2, align 4
  %a_plus_b.i = add i256 %.val1, %.val
  ; %4 = (a + b) - M, written as an add of -M
  %4 = add i256 %a_plus_b.i, -21888242871839275222246405745257275088696311157297823662689037894645226208583
  ; negative difference -> keep a + b; otherwise keep the difference
  %.not1.i = icmp slt i256 %4, 0
  %5 = select i1 %.not1.i, i256 %a_plus_b.i, i256 %4
  store i256 %5, ptr %0, align 4
  ret void
}

; bls24_317_fp_add after opt -O3. The helper call no longer appears; its body
; is inline. Instead of loading the modulus and subtracting it, the code adds
; the negated modulus as an immediate, and the top-bit test has become a
; signed comparison against zero.
define void @bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls24_317_fp" {
  %.val = load i320, ptr %1, align 4
  %.val1 = load i320, ptr %2, align 4
  %a_plus_b.i = add i320 %.val1, %.val
  ; %4 = (a + b) - M, written as an add of -M
  %4 = add i320 %a_plus_b.i, -136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051
  ; negative difference -> keep a + b; otherwise keep the difference
  %.not1.i = icmp slt i320 %4, 0
  %5 = select i1 %.not1.i, i320 %a_plus_b.i, i320 %4
  store i320 %5, ptr %0, align 4
  ret void
}

; bls12_381_fp_add after opt -O3. The helper call no longer appears; its body
; is inline. Instead of loading the modulus and subtracting it, the code adds
; the negated modulus as an immediate, and the top-bit test has become a
; signed comparison against zero.
define void @bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #0 section "ctt.bls12_381_fp" {
  %.val = load i384, ptr %1, align 4
  %.val1 = load i384, ptr %2, align 4
  %a_plus_b.i = add i384 %.val1, %.val
  ; %4 = (a + b) - M, written as an add of -M
  %4 = add i384 %a_plus_b.i, -4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
  ; negative difference -> keep a + b; otherwise keep the difference
  %.not1.i = icmp slt i384 %4, 0
  %5 = select i1 %.not1.i, i384 %a_plus_b.i, i384 %4
  store i384 %5, ptr %0, align 4
  ret void
}

; Attribute group #0: referenced by the three optimized functions above.
attributes #0 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }

Assembly

// bn254_snarks_fp_add -- AArch64 code generated for the i256 function.
//   x0 = destination pointer; x1, x2 = operand pointers, each read as four
//   64-bit limbs.
// Register roles:
//   x8, x10, x12, x9   = limbs 0..3 of a + b (adds/adcs/adcs/adc chain)
//   x11, x13, x14, x15 = limbs 0..3 of (a + b) plus the constants built with
//                        mov/movk; presumably these are the limbs of the
//                        negated modulus from the optimized IR (not
//                        re-derived here).
// The sign of the top limb x15 then drives one csel per limb: the plain sum is
// kept when x15 is negative, the adjusted value otherwise.
bn254_snarks_fp_add:                    // @bn254_snarks_fp_add
        ldp     x8, x10, [x1]
        mov     x15, #24534                     // =0x5fd6
        ldp     x9, x11, [x2]
        movk    x15, #7886, lsl #16
        ldp     x13, x14, [x2, #16]
        movk    x15, #45453, lsl #32
        movk    x15, #53147, lsl #48
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #16]
        adcs    x10, x11, x10
        mov     x11, #697                       // =0x2b9
        movk    x11, #10115, lsl #16
        adcs    x12, x13, x12
        mov     x13, #13682                     // =0x3572
        movk    x11, #29673, lsl #32
        movk    x13, #38798, lsl #16
        adc     x9, x14, x9
        mov     x14, #42914                     // =0xa7a2
        movk    x11, #50143, lsl #48
        movk    x13, #38254, lsl #32
        movk    x14, #32382, lsl #16
        movk    x13, #26750, lsl #48
        // Second carry chain: add the constants to the a + b limbs.
        adds    x11, x8, x11
        movk    x14, #47689, lsl #32
        movk    x14, #18351, lsl #48
        adcs    x13, x10, x13
        adcs    x14, x12, x14
        adc     x15, x9, x15
        // The final adc does not set flags, so the sign of the top limb is
        // re-derived with an explicit compare; this is the instruction the
        // report calls unnecessary.
        cmp     x15, #0
        csel    x12, x12, x14, lt
        csel    x9, x9, x15, lt
        csel    x8, x8, x11, lt
        stp     x12, x9, [x0, #16]
        csel    x9, x10, x13, lt
        stp     x8, x9, [x0]
        ret
// bls24_317_fp_add -- AArch64 code generated for the i320 function.
//   x0 = destination pointer; x1, x2 = operand pointers, each read as five
//   64-bit limbs.
// Register roles:
//   x8, x10, x12, x9, x11   = limbs 0..4 of a + b
//   x13, x14, x15, x16, x17 = limbs 0..4 of (a + b) plus the constants built
//                             with mov/movk; presumably these are the limbs of
//                             the negated modulus from the optimized IR (not
//                             re-derived here).
// The sign of the top limb x17 is spread into x18 with asr, compared against
// zero, and then drives one csel per limb: the plain sum is kept when
// negative, the adjusted value otherwise.
// NOTE(review): x18 is used as a scratch register; it is platform-reserved on
// some targets -- confirm this is acceptable for the intended platform.
bls24_317_fp_add:                       // @bls24_317_fp_add
        ldp     x8, x10, [x1]
        mov     x16, #12230                     // =0x2fc6
        ldp     x9, x11, [x2]
        movk    x16, #18438, lsl #16
        ldp     x13, x14, [x2, #16]
        mov     x17, #30419                     // =0x76d3
        ldr     x15, [x2, #32]
        movk    x16, #14943, lsl #32
        movk    x17, #37023, lsl #16
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #16]
        adcs    x10, x11, x10
        ldr     x11, [x1, #32]
        movk    x16, #3440, lsl #48
        movk    x17, #13789, lsl #32
        adcs    x12, x13, x12
        mov     x13, #54613                     // =0xd555
        movk    x17, #61351, lsl #48
        movk    x13, #41556, lsl #16
        adcs    x9, x14, x9
        mov     x14, #16513                     // =0x4081
        movk    x13, #53673, lsl #32
        movk    x14, #52187, lsl #16
        adc     x11, x15, x11
        mov     x15, #52153                     // =0xcbb9
        movk    x13, #29358, lsl #48
        movk    x14, #50715, lsl #32
        movk    x15, #31544, lsl #16
        movk    x14, #10508, lsl #48
        // Second carry chain: add the constants to the a + b limbs.
        adds    x13, x8, x13
        movk    x15, #40473, lsl #32
        adcs    x14, x10, x14
        movk    x15, #59749, lsl #48
        adcs    x15, x12, x15
        adcs    x16, x9, x16
        adc     x17, x11, x17
        // x18 = x17 >> 63 (arithmetic): all ones when x17 is negative, else 0.
        // The compare on x18 yields the same `lt` outcome as a compare on x17
        // would; the report flags both instructions as unnecessary.
        asr     x18, x17, #63
        cmp     x18, #0
        csel    x11, x11, x17, lt
        csel    x9, x9, x16, lt
        csel    x12, x12, x15, lt
        csel    x8, x8, x13, lt
        stp     x9, x11, [x0, #24]
        csel    x9, x10, x14, lt
        str     x8, [x0]
        stp     x9, x12, [x0, #8]
        ret
// bls12_381_fp_add -- AArch64 code generated for the i384 function.
//   x0 = destination pointer; x1, x2 = operand pointers, each read as six
//   64-bit limbs. x1 and x2 are reused as scratch once their loads are done.
// Register roles:
//   x8, x10, x12, x9, x11, x13  = limbs 0..5 of a + b
//   x14, x15, x16, x17, x18, x1 = limbs 0..5 of (a + b) plus the constants
//                                 built with mov/movk; presumably these are
//                                 the limbs of the negated modulus from the
//                                 optimized IR (not re-derived here).
// The sign of the top limb x1 is spread into x2 with asr, compared against
// zero, and then drives one csel per limb: the plain sum is kept when
// negative, the adjusted value otherwise.
// NOTE(review): x18 is used as a scratch register; it is platform-reserved on
// some targets -- confirm this is acceptable for the intended platform.
bls12_381_fp_add:                       // @bls12_381_fp_add
        ldp     x8, x10, [x1]
        mov     x17, #60736                     // =0xed40
        ldp     x9, x11, [x2]
        movk    x17, #3194, lsl #16
        ldp     x13, x14, [x2, #16]
        mov     x18, #21288                     // =0x5328
        ldp     x15, x16, [x2, #32]
        movk    x17, #46203, lsl #32
        adds    x8, x9, x8
        ldp     x12, x9, [x1, #16]
        adcs    x10, x11, x10
        movk    x18, #48308, lsl #16
        movk    x17, #39816, lsl #48
        movk    x18, #22601, lsl #32
        adcs    x12, x13, x12
        ldp     x11, x13, [x1, #32]
        adcs    x9, x14, x9
        mov     x14, #21845                     // =0x5555
        // Last load through x1 was just above; x1 now holds a constant limb.
        mov     x1, #6501                       // =0x1965
        movk    x14, #17921, lsl #48
        movk    x1, #50816, lsl #16
        movk    x18, #46308, lsl #48
        adcs    x11, x15, x11
        mov     x15, #1319895040                // =0x4eac0000
        movk    x1, #60949, lsl #32
        adc     x13, x16, x13
        mov     x16, #2523                      // =0x9db
        movk    x15, #1, lsl #32
        movk    x16, #2383, lsl #16
        movk    x15, #57684, lsl #48
        // Second carry chain: add the constants to the a + b limbs.
        adds    x14, x8, x14
        movk    x16, #11615, lsl #32
        adcs    x15, x10, x15
        movk    x1, #58878, lsl #48
        movk    x16, #39119, lsl #48
        adcs    x16, x12, x16
        adcs    x17, x9, x17
        adcs    x18, x11, x18
        adc     x1, x13, x1
        // x2 = x1 >> 63 (arithmetic): all ones when x1 is negative, else 0.
        // The compare on x2 yields the same `lt` outcome as a compare on x1
        // would.
        asr     x2, x1, #63
        cmp     x2, #0
        csel    x11, x11, x18, lt
        csel    x13, x13, x1, lt
        csel    x9, x9, x17, lt
        stp     x11, x13, [x0, #32]
        csel    x11, x12, x16, lt
        csel    x8, x8, x14, lt
        stp     x11, x9, [x0, #16]
        csel    x9, x10, x15, lt
        stp     x8, x9, [x0]
        ret

Analysis

With i256, the cmp is unnecessary in this sequence:

        // Tail of bn254_snarks_fp_add (four limbs): the last constant pieces
        // are filled in, the constants are added to the a + b limbs
        // (x8, x10, x12, x9), and each limb is selected on the sign of x15.
        movk    x11, #50143, lsl #48
        movk    x13, #38254, lsl #32
        movk    x14, #32382, lsl #16
        movk    x13, #26750, lsl #48
        adds    x11, x8, x11
        movk    x14, #47689, lsl #32
        movk    x14, #18351, lsl #48
        adcs    x13, x10, x13
        adcs    x14, x12, x14
        adc     x15, x9, x15
        // NOTE(review): adc leaves the flags untouched, so dropping the cmp
        // would also need adcs above and a condition that reads N alone (mi)
        // instead of lt -- confirm against #103717.
        cmp     x15, #0 // <----- unnecessary
        csel    x12, x12, x14, lt
        csel    x9, x9, x15, lt
        csel    x8, x8, x11, lt
        stp     x12, x9, [x0, #16]
        csel    x9, x10, x13, lt
        stp     x8, x9, [x0]
        ret

as demonstrated by #103717

With i320, as on x86 (#103841), there is an additional asr instruction on top of the cmp:

        // Tail of bls24_317_fp_add (five limbs): the last constant pieces are
        // filled in, the constants are added to the a + b limbs
        // (x8, x10, x12, x9, x11), and each limb is selected on the sign of
        // x17.
        movk    x13, #29358, lsl #48
        movk    x14, #50715, lsl #32
        movk    x15, #31544, lsl #16
        movk    x14, #10508, lsl #48
        adds    x13, x8, x13
        movk    x15, #40473, lsl #32
        adcs    x14, x10, x14
        movk    x15, #59749, lsl #48
        adcs    x15, x12, x15
        adcs    x16, x9, x16
        adc     x17, x11, x17
        // x18 is x17's sign bit replicated across the register, so comparing
        // x18 with zero gives the same `lt` result as comparing x17 would.
        // NOTE(review): adc leaves the flags untouched, so dropping the cmp
        // as well would need adcs above and a condition that reads N alone
        // (mi) instead of lt -- confirm.
        asr     x18, x17, #63  // <----- unnecessary
        cmp     x18, #0  // <----- unnecessary
        csel    x11, x11, x17, lt
        csel    x9, x9, x16, lt
        csel    x12, x12, x15, lt
        csel    x8, x8, x13, lt
        stp     x9, x11, [x0, #24]
        csel    x9, x10, x14, lt
        str     x8, [x0]
        stp     x9, x12, [x0, #8]
        ret

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

3 participants