-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Isel Aarch64] extra instruction (i256) or 2 instructions (i320) when chaining icmp and select based on underflow #103855
Comments
This was referenced Aug 14, 2024
@llvm/issue-subscribers-backend-aarch64 Author: Mamy Ratsimbazafy (mratsim)
Same IR as #103841, but applied to AArch64 as an alternative to #103717.
Unlike x86, there is always an extra instruction even for i256, and there are 2 unnecessary instructions for i320 or i384. https://alive2.llvm.org/ce/z/-bGiUs
Full code
Original IR
; ModuleID = 'x86_poc'
target triple = "arm64"
; target triple = "x86_64"
@<!-- -->bn254_snarks_fp_mod = constant i256 21888242871839275222246405745257275088696311157297823662689037894645226208583, section "ctt.bn254_snarks_fp.constants", align 64
@<!-- -->bls12_381_fp_mod = constant i384 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787, section "ctt.bls12_381_fp.constants", align 64
@<!-- -->bls24_317_fp_mod = constant i320 136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051, section "ctt.bls24_317_fp.constants", align 64
; Function Attrs: hot
define internal fastcc void @<!-- -->_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) #<!-- -->2 section "ctt.fields" {
%a = load i256, ptr %1, align 4
%b = load i256, ptr %2, align 4
%M = load i256, ptr %3, align 4
%a_plus_b = add i256 %a, %b
%5 = sub i256 %a_plus_b, %M
%6 = lshr i256 %5, 255
%7 = trunc i256 %6 to i1
%8 = select i1 %7, i256 %a_plus_b, i256 %5
store i256 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
define internal fastcc void @<!-- -->_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr %3) #<!-- -->2 section "ctt.fields" {
%a = load i320, ptr %1, align 4
%b = load i320, ptr %2, align 4
%M = load i320, ptr %3, align 4
%a_plus_b = add i320 %a, %b
%5 = sub i320 %a_plus_b, %M
%6 = lshr i320 %5, 319
%7 = trunc i320 %6 to i1
%8 = select i1 %7, i320 %a_plus_b, i320 %5
store i320 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
define internal fastcc void @<!-- -->_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr %3) #<!-- -->2 section "ctt.fields" {
%a = load i384, ptr %1, align 4
%b = load i384, ptr %2, align 4
%M = load i384, ptr %3, align 4
%a_plus_b = add i384 %a, %b
%5 = sub i384 %a_plus_b, %M
%6 = lshr i384 %5, 383
%7 = trunc i384 %6 to i1
%8 = select i1 %7, i384 %a_plus_b, i384 %5
store i384 %8, ptr %0, align 4
ret void
}
; Function Attrs: hot
define void @<!-- -->bn254_snarks_fp_add(ptr %0, ptr %1, ptr %2) #<!-- -->2 section "ctt.bn254_snarks_fp" {
call fastcc void @<!-- -->_modadd_noo.u64x4(ptr %0, ptr %1, ptr %2, ptr @<!-- -->bn254_snarks_fp_mod)
ret void
}
; Function Attrs: hot
define void @<!-- -->bls24_317_fp_add(ptr %0, ptr %1, ptr %2) #<!-- -->2 section "ctt.bls24_317_fp" {
call fastcc void @<!-- -->_modadd_noo.u64x5(ptr %0, ptr %1, ptr %2, ptr @<!-- -->bls24_317_fp_mod)
ret void
}
; Function Attrs: hot
define void @<!-- -->bls12_381_fp_add(ptr %0, ptr %1, ptr %2) #<!-- -->2 section "ctt.bls12_381_fp" {
call fastcc void @<!-- -->_modadd_noo.u64x6(ptr %0, ptr %1, ptr %2, ptr @<!-- -->bls12_381_fp_mod)
ret void
}
attributes #2 = { hot }
After opt -O3
target triple = "arm64"
; target triple = "x86_64"
define void @<!-- -->bn254_snarks_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #<!-- -->0 section "ctt.bn254_snarks_fp" {
%.val = load i256, ptr %1, align 4
%.val1 = load i256, ptr %2, align 4
%a_plus_b.i = add i256 %.val1, %.val
%4 = add i256 %a_plus_b.i, -21888242871839275222246405745257275088696311157297823662689037894645226208583
%.not1.i = icmp slt i256 %4, 0
%5 = select i1 %.not1.i, i256 %a_plus_b.i, i256 %4
store i256 %5, ptr %0, align 4
ret void
}
define void @<!-- -->bls24_317_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #<!-- -->0 section "ctt.bls24_317_fp" {
%.val = load i320, ptr %1, align 4
%.val1 = load i320, ptr %2, align 4
%a_plus_b.i = add i320 %.val1, %.val
%4 = add i320 %a_plus_b.i, -136393071104295911515099765908274057061945112121419593977210139303905973197232025618026156731051
%.not1.i = icmp slt i320 %4, 0
%5 = select i1 %.not1.i, i320 %a_plus_b.i, i320 %4
store i320 %5, ptr %0, align 4
ret void
}
define void @<!-- -->bls12_381_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #<!-- -->0 section "ctt.bls12_381_fp" {
%.val = load i384, ptr %1, align 4
%.val1 = load i384, ptr %2, align 4
%a_plus_b.i = add i384 %.val1, %.val
%4 = add i384 %a_plus_b.i, -4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787
%.not1.i = icmp slt i384 %4, 0
%5 = select i1 %.not1.i, i384 %a_plus_b.i, i384 %4
store i384 %5, ptr %0, align 4
ret void
}
attributes #0 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
Assembly
bn254_snarks_fp_add: // @bn254_snarks_fp_add
ldp x8, x10, [x1]
mov x15, #<!-- -->24534 // =0x5fd6
ldp x9, x11, [x2]
movk x15, #<!-- -->7886, lsl #<!-- -->16
ldp x13, x14, [x2, #<!-- -->16]
movk x15, #<!-- -->45453, lsl #<!-- -->32
movk x15, #<!-- -->53147, lsl #<!-- -->48
adds x8, x9, x8
ldp x12, x9, [x1, #<!-- -->16]
adcs x10, x11, x10
mov x11, #<!-- -->697 // =0x2b9
movk x11, #<!-- -->10115, lsl #<!-- -->16
adcs x12, x13, x12
mov x13, #<!-- -->13682 // =0x3572
movk x11, #<!-- -->29673, lsl #<!-- -->32
movk x13, #<!-- -->38798, lsl #<!-- -->16
adc x9, x14, x9
mov x14, #<!-- -->42914 // =0xa7a2
movk x11, #<!-- -->50143, lsl #<!-- -->48
movk x13, #<!-- -->38254, lsl #<!-- -->32
movk x14, #<!-- -->32382, lsl #<!-- -->16
movk x13, #<!-- -->26750, lsl #<!-- -->48
adds x11, x8, x11
movk x14, #<!-- -->47689, lsl #<!-- -->32
movk x14, #<!-- -->18351, lsl #<!-- -->48
adcs x13, x10, x13
adcs x14, x12, x14
adc x15, x9, x15
cmp x15, #<!-- -->0
csel x12, x12, x14, lt
csel x9, x9, x15, lt
csel x8, x8, x11, lt
stp x12, x9, [x0, #<!-- -->16]
csel x9, x10, x13, lt
stp x8, x9, [x0]
ret
bls24_317_fp_add: // @<!-- -->bls24_317_fp_add
ldp x8, x10, [x1]
mov x16, #<!-- -->12230 // =0x2fc6
ldp x9, x11, [x2]
movk x16, #<!-- -->18438, lsl #<!-- -->16
ldp x13, x14, [x2, #<!-- -->16]
mov x17, #<!-- -->30419 // =0x76d3
ldr x15, [x2, #<!-- -->32]
movk x16, #<!-- -->14943, lsl #<!-- -->32
movk x17, #<!-- -->37023, lsl #<!-- -->16
adds x8, x9, x8
ldp x12, x9, [x1, #<!-- -->16]
adcs x10, x11, x10
ldr x11, [x1, #<!-- -->32]
movk x16, #<!-- -->3440, lsl #<!-- -->48
movk x17, #<!-- -->13789, lsl #<!-- -->32
adcs x12, x13, x12
mov x13, #<!-- -->54613 // =0xd555
movk x17, #<!-- -->61351, lsl #<!-- -->48
movk x13, #<!-- -->41556, lsl #<!-- -->16
adcs x9, x14, x9
mov x14, #<!-- -->16513 // =0x4081
movk x13, #<!-- -->53673, lsl #<!-- -->32
movk x14, #<!-- -->52187, lsl #<!-- -->16
adc x11, x15, x11
mov x15, #<!-- -->52153 // =0xcbb9
movk x13, #<!-- -->29358, lsl #<!-- -->48
movk x14, #<!-- -->50715, lsl #<!-- -->32
movk x15, #<!-- -->31544, lsl #<!-- -->16
movk x14, #<!-- -->10508, lsl #<!-- -->48
adds x13, x8, x13
movk x15, #<!-- -->40473, lsl #<!-- -->32
adcs x14, x10, x14
movk x15, #<!-- -->59749, lsl #<!-- -->48
adcs x15, x12, x15
adcs x16, x9, x16
adc x17, x11, x17
asr x18, x17, #<!-- -->63
cmp x18, #<!-- -->0
csel x11, x11, x17, lt
csel x9, x9, x16, lt
csel x12, x12, x15, lt
csel x8, x8, x13, lt
stp x9, x11, [x0, #<!-- -->24]
csel x9, x10, x14, lt
str x8, [x0]
stp x9, x12, [x0, #<!-- -->8]
ret
bls12_381_fp_add: // @<!-- -->bls12_381_fp_add
ldp x8, x10, [x1]
mov x17, #<!-- -->60736 // =0xed40
ldp x9, x11, [x2]
movk x17, #<!-- -->3194, lsl #<!-- -->16
ldp x13, x14, [x2, #<!-- -->16]
mov x18, #<!-- -->21288 // =0x5328
ldp x15, x16, [x2, #<!-- -->32]
movk x17, #<!-- -->46203, lsl #<!-- -->32
adds x8, x9, x8
ldp x12, x9, [x1, #<!-- -->16]
adcs x10, x11, x10
movk x18, #<!-- -->48308, lsl #<!-- -->16
movk x17, #<!-- -->39816, lsl #<!-- -->48
movk x18, #<!-- -->22601, lsl #<!-- -->32
adcs x12, x13, x12
ldp x11, x13, [x1, #<!-- -->32]
adcs x9, x14, x9
mov x14, #<!-- -->21845 // =0x5555
mov x1, #<!-- -->6501 // =0x1965
movk x14, #<!-- -->17921, lsl #<!-- -->48
movk x1, #<!-- -->50816, lsl #<!-- -->16
movk x18, #<!-- -->46308, lsl #<!-- -->48
adcs x11, x15, x11
mov x15, #<!-- -->1319895040 // =0x4eac0000
movk x1, #<!-- -->60949, lsl #<!-- -->32
adc x13, x16, x13
mov x16, #<!-- -->2523 // =0x9db
movk x15, #<!-- -->1, lsl #<!-- -->32
movk x16, #<!-- -->2383, lsl #<!-- -->16
movk x15, #<!-- -->57684, lsl #<!-- -->48
adds x14, x8, x14
movk x16, #<!-- -->11615, lsl #<!-- -->32
adcs x15, x10, x15
movk x1, #<!-- -->58878, lsl #<!-- -->48
movk x16, #<!-- -->39119, lsl #<!-- -->48
adcs x16, x12, x16
adcs x17, x9, x17
adcs x18, x11, x18
adc x1, x13, x1
asr x2, x1, #<!-- -->63
cmp x2, #<!-- -->0
csel x11, x11, x18, lt
csel x13, x13, x1, lt
csel x9, x9, x17, lt
stp x11, x13, [x0, #<!-- -->32]
csel x11, x12, x16, lt
csel x8, x8, x14, lt
stp x11, x9, [x0, #<!-- -->16]
csel x9, x10, x15, lt
stp x8, x9, [x0]
ret
Analysis
With i256, the `cmp` is useless in this sequence:
movk x11, #50143, lsl #48
movk x13, #<!-- -->38254, lsl #<!-- -->32
movk x14, #<!-- -->32382, lsl #<!-- -->16
movk x13, #<!-- -->26750, lsl #<!-- -->48
adds x11, x8, x11
movk x14, #<!-- -->47689, lsl #<!-- -->32
movk x14, #<!-- -->18351, lsl #<!-- -->48
adcs x13, x10, x13
adcs x14, x12, x14
adc x15, x9, x15
cmp x15, #<!-- -->0 // <----- unnecessary
csel x12, x12, x14, lt
csel x9, x9, x15, lt
csel x8, x8, x11, lt
stp x12, x9, [x0, #<!-- -->16]
csel x9, x10, x13, lt
stp x8, x9, [x0]
ret
as demonstrated by #103717.
With i320, similar to x86 #103841, there is an additional `asr` instruction:
movk x13, #29358, lsl #48
movk x14, #<!-- -->50715, lsl #<!-- -->32
movk x15, #<!-- -->31544, lsl #<!-- -->16
movk x14, #<!-- -->10508, lsl #<!-- -->48
adds x13, x8, x13
movk x15, #<!-- -->40473, lsl #<!-- -->32
adcs x14, x10, x14
movk x15, #<!-- -->59749, lsl #<!-- -->48
adcs x15, x12, x15
adcs x16, x9, x16
adc x17, x11, x17
asr x18, x17, #<!-- -->63 // <----- unnecessary
cmp x18, #<!-- -->0 // <----- unnecessary
csel x11, x11, x17, lt
csel x9, x9, x16, lt
csel x12, x12, x15, lt
csel x8, x8, x13, lt
stp x9, x11, [x0, #<!-- -->24]
csel x9, x10, x14, lt
str x8, [x0]
stp x9, x12, [x0, #<!-- -->8]
ret
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Same IR as #103841, but applied to AArch64 as an alternative to #103717.
Unlike x86, there is always an extra instruction even for i256, and there are 2 unnecessary instructions for i320 or i384.
https://alive2.llvm.org/ce/z/-bGiUs
Full code
Original IR
After opt -O3
Assembly
Analysis
With i256, the `cmp` is useless in this sequence, as demonstrated by #103717.
With i320, similar to x86 #103841, there is an additional `asr` instruction.
The text was updated successfully, but these errors were encountered: