Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion #103946

Open
mratsim opened this issue Aug 14, 2024 · 0 comments

Comments

@mratsim
Copy link

mratsim commented Aug 14, 2024

I'm not sure who to blame between opt or the code generator:

  • either opt generates IR that uses a pattern that the codegenerator doesn't recognize and fuse correctly
  • or the codegenerator should recognize the pattern

In any case the following code has a good version with 12 instructions in the critical path (add/adc, sub/sbb and cmov) and the bad version .v2 has 20 instructions in the critical path.

Both are from the same IR except that one has a constant inlined by opt.

This is similar to #102868 but even worse.

https://alive2.llvm.org/ce/z/KzsXa9

Original IR

define void @_modadd_mayo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
  %a_plus_b = add i320 %ax, %bx
  %mx = zext i256 %M to i320
  %apb_minus_M = sub i320 %a_plus_b, %mx
  %5 = lshr i320 %apb_minus_M, 319
  %6 = trunc i320 %5 to i1
  %7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
  %8 = trunc i320 %7 to i256
  store i256 %8, ptr %0, align 4
  ret void
}

@secp256k1_fp_mod = constant i256 -4294968273, section "ctt.secp256k1_fp.constants", align 64

; Function Attrs: hot
define internal fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
  %a_plus_b = add i320 %ax, %bx
  %mx = zext i256 %M to i320
  %apb_minus_M = sub i320 %a_plus_b, %mx
  %5 = lshr i320 %apb_minus_M, 319
  %6 = trunc i320 %5 to i1
  %7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
  %8 = trunc i320 %7 to i256
  store i256 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
define void @secp256k1_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.secp256k1_fp" {
  call fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr @secp256k1_fp_mod)
  ret void
}

attributes #2 = { hot }

opt -O3

define void @_modadd_mayo.u64x4(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2, ptr nocapture readonly %3) local_unnamed_addr #0 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
  %a_plus_b = add nuw nsw i320 %bx, %ax
  %mx = zext i256 %M to i320
  %apb_minus_M = sub nsw i320 %a_plus_b, %mx
  %.not1 = icmp slt i320 %apb_minus_M, 0
  %5 = select i1 %.not1, i320 %a_plus_b, i320 %apb_minus_M
  %6 = trunc i320 %5 to i256
  store i256 %6, ptr %0, align 4
  ret void
}

define void @secp256k1_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #1 section "ctt.secp256k1_fp" {
  %.val = load i256, ptr %1, align 4
  %.val1 = load i256, ptr %2, align 4
  %ax.i = zext i256 %.val to i320
  %bx.i = zext i256 %.val1 to i320
  %a_plus_b.i = add nuw nsw i320 %bx.i, %ax.i
  %apb_minus_M.i = add nuw nsw i320 %a_plus_b.i, 4294968273
  %.not.i = icmp ugt i320 %a_plus_b.i, 115792089237316195423570985008687907853269984665640564039457584007908834671662
  %4 = select i1 %.not.i, i320 %apb_minus_M.i, i320 %a_plus_b.i
  %5 = trunc i320 %4 to i256
  store i256 %5, ptr %0, align 4
  ret void
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }

Codegen

_modadd_mayo.u64x4:                     # @_modadd_mayo.u64x4
        push    r14
        push    rbx
        mov     r9, qword ptr [rdx + 24]
        mov     r8, qword ptr [rdx + 16]
        mov     rax, qword ptr [rdx]
        mov     rdx, qword ptr [rdx + 8]
        add     rax, qword ptr [rsi]
        adc     rdx, qword ptr [rsi + 8]
        adc     r8, qword ptr [rsi + 16]
        adc     r9, qword ptr [rsi + 24]
        setb    sil
        movzx   esi, sil
        mov     r10, rax
        sub     r10, qword ptr [rcx]
        mov     r11, rdx
        sbb     r11, qword ptr [rcx + 8]
        mov     rbx, r8
        sbb     rbx, qword ptr [rcx + 16]
        mov     r14, r9
        sbb     r14, qword ptr [rcx + 24]
        sbb     rsi, 0
        sar     rsi, 63
        cmovs   r11, rdx
        cmovs   r14, r9
        cmovs   rbx, r8
        cmovs   r10, rax
        mov     qword ptr [rdi + 16], rbx
        mov     qword ptr [rdi + 24], r14
        mov     qword ptr [rdi], r10
        mov     qword ptr [rdi + 8], r11
        pop     rbx
        pop     r14
        ret
secp256k1_fp_add:                       # @secp256k1_fp_add
        push    r15
        push    r14
        push    r12
        push    rbx
        mov     r8, qword ptr [rdx + 24]
        mov     rcx, qword ptr [rdx + 16]
        mov     rax, qword ptr [rdx]
        mov     rdx, qword ptr [rdx + 8]
        add     rax, qword ptr [rsi]
        adc     rdx, qword ptr [rsi + 8]
        adc     rcx, qword ptr [rsi + 16]
        adc     r8, qword ptr [rsi + 24]
        setb    sil
        movzx   r11d, sil
        movabs  rsi, 4294968273
        add     rsi, rax
        mov     r9, rdx
        adc     r9, 0
        mov     r10, rcx
        adc     r10, 0
        mov     rbx, r8
        adc     rbx, 0
        xor     r14d, r14d
        movabs  r15, -4294968274
        cmp     r15, rax
        mov     r15, -1
        mov     r12, -1
        sbb     r12, rdx
        mov     r12, -1
        sbb     r12, rcx
        sbb     r15, r8
        mov     r15d, 0
        sbb     r15, r11
        mov     r11d, 0
        sbb     r11, r11
        mov     r11d, 0
        sbb     r11, r11
        sbb     r14, r14
        cmovae  r9, rdx
        cmovae  rbx, r8
        cmovae  r10, rcx
        cmovae  rsi, rax
        mov     qword ptr [rdi + 16], r10
        mov     qword ptr [rdi + 24], rbx
        mov     qword ptr [rdi], rsi
        mov     qword ptr [rdi + 8], r9
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        ret

Analysis

In the "good" version, the extra SAR when using i320 is unnecessary and has been already reported separately in #103841

In the bad version, the following is equivalent to substractiong by M

        movabs  rsi, 4294968273
        add     rsi, rax
        mov     r9, rdx
        adc     r9, 0
        mov     r10, rcx
        adc     r10, 0
        mov     rbx, r8
        adc     rbx, 0

The optimizer misses that there can't be carries here, but that's not the main issue.

Then there is this problematic extra 12 instructions as a followup

        movabs  r15, -4294968274
        cmp     r15, rax
        mov     r15, -1
        mov     r12, -1
        sbb     r12, rdx
        mov     r12, -1
        sbb     r12, rcx
        sbb     r15, r8
        mov     r15d, 0
        sbb     r15, r11
        mov     r11d, 0
        sbb     r11, r11
        mov     r11d, 0
        sbb     r11, r11
        sbb     r14, r14
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants