[opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion #103946

mratsim · 2024-08-14T13:27:40Z

I'm not sure who to blame between opt or the code generator:

either opt generates IR that uses a pattern that the codegenerator doesn't recognize and fuse correctly
or the codegenerator should recognize the pattern

In any case the following code has a good version with 12 instructions in the critical path (add/adc, sub/sbb and cmov) and the bad version .v2 has 20 instructions in the critical path.

Both are from the same IR except that one has a constant inlined by opt.

This is similar to #102868 but even worse.

https://alive2.llvm.org/ce/z/KzsXa9

Original IR

define void @_modadd_mayo.u64x4(ptr %0, ptr %1, ptr %2, ptr %3) section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
  %a_plus_b = add i320 %ax, %bx
  %mx = zext i256 %M to i320
  %apb_minus_M = sub i320 %a_plus_b, %mx
  %5 = lshr i320 %apb_minus_M, 319
  %6 = trunc i320 %5 to i1
  %7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
  %8 = trunc i320 %7 to i256
  store i256 %8, ptr %0, align 4
  ret void
}

@secp256k1_fp_mod = constant i256 -4294968273, section "ctt.secp256k1_fp.constants", align 64

; Function Attrs: hot
define internal fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr %3) #2 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
  %a_plus_b = add i320 %ax, %bx
  %mx = zext i256 %M to i320
  %apb_minus_M = sub i320 %a_plus_b, %mx
  %5 = lshr i320 %apb_minus_M, 319
  %6 = trunc i320 %5 to i1
  %7 = select i1 %6, i320 %a_plus_b, i320 %apb_minus_M
  %8 = trunc i320 %7 to i256
  store i256 %8, ptr %0, align 4
  ret void
}

; Function Attrs: hot
define void @secp256k1_fp_add(ptr %0, ptr %1, ptr %2) #2 section "ctt.secp256k1_fp" {
  call fastcc void @_modadd_mayo.u64x4.v2(ptr %0, ptr %1, ptr %2, ptr @secp256k1_fp_mod)
  ret void
}

attributes #2 = { hot }

opt -O3

define void @_modadd_mayo.u64x4(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2, ptr nocapture readonly %3) local_unnamed_addr #0 section "ctt.fields" {
  %a = load i256, ptr %1, align 4
  %b = load i256, ptr %2, align 4
  %M = load i256, ptr %3, align 4
  %ax = zext i256 %a to i320
  %bx = zext i256 %b to i320
  %a_plus_b = add nuw nsw i320 %bx, %ax
  %mx = zext i256 %M to i320
  %apb_minus_M = sub nsw i320 %a_plus_b, %mx
  %.not1 = icmp slt i320 %apb_minus_M, 0
  %5 = select i1 %.not1, i320 %a_plus_b, i320 %apb_minus_M
  %6 = trunc i320 %5 to i256
  store i256 %6, ptr %0, align 4
  ret void
}

define void @secp256k1_fp_add(ptr nocapture writeonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) local_unnamed_addr #1 section "ctt.secp256k1_fp" {
  %.val = load i256, ptr %1, align 4
  %.val1 = load i256, ptr %2, align 4
  %ax.i = zext i256 %.val to i320
  %bx.i = zext i256 %.val1 to i320
  %a_plus_b.i = add nuw nsw i320 %bx.i, %ax.i
  %apb_minus_M.i = add nuw nsw i320 %a_plus_b.i, 4294968273
  %.not.i = icmp ugt i320 %a_plus_b.i, 115792089237316195423570985008687907853269984665640564039457584007908834671662
  %4 = select i1 %.not.i, i320 %apb_minus_M.i, i320 %a_plus_b.i
  %5 = trunc i320 %4 to i256
  store i256 %5, ptr %0, align 4
  ret void
}

attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { hot mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) }

Codegen

_modadd_mayo.u64x4:                     # @_modadd_mayo.u64x4
        push    r14
        push    rbx
        mov     r9, qword ptr [rdx + 24]
        mov     r8, qword ptr [rdx + 16]
        mov     rax, qword ptr [rdx]
        mov     rdx, qword ptr [rdx + 8]
        add     rax, qword ptr [rsi]
        adc     rdx, qword ptr [rsi + 8]
        adc     r8, qword ptr [rsi + 16]
        adc     r9, qword ptr [rsi + 24]
        setb    sil
        movzx   esi, sil
        mov     r10, rax
        sub     r10, qword ptr [rcx]
        mov     r11, rdx
        sbb     r11, qword ptr [rcx + 8]
        mov     rbx, r8
        sbb     rbx, qword ptr [rcx + 16]
        mov     r14, r9
        sbb     r14, qword ptr [rcx + 24]
        sbb     rsi, 0
        sar     rsi, 63
        cmovs   r11, rdx
        cmovs   r14, r9
        cmovs   rbx, r8
        cmovs   r10, rax
        mov     qword ptr [rdi + 16], rbx
        mov     qword ptr [rdi + 24], r14
        mov     qword ptr [rdi], r10
        mov     qword ptr [rdi + 8], r11
        pop     rbx
        pop     r14
        ret
secp256k1_fp_add:                       # @secp256k1_fp_add
        push    r15
        push    r14
        push    r12
        push    rbx
        mov     r8, qword ptr [rdx + 24]
        mov     rcx, qword ptr [rdx + 16]
        mov     rax, qword ptr [rdx]
        mov     rdx, qword ptr [rdx + 8]
        add     rax, qword ptr [rsi]
        adc     rdx, qword ptr [rsi + 8]
        adc     rcx, qword ptr [rsi + 16]
        adc     r8, qword ptr [rsi + 24]
        setb    sil
        movzx   r11d, sil
        movabs  rsi, 4294968273
        add     rsi, rax
        mov     r9, rdx
        adc     r9, 0
        mov     r10, rcx
        adc     r10, 0
        mov     rbx, r8
        adc     rbx, 0
        xor     r14d, r14d
        movabs  r15, -4294968274
        cmp     r15, rax
        mov     r15, -1
        mov     r12, -1
        sbb     r12, rdx
        mov     r12, -1
        sbb     r12, rcx
        sbb     r15, r8
        mov     r15d, 0
        sbb     r15, r11
        mov     r11d, 0
        sbb     r11, r11
        mov     r11d, 0
        sbb     r11, r11
        sbb     r14, r14
        cmovae  r9, rdx
        cmovae  rbx, r8
        cmovae  r10, rcx
        cmovae  rsi, rax
        mov     qword ptr [rdi + 16], r10
        mov     qword ptr [rdi + 24], rbx
        mov     qword ptr [rdi], rsi
        mov     qword ptr [rdi + 8], r9
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        ret

Analysis

In the "good" version, the extra SAR when using i320 is unnecessary and has been already reported separately in #103841

In the bad version, the following is equivalent to substractiong by M

        movabs  rsi, 4294968273
        add     rsi, rax
        mov     r9, rdx
        adc     r9, 0
        mov     r10, rcx
        adc     r10, 0
        mov     rbx, r8
        adc     rbx, 0

The optimizer misses that there can't be carries here, but that's not the main issue.

Then there is this problematic extra 12 instructions as a followup

        movabs  r15, -4294968274
        cmp     r15, rax
        mov     r15, -1
        mov     r12, -1
        sbb     r12, rdx
        mov     r12, -1
        sbb     r12, rcx
        sbb     r15, r8
        mov     r15d, 0
        sbb     r15, r11
        mov     r11d, 0
        sbb     r11, r11
        mov     r11d, 0
        sbb     r11, r11
        sbb     r14, r14

The text was updated successfully, but these errors were encountered:

github-actions bot added the new issue label Aug 14, 2024

This was referenced Aug 14, 2024

Tracking compiler inefficiencies mratsim/constantine#357

Open

llvm: more tentatives at optimal field addition with pure LLVM IR mratsim/constantine#457

Draft

EugeneZelenko added llvm:optimizations missed-optimization and removed new issue labels Aug 14, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion #103946

[opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion #103946

mratsim commented Aug 14, 2024

[opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion #103946

[opt / isel] 66% more unnecessary instructions generated due to bad pattern or missed sbb/icmp/cmov fusion #103946

Comments

mratsim commented Aug 14, 2024

Original IR

opt -O3

Codegen

Analysis