[AArch64] Missed optimization: subtraction with borrow #102062

Open
mratsim opened this issue Aug 5, 2024 · 1 comment

mratsim commented Aug 5, 2024

https://alive2.llvm.org/ce/z/UVMJQX

The following IR is supposed to ensure proper generation of subtraction with borrow. It does so on x86, but the AArch64 codegen is unfortunately not up to par.

%uint128 = type { i64, i64 }
%uint256 = type { %uint128, %uint128 }
; The 256-bit subtraction implementation using two inlined usubo procedures for U128 type { i64, i64 }.
; This is similar to how LLVM legalizes types in CodeGen.
define void @sub_U256_without_i128_or_recursive(ptr sret(%uint256) %0, ptr %1, ptr %2) nounwind {
; CHECK-LABEL: sub_U256_without_i128_or_recursive:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rsi), %rcx
; CHECK-NEXT: movq 8(%rsi), %rdi
; CHECK-NEXT: movq 16(%rsi), %r8
; CHECK-NEXT: movq 24(%rsi), %rsi
; CHECK-NEXT: xorl %r9d, %r9d
; CHECK-NEXT: subq 16(%rdx), %r8
; CHECK-NEXT: setb %r9b
; CHECK-NEXT: subq 24(%rdx), %rsi
; CHECK-NEXT: subq (%rdx), %rcx
; CHECK-NEXT: sbbq 8(%rdx), %rdi
; CHECK-NEXT: sbbq $0, %r8
; CHECK-NEXT: sbbq %r9, %rsi
; CHECK-NEXT: movq %rcx, (%rax)
; CHECK-NEXT: movq %rdi, 8(%rax)
; CHECK-NEXT: movq %r8, 16(%rax)
; CHECK-NEXT: movq %rsi, 24(%rax)
; CHECK-NEXT: retq
%4 = load i64, ptr %1, align 8
%5 = getelementptr inbounds %uint256, ptr %1, i64 0, i32 0, i32 1
%6 = load i64, ptr %5, align 8
%7 = load i64, ptr %2, align 8
%8 = getelementptr inbounds %uint256, ptr %2, i64 0, i32 0, i32 1
%9 = load i64, ptr %8, align 8
%10 = sub i64 %4, %7
%11 = icmp ult i64 %4, %7
%12 = sub i64 %6, %9
%13 = icmp ult i64 %6, %9
%14 = zext i1 %11 to i64
%15 = sub i64 %12, %14
%16 = icmp ult i64 %12, %14
%17 = or i1 %13, %16
%18 = getelementptr inbounds %uint256, ptr %1, i64 0, i32 1, i32 0
%19 = load i64, ptr %18, align 8
%20 = getelementptr inbounds %uint256, ptr %1, i64 0, i32 1, i32 1
%21 = load i64, ptr %20, align 8
%22 = getelementptr inbounds %uint256, ptr %2, i64 0, i32 1, i32 0
%23 = load i64, ptr %22, align 8
%24 = getelementptr inbounds %uint256, ptr %2, i64 0, i32 1, i32 1
%25 = load i64, ptr %24, align 8
%26 = sub i64 %19, %23
%27 = icmp ult i64 %19, %23
%28 = sub i64 %21, %25
%29 = zext i1 %27 to i64
%30 = sub i64 %28, %29
%31 = zext i1 %17 to i64
%32 = sub i64 %26, %31
%33 = icmp ult i64 %26, %31
%34 = zext i1 %33 to i64
%35 = sub i64 %30, %34
store i64 %10, ptr %0, align 8
%36 = getelementptr inbounds %uint256, ptr %0, i64 0, i32 0, i32 1
store i64 %15, ptr %36, align 8
%37 = getelementptr inbounds %uint256, ptr %0, i64 0, i32 1, i32 0
store i64 %32, ptr %37, align 8
%38 = getelementptr inbounds %uint256, ptr %0, i64 0, i32 1, i32 1
store i64 %35, ptr %38, align 8
ret void
}

; target triple = "x86_64-unknown-linux-gnu"
target triple = "aarch64-unknown-linux-gnu"

define i256 @sub256(i256 %a, i256 %b) nounwind {
; CHECK-LABEL: sub256:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    subq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT:    sbbq {{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT:    sbbq {{[0-9]+}}(%rsp), %r8
; CHECK-NEXT:    movq %rcx, 16(%rdi)
; CHECK-NEXT:    movq %rdx, 8(%rdi)
; CHECK-NEXT:    movq %rsi, (%rdi)
; CHECK-NEXT:    movq %r8, 24(%rdi)
; CHECK-NEXT:    retq
entry:
  %0 = sub i256 %a, %b
  ret i256 %0
}

%uint128 = type { i64, i64 }
%uint256 = type { %uint128, %uint128 }

; The 256-bit subtraction implementation using two inlined usubo procedures for U128 type { i64, i64 }.
; This is similar to how LLVM legalizes types in CodeGen.
define void @sub_U256_without_i128_or_recursive(%uint256* sret(%uint256) %0, %uint256* %1, %uint256* %2) nounwind {
; CHECK-LABEL: sub_U256_without_i128_or_recursive:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdi, %rax
; CHECK-NEXT:    movq (%rsi), %rcx
; CHECK-NEXT:    movq 8(%rsi), %rdi
; CHECK-NEXT:    movq 16(%rsi), %r8
; CHECK-NEXT:    movq 24(%rsi), %rsi
; CHECK-NEXT:    xorl %r9d, %r9d
; CHECK-NEXT:    subq 16(%rdx), %r8
; CHECK-NEXT:    setb %r9b
; CHECK-NEXT:    subq 24(%rdx), %rsi
; CHECK-NEXT:    subq (%rdx), %rcx
; CHECK-NEXT:    sbbq 8(%rdx), %rdi
; CHECK-NEXT:    sbbq $0, %r8
; CHECK-NEXT:    sbbq %r9, %rsi
; CHECK-NEXT:    movq %rcx, (%rax)
; CHECK-NEXT:    movq %rdi, 8(%rax)
; CHECK-NEXT:    movq %r8, 16(%rax)
; CHECK-NEXT:    movq %rsi, 24(%rax)
; CHECK-NEXT:    retq
  %4 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 0
  %5 = load i64, i64* %4, align 8
  %6 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 1
  %7 = load i64, i64* %6, align 8
  %8 = getelementptr inbounds %uint256, %uint256* %2, i64 0, i32 0, i32 0
  %9 = load i64, i64* %8, align 8
  %10 = getelementptr inbounds %uint256, %uint256* %2, i64 0, i32 0, i32 1
  %11 = load i64, i64* %10, align 8
  %12 = sub i64 %5, %9
  %13 = icmp ult i64 %5, %9
  %14 = sub i64 %7, %11
  %15 = icmp ult i64 %7, %11
  %16 = zext i1 %13 to i64
  %17 = sub i64 %14, %16
  %18 = icmp ult i64 %14, %16
  %19 = or i1 %15, %18
  %20 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 1, i32 0
  %21 = load i64, i64* %20, align 8
  %22 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 1, i32 1
  %23 = load i64, i64* %22, align 8
  %24 = getelementptr inbounds %uint256, %uint256* %2, i64 0, i32 1, i32 0
  %25 = load i64, i64* %24, align 8
  %26 = getelementptr inbounds %uint256, %uint256* %2, i64 0, i32 1, i32 1
  %27 = load i64, i64* %26, align 8
  %28 = sub i64 %21, %25
  %29 = icmp ult i64 %21, %25
  %30 = sub i64 %23, %27
  %31 = zext i1 %29 to i64
  %32 = sub i64 %30, %31
  %33 = zext i1 %19 to i64
  %34 = sub i64 %28, %33
  %35 = icmp ult i64 %28, %33
  %36 = zext i1 %35 to i64
  %37 = sub i64 %32, %36
  %38 = getelementptr inbounds %uint256, %uint256* %0, i64 0, i32 0, i32 0
  store i64 %12, i64* %38, align 8
  %39 = getelementptr inbounds %uint256, %uint256* %0, i64 0, i32 0, i32 1
  store i64 %17, i64* %39, align 8
  %40 = getelementptr inbounds %uint256, %uint256* %0, i64 0, i32 1, i32 0
  store i64 %34, i64* %40, align 8
  %41 = getelementptr inbounds %uint256, %uint256* %0, i64 0, i32 1, i32 1
  store i64 %37, i64* %41, align 8
  ret void
}
sub256:                                 // @sub256
        subs    x0, x0, x4
        sbcs    x1, x1, x5
        sbcs    x2, x2, x6
        sbc     x3, x3, x7
        ret
sub_U256_without_i128_or_recursive:     // @sub_U256_without_i128_or_recursive
        ldp     x9, x12, [x1]
        ldp     x10, x11, [x0]
        ldp     x13, x15, [x1, #16]
        subs    x9, x10, x9
        cset    w10, lo
        subs    x11, x11, x12
        ldp     x12, x14, [x0, #16]
        ccmp    x11, x10, #0, hs
        cset    w16, lo
        sub     x10, x11, x10
        stp     x9, x10, [x8]
        subs    x12, x12, x13
        sub     x13, x14, x15
        cset    w11, lo
        subs    x12, x12, x16
        sub     x9, x13, x11
        cset    w11, lo
        sub     x9, x9, x11
        stp     x12, x9, [x8, #16]
        ret
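
For comparison, if the borrow were kept in the flags the whole way through, sub_U256_without_i128_or_recursive should compile to the same subs/sbcs/sbc chain as sub256 above. A hand-written sketch of that codegen follows (illustrative only: register choices are hypothetical, and with sret the result pointer arrives in x8 while the two operand pointers arrive in x0 and x1):

sub_U256_without_i128_or_recursive:     // hypothetical hand-written lowering
        ldp     x9, x10, [x0]           // a[0], a[1]
        ldp     x11, x12, [x0, #16]     // a[2], a[3]
        ldp     x13, x14, [x1]          // b[0], b[1]
        ldp     x15, x16, [x1, #16]     // b[2], b[3]
        subs    x9, x9, x13             // a[0] - b[0], carry/borrow tracked in PSTATE.C
        sbcs    x10, x10, x14           // a[1] - b[1] - borrow
        sbcs    x11, x11, x15           // a[2] - b[2] - borrow
        sbc     x12, x12, x16           // a[3] - b[3] - borrow
        stp     x9, x10, [x8]           // store result[0..1] (x8 = sret pointer)
        stp     x11, x12, [x8, #16]     // store result[2..3]
        ret

That is roughly 12 instructions with the borrow carried in the flags throughout, versus the 20-instruction sequence above, which materializes the borrow into registers with cset and re-tests it with ccmp.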

llvmbot commented Aug 5, 2024

@llvm/issue-subscribers-backend-aarch64

Author: Mamy Ratsimbazafy (mratsim)

mratsim added a commit to mratsim/constantine that referenced this issue Aug 13, 2024
mratsim added a commit to mratsim/constantine that referenced this issue Aug 14, 2024
* feat(LLVM): add codegenerator for saturated field add/sub

* LLVM: WIP refactor - boilerplate, linkage, assembly sections, ...

* feat(llvm): try (and fail) to workaround bad modular addition codegen with inline function.

* llvm: partial workaround for the failure around https://github.com/llvm/llvm-project/issues/102868#issuecomment-2284935755: module inlining breaks machine instruction fusion

* llvm: define our own addcarry/subborrow which properly optimize on x86 (but not ARM see llvm/llvm-project#102062)

* llvm: use builtin llvm.uadd.with.overflow.iXXX to try to generate optimal code (and fail for i320 and i384 llvm/llvm-project#103717)
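
As context for the "define our own addcarry/subborrow" item above, here is a minimal LLVM IR sketch of such a subborrow primitive built on the @llvm.usub.with.overflow.i64 intrinsic. The function name and shape are illustrative, not Constantine's actual code; it expresses the same borrow propagation as the sub / icmp ult / or pattern in the IR at the top of this issue:

declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)

; Subtract b and an incoming borrow from a, returning { difference, outgoing borrow }.
define { i64, i1 } @subborrow_u64(i64 %a, i64 %b, i1 %borrow_in) {
  ; a - b, capturing the first borrow
  %s1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
  %d1 = extractvalue { i64, i1 } %s1, 0
  %c1 = extractvalue { i64, i1 } %s1, 1
  ; subtract the incoming borrow, capturing the second borrow
  %bin = zext i1 %borrow_in to i64
  %s2 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %d1, i64 %bin)
  %d2 = extractvalue { i64, i1 } %s2, 0
  %c2 = extractvalue { i64, i1 } %s2, 1
  ; at most one of the two subtractions can borrow, so a plain or combines them
  %borrow_out = or i1 %c1, %c2
  %r0 = insertvalue { i64, i1 } undef, i64 %d2, 0
  %r1 = insertvalue { i64, i1 } %r0, i1 %borrow_out, 1
  ret { i64, i1 } %r1
}

Per the commit note above, this kind of primitive optimizes well on x86 but, as this issue shows, not yet on AArch64.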