[AArch64] Missed optimization: subtraction with borrow #102062
@llvm/issue-subscribers-backend-aarch64 Author: Mamy Ratsimbazafy (mratsim)
https://alive2.llvm.org/ce/z/UVMJQX
The following test supposedly ensures proper generation of subtraction with borrow on x86; the codegen for AArch64 is unfortunately not up to par.
llvm-project/llvm/test/CodeGen/X86/subcarry.ll, lines 586 to 652 at 1b9d7dd:
; target triple = "x86_64-unknown-linux-gnu"
target triple = "aarch64-unknown-linux-gnu"
define i256 @sub256(i256 %a, i256 %b) nounwind {
; CHECK-LABEL: sub256:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi
; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx
; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %r8
; CHECK-NEXT: movq %rcx, 16(%rdi)
; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: movq %rsi, (%rdi)
; CHECK-NEXT: movq %r8, 24(%rdi)
; CHECK-NEXT: retq
entry:
%0 = sub i256 %a, %b
ret i256 %0
}
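For comparison, the borrow chain one would hope for on AArch64 mirrors the subq/sbbq sequence above: SUBS on the lowest limb to set the carry flag, then SBCS on each higher limb to propagate the borrow. The sketch below is hand-written and purely illustrative; the register assignment is assumed for readability and is not claimed to match the backend's actual lowering of i256 arguments.

sub256_ideal:
        // Assumed layout (illustrative only): limbs of %a in x2-x5 (low to high),
        // limbs of %b in x6-x9, pointer to the result buffer in x0.
        subs    x2, x2, x6          // low limb; on AArch64, C = NOT borrow
        sbcs    x3, x3, x7          // subtract next limb plus the incoming borrow
        sbcs    x4, x4, x8
        sbcs    x5, x5, x9
        stp     x2, x3, [x0]        // store limbs 0 and 1
        stp     x4, x5, [x0, #16]   // store limbs 2 and 3
        ret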
%uint128 = type { i64, i64 }
%uint256 = type { %uint128, %uint128 }
; The 256-bit subtraction implementation using two inlined usubo procedures for U128 type { i64, i64 }.
; This is similar to how LLVM legalizes types in CodeGen.
define void @sub_U256_without_i128_or_recursive(%uint256* sret(%uint256) %0, %uint256* %1, %uint256* %2) nounwind {
; CHECK-LABEL: sub_U256_without_i128_or_recursive:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: movq (%rsi), %rcx
; CHECK-NEXT: movq 8(%rsi), %rdi
; CHECK-NEXT: movq 16(%rsi), %r8
; CHECK-NEXT: movq 24(%rsi), %rsi
; CHECK-NEXT: xorl %r9d, %r9d
; CHECK-NEXT: subq 16(%rdx), %r8
; CHECK-NEXT: setb %r9b
; CHECK-NEXT: subq 24(%rdx), %rsi
; CHECK-NEXT: subq (%rdx), %rcx
; CHECK-NEXT: sbbq 8(%rdx), %rdi
; CHECK-NEXT: sbbq $0, %r8
; CHECK-NEXT: sbbq %r9, %rsi
; CHECK-NEXT: movq %rcx, (%rax)
; CHECK-NEXT: movq %rdi, 8(%rax)
; CHECK-NEXT: movq %r8, 16(%rax)
; CHECK-NEXT: movq %rsi, 24(%rax)
; CHECK-NEXT: retq
%4 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 0
%5 = load i64, i64* %4, align 8
%6 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 0, i32 1
%7 = load i64, i64* %6, align 8
%8 = getelementptr inbounds %uint256, %uint256* %2, i64 0, i32 0, i32 0
%9 = load i64, i64* %8, align 8
%10 = getelementptr inbounds %uint256, %uint256* %2, i64 0, i32 0, i32 1
%11 = load i64, i64* %10, align 8
%12 = sub i64 %5, %9
%13 = icmp ult i64 %5, %9
%14 = sub i64 %7, %11
%15 = icmp ult i64 %7, %11
%16 = zext i1 %13 to i64
%17 = sub i64 %14, %16
%18 = icmp ult i64 %14, %16
%19 = or i1 %15, %18
%20 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 1, i32 0
%21 = load i64, i64* %20, align 8
%22 = getelementptr inbounds %uint256, %uint256* %1, i64 0, i32 1, i32 1
%23 = load i64, i64* %22, align 8
%24 = getelementptr inbounds %uint256, %uint256* %2, i64 0, i32 1, i32 0
%25 = load i64, i64* %24, align 8
%26 = getelementptr inbounds %uint256, %uint256* %2, i64 0, i32 1, i32 1
%27 = load i64, i64* %26, align 8
%28 = sub i64 %21, %25
%29 = icmp ult i64 %21, %25
%30 = sub i64 %23, %27
%31 = zext i1 %29 to i64
%32 = sub i64 %30, %31
%33 = zext i1 %19 to i64
%34 = sub i64 %28, %33
%35 = icmp ult i64 %28, %33
%36 = zext i1 %35 to i64
%37 = sub i64 %32, %36
%38 = getelementptr inbounds %uint256, %uint256* %0, i64 0, i32 0, i32 0
store i64 %12, i64* %38, align 8
%39 = getelementptr inbounds %uint256, %uint256* %0, i64 0, i32 0, i32 1
store i64 %17, i64* %39, align 8
%40 = getelementptr inbounds %uint256, %uint256* %0, i64 0, i32 1, i32 0
store i64 %34, i64* %40, align 8
%41 = getelementptr inbounds %uint256, %uint256* %0, i64 0, i32 1, i32 1
store i64 %37, i64* %41, align 8
ret void
}
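The pointer-based variant should ideally collapse to the same four-instruction subs/sbcs chain, bracketed by paired loads and stores; the icmp ult / zext / sub pattern above is the shape the backend is expected to recognize as a carried subtraction. Again a hand-written sketch under assumed register assignments (sources in x0 and x1, result pointer in x8), not a claim about the exact ABI lowering:

sub_U256_ideal:
        ldp     x2, x3, [x0]           // a.limb0, a.limb1
        ldp     x4, x5, [x0, #16]      // a.limb2, a.limb3
        ldp     x6, x7, [x1]           // b.limb0, b.limb1
        ldp     x9, x10, [x1, #16]     // b.limb2, b.limb3
        subs    x2, x2, x6             // limb0, sets the borrow
        sbcs    x3, x3, x7             // limb1 minus borrow
        sbcs    x4, x4, x9             // limb2 minus borrow
        sbcs    x5, x5, x10            // limb3 minus borrow
        stp     x2, x3, [x8]           // result limbs 0 and 1
        stp     x4, x5, [x8, #16]      // result limbs 2 and 3
        ret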
mratsim added a commit to mratsim/constantine that referenced this issue on Aug 13, 2024.
mratsim added a commit to mratsim/constantine that referenced this issue on Aug 14, 2024:
* feat(LLVM): add codegenerator for saturated field add/sub
* LLVM: WIP refactor - boilerplate, linkage, assembly sections, ...
* feat(llvm): try (and fail) to workaround bad modular addition codegen with inline function.
* llvm: partial workaround failure around https://github.com/llvm/llvm-project/issues/102868#issuecomment-2284935755 module inlining breaks machine instruction fusion
* llvm: define our own addcarry/subborrow which properly optimize on x86 (but not ARM see llvm/llvm-project#102062)
* llvm: use builtin llvm.uadd.with.overflow.iXXX to try to generate optimal code (and fail for i320 and i384 llvm/llvm-project#103717)