Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AArch64] Lower __builtin_bswap16 to rev16 if bswap followed by any_extend #105375

Merged
merged 1 commit into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22369,6 +22369,25 @@ static SDValue performExtendCombine(SDNode *N,
N->getOperand(0)->getOpcode() == ISD::SETCC)
return performSignExtendSetCCCombine(N, DCI, DAG);

// If we see (any_extend (bswap ...)) with bswap returning an i16, we know
// that the top half of the result register must be unused, due to the
// any_extend. This means that we can replace this pattern with (rev16
// (any_extend ...)). This saves a machine instruction compared to (lsr (rev
// ...)), which is what this pattern would otherwise be lowered to.
// Only apply this optimisation if the any_extend in the original pattern
// extends to i32 or i64, because this type becomes the input type of REV16 in
// the new pattern, so it must be a legal REV16 input type.
SDValue Bswap = N->getOperand(0);
if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
Bswap.getValueType() == MVT::i16 &&
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
SDLoc DL(N);
SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
Bswap->getOperand(0));
return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
NewAnyExtend);
}

return SDValue();
}

Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,8 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;

// Scalar view of the AArch64ISD::REV16 node. It uses the SDTIntUnaryOp type
// profile rather than the SDT_AArch64UnaryVec profile of AArch64rev16 below,
// so that selection patterns can match it on GPR32/GPR64 operands.
def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>;

// REV16/REV32/REV64 nodes declared with the SDT_AArch64UnaryVec type profile.
def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
Expand Down Expand Up @@ -2840,6 +2842,9 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
// A full-width bswap followed by a logical shift right by half the register
// width, on an input whose top half is known zero (top16Zero / top32Zero),
// is selected to a single REV16Wr / REV32Xr.
def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;

// Select the scalar AArch64ISD::REV16 node directly to the 32-bit or 64-bit
// REV16 instruction, chosen by the register class of the operand.
def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>;

// Mask-and-shift idiom that swaps the two bytes of every 16-bit halfword of a
// 64-bit value: ((x >> 8) & 0x00ff...) | ((x << 8) & 0xff00...) -> REV16Xr.
def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)),
(and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))),
(REV16Xr GPR64:$Rn)>;
Expand Down
78 changes: 73 additions & 5 deletions llvm/test/CodeGen/AArch64/bswap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,85 @@
; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; ====== Scalar Tests =====
define i16 @bswap_i16(i16 %a){
; CHECK-LABEL: bswap_i16:

; ====== Scalar bswap.i16 Tests =====
; bswap.i16 whose result is returned directly as i16. SelectionDAG is expected
; to emit a single rev16, while GlobalISel still emits rev + lsr.
; NOTE(review): presumably the i16 return value is any-extended into w0, which
; is what lets the rev16 form be used -- confirm against the combine.
define i16 @bswap_i16_to_i16_anyext(i16 %a){
; CHECK-SD-LABEL: bswap_i16_to_i16_anyext:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: rev16 w0, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bswap_i16_to_i16_anyext:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: rev w8, w0
; CHECK-GI-NEXT: lsr w0, w8, #16
; CHECK-GI-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
ret i16 %3
}
declare i16 @llvm.bswap.i16(i16)

; The zext here is optimised to an any_extend during isel.
; The shl by 48 keeps only the low 16 bits of the extended value, so the bits
; above them do not affect the result. SelectionDAG is expected to use the
; 64-bit rev16 followed by the shift; GlobalISel keeps rev + lsr + and.
define i64 @bswap_i16_to_i64_anyext(i16 %a) {
; CHECK-SD-LABEL: bswap_i16_to_i64_anyext:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: rev16 x8, x0
; CHECK-SD-NEXT: lsl x0, x8, #48
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bswap_i16_to_i64_anyext:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: rev w8, w0
; CHECK-GI-NEXT: lsr w8, w8, #16
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: lsl x0, x8, #48
; CHECK-GI-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i64
%5 = shl i64 %4, 48
ret i64 %5
}

; The zext here is optimised to an any_extend during isel.
; Negative case for the rev16 lowering: the extension target is i128, and both
; SelectionDAG and GlobalISel are expected to keep the rev + lsr sequence.
; NOTE(review): presumably this is because the combine only accepts i32/i64 as
; the extended type -- confirm against performExtendCombine.
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, w0
; CHECK-SD-NEXT: mov x0, xzr
; CHECK-SD-NEXT: rev w8, w8
; CHECK-SD-NEXT: lsr w8, w8, #16
; CHECK-SD-NEXT: lsl x1, x8, #48
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: mov x0, xzr
; CHECK-GI-NEXT: rev w8, w8
; CHECK-GI-NEXT: lsr w8, w8, #16
; CHECK-GI-NEXT: bfi x8, x8, #32, #32
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: lsl x1, x8, #48
; CHECK-GI-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i128
%5 = shl i128 %4, 112
ret i128 %5
}

define i32 @bswap_i16_to_i32_zext(i16 %a){
; CHECK-LABEL: bswap_i16_to_i32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: rev w8, w0
; CHECK-NEXT: lsr w0, w8, #16
; CHECK-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
ret i16 %3
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i32
ret i32 %4
}
declare i16 @llvm.bswap.i16(i16)

; ====== Other scalar bswap tests =====
define i32 @bswap_i32(i32 %a){
; CHECK-LABEL: bswap_i32:
; CHECK: // %bb.0:
Expand Down
Loading