Skip to content

Commit

Permalink
[AArch64] Lower __builtin_bswap16 to rev16 if bswap followed by any_e…
Browse files Browse the repository at this point in the history
…xtend (llvm#105375)

GCC compiles the built-in function `__builtin_bswap16` to the ARM
instruction rev16, which reverses the byte order of 16-bit data. On the
other hand, Clang compiles the same built-in function to e.g.
```     
        rev     w8, w0
        lsr     w0, w8, #16
```
i.e. it performs a byte reversal of a 32-bit register, (which moves the
lower half, which contains the 16-bit data, to the upper half) and then
right shifts the reversed 16-bit data back to the lower half of the
register.
We can improve Clang codegen by generating `rev16` instead of `rev` and
`lsr`, like GCC.
  • Loading branch information
adprasad-nvidia authored and VitaNuo committed Sep 12, 2024
1 parent 7ea810a commit 902d98d
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 5 deletions.
19 changes: 19 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22379,6 +22379,25 @@ static SDValue performExtendCombine(SDNode *N,
N->getOperand(0)->getOpcode() == ISD::SETCC)
return performSignExtendSetCCCombine(N, DCI, DAG);

// If we see (any_extend (bswap ...)) with bswap returning an i16, we know
// that the top half of the result register must be unused, due to the
// any_extend. This means that we can replace this pattern with (rev16
// (any_extend ...)). This saves a machine instruction compared to (lsr (rev
// ...)), which is what this pattern would otherwise be lowered to.
// Only apply this optimisation if the any_extend in the original pattern
// extends to i32 or i64, because this type becomes the input type of REV16 in
// the new pattern, so it must be a legitimate REV16 input type.
SDValue Bswap = N->getOperand(0);
if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
Bswap.getValueType() == MVT::i16 &&
(N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
SDLoc DL(N);
SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
Bswap->getOperand(0));
return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
NewAnyExtend);
}

return SDValue();
}

Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,8 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;

// Scalar view of the AArch64ISD::REV16 node. It uses the same opcode string as
// the vector AArch64rev16 node but is typed with SDTIntUnaryOp instead of
// SDT_AArch64UnaryVec.
def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>;

def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
Expand Down Expand Up @@ -2840,6 +2842,9 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;

// Select the scalar REV16 node: a GPR32 operand maps to REV16Wr and a GPR64
// operand maps to REV16Xr.
def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>;

def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)),
(and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))),
(REV16Xr GPR64:$Rn)>;
Expand Down
78 changes: 73 additions & 5 deletions llvm/test/CodeGen/AArch64/bswap.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,85 @@
; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; ====== Scalar Tests =====
define i16 @bswap_i16(i16 %a){
; CHECK-LABEL: bswap_i16:

; ====== Scalar bswap.i16 Tests =====
; The i16 bswap result is returned directly. The expected SelectionDAG output
; is a single rev16, while the expected GlobalISel output is still rev followed
; by lsr #16.
define i16 @bswap_i16_to_i16_anyext(i16 %a){
; CHECK-SD-LABEL: bswap_i16_to_i16_anyext:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: rev16 w0, w0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bswap_i16_to_i16_anyext:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: rev w8, w0
; CHECK-GI-NEXT: lsr w0, w8, #16
; CHECK-GI-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
ret i16 %3
}
declare i16 @llvm.bswap.i16(i16)

; The zext here is optimised to an any_extend during isel: the shl by 48 that
; follows shifts out every bit above the low 16, so the bits the zext would
; have zeroed never reach the result. The expected SelectionDAG output uses the
; 64-bit rev16; the expected GlobalISel output keeps rev + lsr + and.
define i64 @bswap_i16_to_i64_anyext(i16 %a) {
; CHECK-SD-LABEL: bswap_i16_to_i64_anyext:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: rev16 x8, x0
; CHECK-SD-NEXT: lsl x0, x8, #48
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bswap_i16_to_i64_anyext:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: rev w8, w0
; CHECK-GI-NEXT: lsr w8, w8, #16
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: lsl x0, x8, #48
; CHECK-GI-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i64
%5 = shl i64 %4, 48
ret i64 %5
}

; The zext here is optimised to an any_extend during isel.
; The extended type is i128 rather than i32 or i64, and the expected
; SelectionDAG output keeps the rev + lsr #16 sequence instead of using rev16.
define i128 @bswap_i16_to_i128_anyext(i16 %a) {
; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: mov w8, w0
; CHECK-SD-NEXT: mov x0, xzr
; CHECK-SD-NEXT: rev w8, w8
; CHECK-SD-NEXT: lsr w8, w8, #16
; CHECK-SD-NEXT: lsl x1, x8, #48
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: mov x0, xzr
; CHECK-GI-NEXT: rev w8, w8
; CHECK-GI-NEXT: lsr w8, w8, #16
; CHECK-GI-NEXT: bfi x8, x8, #32, #32
; CHECK-GI-NEXT: and x8, x8, #0xffff
; CHECK-GI-NEXT: lsl x1, x8, #48
; CHECK-GI-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i128
%5 = shl i128 %4, 112
ret i128 %5
}

define i32 @bswap_i16_to_i32_zext(i16 %a){
; CHECK-LABEL: bswap_i16_to_i32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: rev w8, w0
; CHECK-NEXT: lsr w0, w8, #16
; CHECK-NEXT: ret
%3 = call i16 @llvm.bswap.i16(i16 %a)
ret i16 %3
%3 = call i16 @llvm.bswap.i16(i16 %a)
%4 = zext i16 %3 to i32
ret i32 %4
}
declare i16 @llvm.bswap.i16(i16)

; ====== Other scalar bswap tests =====
define i32 @bswap_i32(i32 %a){
; CHECK-LABEL: bswap_i32:
; CHECK: // %bb.0:
Expand Down

0 comments on commit 902d98d

Please sign in to comment.