Skip to content

Commit

Permalink
[AArch64] Improve ISel using across lane addition reduction.
Browse files Browse the repository at this point in the history
In vectorized add reduction code, the final "reduce" step is sub-optimal.
This change wll combine :

ext  v1.16b, v0.16b, v0.16b, rust-lang#8
add  v0.4s, v1.4s, v0.4s
dup  v1.4s, v0.s[1]
add  v0.4s, v1.4s, v0.4s

into

addv s0, v0.4s

PR21371
http://reviews.llvm.org/D12325
Patch by Jun Bum Lim <junbuml@codeaurora.org>!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@246790 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
Chad Rosier committed Sep 3, 2015
1 parent 7ab2009 commit ad69a64
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 0 deletions.
99 changes: 99 additions & 0 deletions lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
Expand Down Expand Up @@ -8584,6 +8585,102 @@ static SDValue performPostLD1Combine(SDNode *N,
return SDValue();
}

/// Target-specific DAG combine for the across vector reduction.
/// This function specifically handles the final clean-up step of a vector
/// reduction produced by the LoopVectorizer. It is the log2-shuffle pattern,
/// consisting of log2(NumVectorElements) steps and, in each step, 2^(s)
/// elements are reduced, where s is an induction variable from 0
/// to log2(NumVectorElements).
/// For example,
/// %1 = vector_shuffle %0, <2,3,u,u>
/// %2 = add %0, %1
/// %3 = vector_shuffle %2, <1,u,u,u>
/// %4 = add %2, %3
/// %5 = extract_vector_elt %4, 0
/// becomes :
/// %0 = uaddv %0
/// %1 = extract_vector_elt %0, 0
///
/// FIXME: Currently this function is implemented and tested specifically
/// for the add reduction. We could also support other types of across lane
/// reduction available in AArch64, including SMAXV, SMINV, UMAXV, UMINV,
/// SADDLV, UADDLV, FMAXNMV, FMAXV, FMINNMV, FMINV.
static SDValue
performAcrossLaneReductionCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);

// Check if the input vector is fed by the operator we want to handle.
// We specifically check only ADD for now.
if (N0->getOpcode() != ISD::ADD)
return SDValue();

// The vector extract idx must constant zero because we only expect the final
// result of the reduction is placed in lane 0.
if (!isa<ConstantSDNode>(N1) || cast<ConstantSDNode>(N1)->getZExtValue())
return SDValue();

EVT EltTy = N0.getValueType().getVectorElementType();
if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
return SDValue();

int NumVecElts = N0.getValueType().getVectorNumElements();
if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
return SDValue();

int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
SDValue PreOp = N0;
// Iterate over each step of the across vector reduction.
for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
// We specifically check ADD for now.
if (PreOp.getOpcode() != ISD::ADD)
return SDValue();
SDValue CurOp = PreOp.getOperand(0);
SDValue Shuffle = PreOp.getOperand(1);
if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
// Try to swap the 1st and 2nd operand as add is commutative.
CurOp = PreOp.getOperand(1);
Shuffle = PreOp.getOperand(0);
if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
}
// Check if it forms one step of the across vector reduction.
// E.g.,
// %cur = add %1, %0
// %shuffle = vector_shuffle %cur, <2, 3, u, u>
// %pre = add %cur, %shuffle
if (Shuffle.getOperand(0) != CurOp)
return SDValue();

int NumMaskElts = 1 << CurStep;
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
// Check mask values in each step.
// We expect the shuffle mask in each step follows a specific pattern
// denoted here by the <M, U> form, where M is a sequence of integers
// starting from NumMaskElts, increasing by 1, and the number integers
// in M should be NumMaskElts. U is a sequence of UNDEFs and the number
// of undef in U should be NumVecElts - NumMaskElts.
// E.g., for <8 x i16>, mask values in each step should be :
// step 0 : <1,u,u,u,u,u,u,u>
// step 1 : <2,3,u,u,u,u,u,u>
// step 2 : <4,5,6,7,u,u,u,u>
for (int i = 0; i < NumVecElts; ++i)
if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
(i >= NumMaskElts && !(Mask[i] < 0)))
return SDValue();

PreOp = CurOp;
}
SDLoc DL(N);
return DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
DAG.getNode(AArch64ISD::UADDV, DL, PreOp.getSimpleValueType(), PreOp),
DAG.getConstant(0, DL, MVT::i64));
}

/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
static SDValue performNEONPostLDSTCombine(SDNode *N,
Expand Down Expand Up @@ -9178,6 +9275,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::EXTRACT_VECTOR_ELT:
return performAcrossLaneReductionCombine(N, DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
Expand Down
53 changes: 53 additions & 0 deletions test/CodeGen/AArch64/aarch64-addv.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
; RUN: llc -march=aarch64 < %s | FileCheck %s

define i8 @f_v16i8(<16 x i8>* %arr) {
; CHECK-LABEL: f_v16i8
; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
%bin.rdx = load <16 x i8>, <16 x i8>* %arr
%rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
%rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef >
%bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
%rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
%bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
%rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
%bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
%r = extractelement <16 x i8> %bin.rdx14, i32 0
ret i8 %r
}

define i16 @f_v8i16(<8 x i16>* %arr) {
; CHECK-LABEL: f_v8i16
; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
%bin.rdx = load <8 x i16>, <8 x i16>* %arr
%rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef>
%bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
%rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
%rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
%r = extractelement <8 x i16> %bin.rdx14, i32 0
ret i16 %r
}

define i32 @f_v4i32( <4 x i32>* %arr) {
; CHECK-LABEL: f_v4i32
; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
%bin.rdx = load <4 x i32>, <4 x i32>* %arr
%rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
%bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
%rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
%bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
%r = extractelement <4 x i32> %bin.rdx13, i32 0
ret i32 %r
}

define i64 @f_v2i64(<2 x i64>* %arr) {
; CHECK-LABEL: f_v2i64
; CHECK-NOT: addv
%bin.rdx = load <2 x i64>, <2 x i64>* %arr
%rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
%bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
%r = extractelement <2 x i64> %bin.rdx0, i32 0
ret i64 %r
}

0 comments on commit ad69a64

Please sign in to comment.