Skip to content

Commit

Permalink
AMDGPU: Add 24-bit mul intrinsics
Browse files Browse the repository at this point in the history
Insert these during codegenprepare.

This works around a DAG issue where generic combines eliminate the and
asserting the high bits are zero, which then exposes an unknown read
source to the mul combine. It doesn't worth the hassle of trying to
insert an AssertZext or something to try to deal with it.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366094 91177308-0d34-0410-b5e6-96231b3b80d8
  • Loading branch information
arsenm committed Jul 15, 2019
1 parent 260d1e9 commit b508009
Show file tree
Hide file tree
Showing 9 changed files with 751 additions and 11 deletions.
10 changes: 10 additions & 0 deletions include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1350,6 +1350,16 @@ def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]
>;

// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
Expand Down
127 changes: 127 additions & 0 deletions lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
AssumptionCache *AC = nullptr;
LegacyDivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
bool HasUnsafeFPMath = false;

/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
Expand Down Expand Up @@ -133,6 +134,16 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;


unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
bool isI24(Value *V, unsigned ScalarSize) const;
bool isU24(Value *V, unsigned ScalarSize) const;

/// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
/// SelectionDAG has an issue where an and asserting the bits are known
bool replaceMulWithMul24(BinaryOperator &I) const;

/// Expands 24 bit div or rem.
Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den,
Expand Down Expand Up @@ -392,6 +403,118 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}

unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
unsigned ScalarSize) const {
KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
return ScalarSize - Known.countMinLeadingZeros();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
unsigned ScalarSize) const {
// In order for this to be a signed 24-bit value, bit 23, must
// be a sign bit.
return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
}

bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
return ScalarSize >= 24 && // Types less than 24-bit should be treated
// as unsigned 24-bit values.
numBitsSigned(V, ScalarSize) < 24;
}

bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
return numBitsUnsigned(V, ScalarSize) <= 24;
}

static void extractValues(IRBuilder<> &Builder,
SmallVectorImpl<Value *> &Values, Value *V) {
VectorType *VT = dyn_cast<VectorType>(V->getType());
if (!VT) {
Values.push_back(V);
return;
}

for (int I = 0, E = VT->getNumElements(); I != E; ++I)
Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
Type *Ty,
SmallVectorImpl<Value *> &Values) {
if (Values.size() == 1)
return Values[0];

Value *NewVal = UndefValue::get(Ty);
for (int I = 0, E = Values.size(); I != E; ++I)
NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

return NewVal;
}

bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
if (I.getOpcode() != Instruction::Mul)
return false;

Type *Ty = I.getType();
unsigned Size = Ty->getScalarSizeInBits();
if (Size <= 16 && ST->has16BitInsts())
return false;

// Prefer scalar if this could be s_mul_i32
if (DA->isUniform(&I))
return false;

Value *LHS = I.getOperand(0);
Value *RHS = I.getOperand(1);
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());

Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

// TODO: Should this try to match mulhi24?
if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
IntrID = Intrinsic::amdgcn_mul_u24;
} else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
IntrID = Intrinsic::amdgcn_mul_i24;
} else
return false;

SmallVector<Value *, 4> LHSVals;
SmallVector<Value *, 4> RHSVals;
SmallVector<Value *, 4> ResultVals;
extractValues(Builder, LHSVals, LHS);
extractValues(Builder, RHSVals, RHS);


IntegerType *I32Ty = Builder.getInt32Ty();
FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
for (int I = 0, E = LHSVals.size(); I != E; ++I) {
Value *LHS, *RHS;
if (IntrID == Intrinsic::amdgcn_mul_u24) {
LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
} else {
LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
}

Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});

if (IntrID == Intrinsic::amdgcn_mul_u24) {
ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
LHSVals[I]->getType()));
} else {
ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
LHSVals[I]->getType()));
}
}

I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
I.eraseFromParent();

return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)
Expand Down Expand Up @@ -756,6 +879,9 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;

if (replaceMulWithMul24(I))
return true;

bool Changed = false;
Instruction::BinaryOps Opc = I.getOpcode();
Type *Ty = I.getType();
Expand Down Expand Up @@ -882,6 +1008,7 @@ bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Mod = &M;
DL = &Mod->getDataLayout();
return false;
}

Expand Down
5 changes: 5 additions & 0 deletions lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5836,6 +5836,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_cos:
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

case Intrinsic::amdgcn_mul_u24:
return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_mul_i24:
return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));

case Intrinsic::amdgcn_log_clamp: {
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
Expand Down
Loading

0 comments on commit b508009

Please sign in to comment.