Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose various floating-point intrinsics for Avx512F and Avx512DQ #85716

Merged
merged 10 commits into from
May 4, 2023
Merged
1 change: 1 addition & 0 deletions src/coreclr/jit/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -961,6 +961,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
void genHWIntrinsic_R_R_RM_R(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr);
void genHWIntrinsic_R_R_R_RM(
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, GenTree* op3);
void genHWIntrinsic_R_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, emitAttr attr, int8_t ival);
void genBaseIntrinsic(GenTreeHWIntrinsic* node);
void genX86BaseIntrinsic(GenTreeHWIntrinsic* node);
void genSSEIntrinsic(GenTreeHWIntrinsic* node);
Expand Down
157 changes: 157 additions & 0 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8675,6 +8675,109 @@ void emitter::emitIns_SIMD_R_R_S_R(
emitIns_R_S(ins, attr, targetReg, varx, offs);
}
}

//------------------------------------------------------------------------
// emitIns_SIMD_R_R_R_A_I: emits the code for a SIMD instruction that takes two register operands, a GenTreeIndir
//                         address, an immediate operand, and that returns a value in register
//
// Arguments:
//    ins       -- The instruction being emitted
//    attr      -- The emit attribute
//    targetReg -- The target register
//    op1Reg    -- The register of the first operand
//    op2Reg    -- The register of the second operand
//    indir     -- The GenTreeIndir used for the memory address
//    ival      -- The immediate value
//
// Notes:
//    The sequence is emitted in two steps: op1Reg is first moved into targetReg (the move is requested with
//    canSkip, so it is presumably elided when both registers already match), and then `ins` is emitted
//    against targetReg, op2Reg, the address, and the immediate. Because the move writes targetReg before `ins`
//    reads op2Reg, targetReg must not alias op2Reg unless it also aliases op1Reg.
//
void emitter::emitIns_SIMD_R_R_R_A_I(instruction   ins,
                                     emitAttr      attr,
                                     regNumber     targetReg,
                                     regNumber     op1Reg,
                                     regNumber     op2Reg,
                                     GenTreeIndir* indir,
                                     int           ival)
{
    assert(UseSimdEncoding());

    // Ensure the move below cannot overwrite op2 before the instruction consumes it
    assert((op2Reg != targetReg) || (op1Reg == targetReg));

    emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
    emitIns_R_R_A_I(ins, attr, targetReg, op2Reg, indir, ival, IF_RWR_RRD_ARD_CNS);
}

//------------------------------------------------------------------------
// emitIns_SIMD_R_R_R_C_I: emits the code for a SIMD instruction that takes two register operands, a field handle +
//                         offset, an immediate operand, and that returns a value in register
//
// Arguments:
//    ins       -- The instruction being emitted
//    attr      -- The emit attribute
//    targetReg -- The target register
//    op1Reg    -- The register of the first operand
//    op2Reg    -- The register of the second operand
//    fldHnd    -- The CORINFO_FIELD_HANDLE used for the memory address
//    offs      -- The offset added to the memory address from fldHnd
//    ival      -- The immediate value
//
// Notes:
//    The sequence is emitted in two steps: op1Reg is first moved into targetReg (the move is requested with
//    canSkip, so it is presumably elided when both registers already match), and then `ins` is emitted
//    against targetReg, op2Reg, the field address, and the immediate. Because the move writes targetReg before
//    `ins` reads op2Reg, targetReg must not alias op2Reg unless it also aliases op1Reg.
//
void emitter::emitIns_SIMD_R_R_R_C_I(instruction          ins,
                                     emitAttr             attr,
                                     regNumber            targetReg,
                                     regNumber            op1Reg,
                                     regNumber            op2Reg,
                                     CORINFO_FIELD_HANDLE fldHnd,
                                     int                  offs,
                                     int                  ival)
{
    assert(UseSimdEncoding());

    // Ensure the move below cannot overwrite op2 before the instruction consumes it
    assert((op2Reg != targetReg) || (op1Reg == targetReg));

    emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
    emitIns_R_R_C_I(ins, attr, targetReg, op2Reg, fldHnd, offs, ival);
}

//------------------------------------------------------------------------
// emitIns_SIMD_R_R_R_R_I: emits the code for a SIMD instruction that takes three register operands, an immediate
//                         operand, and that returns a value in register
//
// Arguments:
//    ins       -- The instruction being emitted
//    attr      -- The emit attribute
//    targetReg -- The target register
//    op1Reg    -- The register of the first operand
//    op2Reg    -- The register of the second operand
//    op3Reg    -- The register of the third operand
//    ival      -- The immediate value
//
// Notes:
//    The sequence is emitted in two steps: op1Reg is first moved into targetReg (the move is requested with
//    canSkip, so it is presumably elided when both registers already match), and then `ins` is emitted
//    against targetReg, op2Reg, op3Reg, and the immediate. Because the move writes targetReg before `ins`
//    reads op2Reg and op3Reg, targetReg must not alias either of them unless it also aliases op1Reg.
//
void emitter::emitIns_SIMD_R_R_R_R_I(
    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, regNumber op3Reg, int ival)
{
    assert(UseSimdEncoding());

    // Ensure the move below cannot overwrite op2 or op3 before the instruction consumes them
    assert((op2Reg != targetReg) || (op1Reg == targetReg));
    assert((op3Reg != targetReg) || (op1Reg == targetReg));

    emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
    emitIns_R_R_R_I(ins, attr, targetReg, op2Reg, op3Reg, ival);
}

//------------------------------------------------------------------------
// emitIns_SIMD_R_R_R_S_I: emits the code for a SIMD instruction that takes two register operands, a variable index +
//                         offset, an immediate operand, and that returns a value in register
//
// Arguments:
//    ins       -- The instruction being emitted
//    attr      -- The emit attribute
//    targetReg -- The target register
//    op1Reg    -- The register of the first operand
//    op2Reg    -- The register of the second operand
//    varx      -- The variable index used for the memory address
//    offs      -- The offset added to the memory address from varx
//    ival      -- The immediate value
//
// Notes:
//    The sequence is emitted in two steps: op1Reg is first moved into targetReg (the move is requested with
//    canSkip, so it is presumably elided when both registers already match), and then `ins` is emitted
//    against targetReg, op2Reg, the variable's address, and the immediate. Because the move writes targetReg
//    before `ins` reads op2Reg, targetReg must not alias op2Reg unless it also aliases op1Reg.
//
void emitter::emitIns_SIMD_R_R_R_S_I(instruction ins,
                                     emitAttr    attr,
                                     regNumber   targetReg,
                                     regNumber   op1Reg,
                                     regNumber   op2Reg,
                                     int         varx,
                                     int         offs,
                                     int         ival)
{
    assert(UseSimdEncoding());

    // Ensure the move below cannot overwrite op2 before the instruction consumes it
    assert((op2Reg != targetReg) || (op1Reg == targetReg));

    emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
    emitIns_R_R_S_I(ins, attr, targetReg, op2Reg, varx, offs, ival);
}
#endif // FEATURE_HW_INTRINSICS

/*****************************************************************************
Expand Down Expand Up @@ -18030,9 +18133,35 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_vcvttpd2qq:
case INS_vcvttpd2uqq:
case INS_vcvtuqq2pd:
case INS_vfixupimmpd:
case INS_vfixupimmps:
case INS_vfixupimmsd:
case INS_vfixupimmss:
case INS_vgetexppd:
case INS_vgetexpps:
case INS_vgetexpsd:
case INS_vgetexpss:
case INS_vgetmantpd:
case INS_vgetmantps:
case INS_vgetmantsd:
case INS_vgetmantss:
case INS_vrangepd:
case INS_vrangeps:
case INS_vrangesd:
case INS_vrangess:
case INS_vreducepd:
case INS_vreduceps:
case INS_vreducesd:
case INS_vreducess:
case INS_vscalefpd:
case INS_vscalefps:
case INS_vscalefsd:
case INS_vscalefss:
{
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_4C;
break;
}

case INS_vpmovdb:
case INS_vpmovdw:
Expand Down Expand Up @@ -18102,13 +18231,41 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
result.insLatency += PERFSCORE_LATENCY_4C;
break;

case INS_vrcp14pd:
case INS_vrcp14ps:
case INS_vrcp14sd:
case INS_vrcp14ss:
case INS_vrsqrt14pd:
case INS_vrsqrt14sd:
case INS_vrsqrt14ps:
case INS_vrsqrt14ss:
{
if (opSize == EA_64BYTE)
{
result.insThroughput = PERFSCORE_THROUGHPUT_2C;
result.insLatency += PERFSCORE_LATENCY_8C;
}
else
{
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_4C;
}
break;
}

case INS_roundpd:
case INS_roundps:
case INS_roundsd:
case INS_roundss:
case INS_vrndscalepd:
case INS_vrndscaleps:
case INS_vrndscalesd:
case INS_vrndscaless:
{
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_8C;
break;
}

case INS_cvttsd2si:
case INS_cvtsd2si:
Expand Down
31 changes: 31 additions & 0 deletions src/coreclr/jit/emitxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,37 @@ void emitIns_SIMD_R_R_C_R(instruction ins,
int offs);
void emitIns_SIMD_R_R_S_R(
instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg, int varx, int offs);

// SIMD emit helpers that take a target register, two source registers, a third operand, and an
// immediate value. The suffix letter before `_I` names the kind of third operand:
//   A -- a GenTreeIndir address
//   C -- a field handle plus offset
//   R -- a register
//   S -- a variable index plus offset

// Third operand is the memory address described by `indir`.
void emitIns_SIMD_R_R_R_A_I(instruction ins,
emitAttr attr,
regNumber targetReg,
regNumber op1Reg,
regNumber op2Reg,
GenTreeIndir* indir,
int ival);
// Third operand is the memory address formed from `fldHnd` plus `offs`.
void emitIns_SIMD_R_R_R_C_I(instruction ins,
emitAttr attr,
regNumber targetReg,
regNumber op1Reg,
regNumber op2Reg,
CORINFO_FIELD_HANDLE fldHnd,
int offs,
int ival);
// Third operand is the register `op3Reg`.
void emitIns_SIMD_R_R_R_R_I(instruction ins,
emitAttr attr,
regNumber targetReg,
regNumber op1Reg,
regNumber op2Reg,
regNumber op3Reg,
int ival);
// Third operand is the memory address formed from variable index `varx` plus `offs`.
void emitIns_SIMD_R_R_R_S_I(instruction ins,
emitAttr attr,
regNumber targetReg,
regNumber op1Reg,
regNumber op2Reg,
int varx,
int offs,
int ival);
#endif // FEATURE_HW_INTRINSICS

enum EmitCallType
Expand Down
57 changes: 53 additions & 4 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19236,12 +19236,15 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp)
assert(comp != nullptr);

#if defined(TARGET_XARCH)
GenTreeHWIntrinsic* hwintrinsic = AsHWIntrinsic();
NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId();

if (!comp->canUseVexEncoding())
{
return HWIntrinsicInfo::HasRMWSemantics(AsHWIntrinsic()->GetHWIntrinsicId());
return HWIntrinsicInfo::HasRMWSemantics(intrinsicId);
}

switch (AsHWIntrinsic()->GetHWIntrinsicId())
switch (intrinsicId)
{
// TODO-XArch-Cleanup: Move this switch block to be table driven.

Expand Down Expand Up @@ -19269,6 +19272,50 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp)
return true;
}

case NI_AVX512F_Fixup:
case NI_AVX512F_FixupScalar:
case NI_AVX512F_VL_Fixup:
{
// We are actually only RMW in the case where the lookup table
// has any value that could result in `op1` being picked. So
// in the case `op3` is a constant and none of the nibbles are
// `0`, then we don't have to be RMW and can actually "drop" `op1`

GenTree* op3 = hwintrinsic->Op(3);

if (!op3->IsCnsVec())
{
return true;
}

GenTreeVecCon* vecCon = op3->AsVecCon();

var_types simdBaseType = hwintrinsic->GetSimdBaseType();
unsigned simdSize = hwintrinsic->GetSimdSize();
uint32_t count = simdSize / sizeof(uint32_t);
uint32_t incSize = (simdBaseType == TYP_FLOAT) ? 1 : 2;

if (intrinsicId == NI_AVX512F_FixupScalar)
{
// Upper elements come from op2
count = 1;
}

for (uint32_t i = 0; i < count; i += incSize)
{
uint32_t tbl = vecCon->gtSimdVal.u32[i];

if (((tbl & 0x0000000F) == 0) || ((tbl & 0x000000F0) == 0) || ((tbl & 0x00000F00) == 0) ||
((tbl & 0x0000F000) == 0) || ((tbl & 0x000F0000) == 0) || ((tbl & 0x00F00000) == 0) ||
((tbl & 0x0F000000) == 0) || ((tbl & 0xF0000000) == 0))
{
return true;
}
}

return false;
}

default:
{
return false;
Expand Down Expand Up @@ -20489,7 +20536,8 @@ GenTree* Compiler::gtNewSimdCeilNode(var_types type, GenTree* op1, CorInfoType s
else if (simdSize == 64)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
intrinsic = NI_AVX512F_Ceiling;
GenTree* op2 = gtNewIconNode(static_cast<int32_t>(FloatRoundingMode::ToPositiveInfinity));
return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX512F_RoundScale, simdBaseJitType, simdSize);
}
else
{
Expand Down Expand Up @@ -22060,7 +22108,8 @@ GenTree* Compiler::gtNewSimdFloorNode(var_types type, GenTree* op1, CorInfoType
else if (simdSize == 64)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
intrinsic = NI_AVX512F_Floor;
GenTree* op2 = gtNewIconNode(static_cast<int32_t>(FloatRoundingMode::ToNegativeInfinity));
return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX512F_RoundScale, simdBaseJitType, simdSize);
}
else
{
Expand Down
13 changes: 11 additions & 2 deletions src/coreclr/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1009,11 +1009,14 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
switch (numArgs)
{
case 0:
{
assert(!isScalar);
retNode = gtNewSimdHWIntrinsicNode(retType, intrinsic, simdBaseJitType, simdSize);
break;
}

case 1:
{
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd);

if ((category == HW_Category_MemoryLoad) && op1->OperIs(GT_CAST))
Expand Down Expand Up @@ -1067,8 +1070,10 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
#endif // TARGET_XARCH

break;
}

case 2:
{
op2 = getArgForHWIntrinsic(sigReader.GetOp2Type(), sigReader.op2ClsHnd);
op2 = addRangeCheckIfNeeded(intrinsic, op2, mustExpand, immLowerBound, immUpperBound);
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd);
Expand Down Expand Up @@ -1121,8 +1126,10 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
}
#endif
break;
}

case 3:
{
op3 = getArgForHWIntrinsic(sigReader.GetOp3Type(), sigReader.op3ClsHnd);
op2 = getArgForHWIntrinsic(sigReader.GetOp2Type(), sigReader.op2ClsHnd);
op1 = getArgForHWIntrinsic(sigReader.GetOp1Type(), sigReader.op1ClsHnd);
Expand Down Expand Up @@ -1164,9 +1171,10 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
}
#endif
break;
}

#ifdef TARGET_ARM64
case 4:
{
op4 = getArgForHWIntrinsic(sigReader.GetOp4Type(), sigReader.op4ClsHnd);
op4 = addRangeCheckIfNeeded(intrinsic, op4, mustExpand, immLowerBound, immUpperBound);
op3 = getArgForHWIntrinsic(sigReader.GetOp3Type(), sigReader.op3ClsHnd);
Expand All @@ -1176,7 +1184,8 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
assert(!isScalar);
retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, op4, intrinsic, simdBaseJitType, simdSize);
break;
#endif
}

default:
break;
}
Expand Down
Loading