Skip to content

Commit

Permalink
Expose various Convert intrinsics for Avx512F, Avx512BW, and Avx512DQ (#85281)
Browse files Browse the repository at this point in the history

* Expose various Convert intrinsics for Avx512F and Avx512DQ

* Expose various integer conversion APIs for Avx512F and Avx512BW

* Ensure special instructions are handled in codegen

* Apply formatting patch

* Ensure the AVX512F_VL variant is picked for simdSize=16/32

* Ensure conversion instructions are handled in PERFSCORE

* Ensure instructions use the right tuple type

* Remove an invalid API and fix more PERFSCORE entries

* Resolve additional failures masked by #85056

* Ensure TieredCompilation=0 is also passing

* Apply formatting patch

* Fixing some more test edge cases

* Ensure uint64->double and uint64->float mask the input
  • Loading branch information
tannergooding authored Apr 25, 2023
1 parent 5b9848f commit f1a4cdd
Show file tree
Hide file tree
Showing 20 changed files with 2,963 additions and 302 deletions.
41 changes: 35 additions & 6 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5658,18 +5658,47 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
break;
}

case NI_AVX512F_ConvertToVector256Int32:
case NI_AVX512F_ConvertToVector256UInt32:
case NI_AVX512F_VL_ConvertToVector128UInt32:
case NI_AVX512F_VL_ConvertToVector128UInt32WithSaturation:
{
assert(!varTypeIsFloating(baseType));
FALLTHROUGH;
}

case NI_AVX512F_ConvertToVector128Byte:
case NI_AVX512F_ConvertToVector128ByteWithSaturation:
case NI_AVX512F_ConvertToVector128Int16:
case NI_AVX512F_ConvertToVector128Int32:
case NI_AVX512F_ConvertToVector128Int16WithSaturation:
case NI_AVX512F_ConvertToVector128SByte:
case NI_AVX512F_ConvertToVector128SByteWithSaturation:
case NI_AVX512F_ConvertToVector128UInt16:
case NI_AVX512F_ConvertToVector128UInt32:
case NI_AVX512F_ConvertToVector128UInt16WithSaturation:
case NI_AVX512F_ConvertToVector256Int16:
case NI_AVX512F_ConvertToVector256Int32:
case NI_AVX512F_ConvertToVector256Int16WithSaturation:
case NI_AVX512F_ConvertToVector256Int32WithSaturation:
case NI_AVX512F_ConvertToVector256UInt16:
case NI_AVX512F_ConvertToVector256UInt32:
case NI_AVX512BW_ConvertToVector128Byte:
case NI_AVX512BW_ConvertToVector128SByte:
case NI_AVX512F_ConvertToVector256UInt16WithSaturation:
case NI_AVX512F_ConvertToVector256UInt32WithSaturation:
case NI_AVX512F_VL_ConvertToVector128Byte:
case NI_AVX512F_VL_ConvertToVector128ByteWithSaturation:
case NI_AVX512F_VL_ConvertToVector128Int16:
case NI_AVX512F_VL_ConvertToVector128Int16WithSaturation:
case NI_AVX512F_VL_ConvertToVector128Int32:
case NI_AVX512F_VL_ConvertToVector128Int32WithSaturation:
case NI_AVX512F_VL_ConvertToVector128SByte:
case NI_AVX512F_VL_ConvertToVector128SByteWithSaturation:
case NI_AVX512F_VL_ConvertToVector128UInt16:
case NI_AVX512F_VL_ConvertToVector128UInt16WithSaturation:
case NI_AVX512BW_ConvertToVector256Byte:
case NI_AVX512BW_ConvertToVector256ByteWithSaturation:
case NI_AVX512BW_ConvertToVector256SByte:
case NI_AVX512BW_ConvertToVector256SByteWithSaturation:
case NI_AVX512BW_VL_ConvertToVector128Byte:
case NI_AVX512BW_VL_ConvertToVector128ByteWithSaturation:
case NI_AVX512BW_VL_ConvertToVector128SByte:
case NI_AVX512BW_VL_ConvertToVector128SByteWithSaturation:
{
// These intrinsics are "ins reg/mem, xmm"
ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
Expand Down
69 changes: 61 additions & 8 deletions src/coreclr/jit/emit.h
Original file line number Diff line number Diff line change
Expand Up @@ -1905,7 +1905,7 @@ class emitter
ssize_t emitGetInsCIdisp(instrDesc* id);
unsigned emitGetInsCIargs(instrDesc* id);

inline static emitAttr emitGetMemOpSize(instrDesc* id);
inline emitAttr emitGetMemOpSize(instrDesc* id) const;

// Return the argument count for a direct call "id".
int emitGetInsCDinfo(instrDesc* id);
Expand Down Expand Up @@ -3456,11 +3456,12 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
// Arguments:
// id - Instruction descriptor
//
/* static */ emitAttr emitter::emitGetMemOpSize(instrDesc* id)
emitAttr emitter::emitGetMemOpSize(instrDesc* id) const
{
emitAttr defaultSize = id->idOpSize();
emitAttr defaultSize = id->idOpSize();
instruction ins = id->idIns();

switch (id->idIns())
switch (ins)
{
case INS_pextrb:
case INS_pinsrb:
Expand Down Expand Up @@ -3570,9 +3571,6 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)

case INS_cvtdq2pd:
case INS_cvtps2pd:
case INS_vpmovdw:
case INS_vpmovqd:
case INS_vpmovwb:
{
if (defaultSize == 64)
{
Expand All @@ -3589,6 +3587,57 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
}
}

case INS_vpmovdb:
case INS_vpmovdw:
case INS_vpmovqb:
case INS_vpmovqd:
case INS_vpmovqw:
case INS_vpmovwb:
case INS_vpmovsdb:
case INS_vpmovsdw:
case INS_vpmovsqb:
case INS_vpmovsqd:
case INS_vpmovsqw:
case INS_vpmovswb:
case INS_vpmovusdb:
case INS_vpmovusdw:
case INS_vpmovusqb:
case INS_vpmovusqd:
case INS_vpmovusqw:
case INS_vpmovuswb:
{
insTupleType tupleType = insTupleTypeInfo(ins);
unsigned memSize = 0;

switch (tupleType)
{
case INS_TT_HALF_MEM:
{
memSize = defaultSize / 2;
break;
}

case INS_TT_QUARTER_MEM:
{
memSize = defaultSize / 4;
break;
}

case INS_TT_EIGHTH_MEM:
{
memSize = defaultSize / 8;
break;
}

default:
{
unreached();
}
}

return EA_ATTR(memSize);
}

case INS_vbroadcastf128:
case INS_vbroadcasti128:
case INS_vextractf128:
Expand All @@ -3613,7 +3662,11 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)

case INS_movddup:
{
if (defaultSize == 32)
if (defaultSize == 64)
{
return EA_64BYTE;
}
else if (defaultSize == 32)
{
return EA_32BYTE;
}
Expand Down
100 changes: 88 additions & 12 deletions src/coreclr/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1362,6 +1362,10 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
case INS_shlx:
case INS_shrx:
#endif // TARGET_AMD64
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
case INS_vcvttss2usi:
{
if (attr == EA_8BYTE)
{
Expand Down Expand Up @@ -2582,6 +2586,10 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id)
case INS_sarx:
case INS_shrx:
#endif
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
case INS_vcvttss2usi:
{
// These SSE instructions write to a general purpose integer register.
return false;
Expand Down Expand Up @@ -3010,7 +3018,7 @@ inline bool hasTupleTypeInfo(instruction ins)
// Return Value:
// the tuple type info for a given CPU instruction.
//
inline insTupleType insTupleTypeInfo(instruction ins)
insTupleType emitter::insTupleTypeInfo(instruction ins) const
{
assert((unsigned)ins < ArrLen(insTupleTypeInfos));
assert(insTupleTypeInfos[ins] != INS_TT_NONE);
Expand All @@ -3020,9 +3028,9 @@ inline insTupleType insTupleTypeInfo(instruction ins)
// Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h.
bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
{
const size_t SSE38 = 0x0F660038;
const size_t SSE3A = 0x0F66003A;
const size_t MASK = 0xFFFF00FF;
const size_t SSE38 = 0x0F000038;
const size_t SSE3A = 0x0F00003A;
const size_t MASK = 0xFF0000FF;

size_t insCode = 0;

Expand All @@ -3044,8 +3052,19 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
insCode = insCodeMR(ins);
}

insCode &= MASK;
return insCode == SSE38 || insCode == SSE3A;
size_t mskCode = insCode & MASK;

if ((mskCode != SSE38) && (mskCode != SSE3A))
{
return false;
}

#if defined(DEBUG)
insCode = (insCode >> 16) & 0xFF;
assert((insCode == 0x66) || (insCode == 0xF2) || (insCode == 0xF3));
#endif // DEBUG

return true;
}

/*****************************************************************************
Expand Down Expand Up @@ -11214,6 +11233,10 @@ void emitter::emitDispIns(
case INS_cvtss2si:
case INS_cvtsd2si:
case INS_cvttss2si:
case INS_vcvtsd2usi:
case INS_vcvtss2usi:
case INS_vcvttsd2usi:
case INS_vcvttss2usi:
{
printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE));
break;
Expand Down Expand Up @@ -15528,9 +15551,9 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI
disp8Compression = inputSize * 4;
break;
case INS_TT_TUPLE8:
// N = input size in bytes * 4, 32bit for 512 only
// N = input size in bytes * 8, 32bit for 512 only
assert((inputSize == 4 && vectorLength >= 64));
disp8Compression = inputSize * 4;
disp8Compression = inputSize * 8;
break;
case INS_TT_HALF_MEM:
// N = vector length in bytes / 2
Expand Down Expand Up @@ -17825,11 +17848,39 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_cvttps2dq:
case INS_cvtps2dq:
case INS_cvtdq2ps:
case INS_vcvtpd2qq:
case INS_vcvtpd2uqq:
case INS_vcvtps2udq:
case INS_vcvtqq2pd:
case INS_vcvttps2udq:
case INS_vcvtudq2ps:
case INS_vcvttpd2qq:
case INS_vcvttpd2uqq:
case INS_vcvtuqq2pd:
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_4C;
break;

case INS_vpmovdb:
case INS_vpmovdw:
case INS_vpmovqb:
case INS_vpmovqd:
case INS_vpmovqw:
case INS_vpmovsdb:
case INS_vpmovsdw:
case INS_vpmovsqb:
case INS_vpmovsqd:
case INS_vpmovsqw:
case INS_vpmovswb:
case INS_vpmovusdb:
case INS_vpmovusdw:
case INS_vpmovusqb:
case INS_vpmovusqd:
case INS_vpmovusqw:
case INS_vpmovuswb:
case INS_vpmovwb:
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += PERFSCORE_LATENCY_4C;
result.insThroughput = PERFSCORE_THROUGHPUT_2C;
result.insLatency += (opSize == EA_16BYTE) ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_4C;
break;

case INS_haddps:
Expand Down Expand Up @@ -17892,12 +17943,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_cvtsi2ss32:
case INS_cvtsi2sd64:
case INS_cvtsi2ss64:
case INS_vcvtsd2usi:
case INS_vcvttsd2usi:
case INS_vcvtusi2sd32:
case INS_vcvtusi2sd64:
case INS_vcvtusi2ss32:
case INS_vcvtusi2ss64:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += PERFSCORE_LATENCY_7C;
break;

case INS_cvttss2si:
case INS_cvtss2si:
case INS_vcvtss2usi:
case INS_vcvttss2usi:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C;
break;
Expand Down Expand Up @@ -18241,6 +18300,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_cvtdq2pd:
case INS_cvtpd2ps:
case INS_cvttpd2dq:
case INS_vcvtpd2udq:
case INS_vcvtps2qq:
case INS_vcvtps2uqq:
case INS_vcvtqq2ps:
case INS_vcvttpd2udq:
case INS_vcvttps2qq:
case INS_vcvttps2uqq:
case INS_vcvtudq2pd:
case INS_vcvtuqq2ps:
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_7C : PERFSCORE_LATENCY_5C;
break;
Expand Down Expand Up @@ -18282,17 +18350,25 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
case INS_vpbroadcastq_gpr:
case INS_vbroadcasti128:
case INS_vbroadcastf128:
case INS_vbroadcastf64x2:
case INS_vbroadcasti64x2:
case INS_vbroadcastf64x4:
case INS_vbroadcasti64x4:
case INS_vbroadcastf32x2:
case INS_vbroadcasti32x2:
case INS_vbroadcastf32x8:
case INS_vbroadcasti32x8:
case INS_vbroadcastss:
case INS_vbroadcastsd:
if (memAccessKind == PERFSCORE_MEMORY_NONE)
{
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency = opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C;
result.insLatency = opSize == EA_16BYTE ? PERFSCORE_LATENCY_1C : PERFSCORE_LATENCY_3C;
}
else
{
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C;
result.insLatency += opSize == EA_16BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_3C;
if (ins == INS_vpbroadcastb || ins == INS_vpbroadcastw)
{
result.insLatency += PERFSCORE_LATENCY_1C;
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/emitxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
return code;
}

insTupleType insTupleTypeInfo(instruction ins) const;

//------------------------------------------------------------------------
// HasKMaskRegisterDest: Temporary check to identify instructions that can
// be Evex encoded but require Opmask(KMask) register support.
Expand Down
Loading

0 comments on commit f1a4cdd

Please sign in to comment.