Initial support for zmm in .NET (#80960)
* Adding simd64 and zmm types.

* Adding other infrastructure to lower/emit code for simd64.

* Lowering + workarounds for simd64 constants.

* Fix lowering logic for Create().

* Save/restore for zmm

* Add ToDo comments for AVX512BW

* Separate AVX512F and AVX512BW + Fix disassembly.

* Review changes.

* Fixing throughput issue.

* Additional Review comments.

* Setting 'EnableIncompleteISAClass="1"' for relevant CI runs.

* Removing 64 bit flags + correcting check.

* Apply formatting patch

---------

Co-authored-by: Tanner Gooding <tagoo@outlook.com>
DeepakRajendrakumaran and tannergooding authored Mar 6, 2023
1 parent 4eca676 commit 6123cb0
Showing 43 changed files with 1,357 additions and 498 deletions.
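For orientation before the per-file diffs: the core addition is a 64-byte (zmm-sized) vector type, TYP_SIMD64, plus the plumbing to materialize its constants. Below is a minimal standalone sketch of the shapes involved (a hedged approximation only; the real simd32_t/simd64_t live in src/coreclr/jit/simd.h and differ in detail). The two-halves duplication mirrors the codegenxarch.cpp hunk further down, which works around GenTreeVecCon not yet carrying a gtSimd64Val field.

#include <cstdint>

// Hedged sketch only; not the actual JIT definitions.
struct simd32_t
{
    uint64_t u64[4]; // 32 bytes: one ymm register
};

union simd64_t
{
    uint64_t u64[8];  // 64 bytes: one zmm register
    simd32_t v256[2]; // viewed as two 32-byte halves
};

// Interim constant workaround (see genSetRegToConst below): until GenTreeVecCon
// supports gtSimd64Val, a simd64 constant is the 32-byte constant duplicated
// into both halves.
simd64_t widenSimd32Constant(const simd32_t& half)
{
    simd64_t value;
    value.v256[0] = half;
    value.v256[1] = half;
    return value;
}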
2 changes: 2 additions & 0 deletions src/coreclr/jit/assertionprop.cpp
@@ -3222,6 +3222,7 @@ GenTree* Compiler::optVNConstantPropOnTree(BasicBlock* block, GenTree* tree)

#if defined(TARGET_XARCH)
case TYP_SIMD32:
+ case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
{
simd32_t value = vnStore->ConstantValue<simd32_t>(vnCns);

@@ -3231,6 +3232,7 @@ GenTree* Compiler::optVNConstantPropOnTree(BasicBlock* block, GenTree* tree)
conValTree = vecCon;
break;
}
+ break;
#endif // TARGET_XARCH
#endif // FEATURE_SIMD

2 changes: 1 addition & 1 deletion src/coreclr/jit/codegen.h
@@ -946,7 +946,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
void genSSE2Intrinsic(GenTreeHWIntrinsic* node);
void genSSE41Intrinsic(GenTreeHWIntrinsic* node);
void genSSE42Intrinsic(GenTreeHWIntrinsic* node);
- void genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node);
+ void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node);
void genAESIntrinsic(GenTreeHWIntrinsic* node);
void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node);
void genFMAIntrinsic(GenTreeHWIntrinsic* node);
49 changes: 46 additions & 3 deletions src/coreclr/jit/codegenxarch.cpp
@@ -504,7 +504,37 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre

if (vecCon->IsZero())
{
- if ((attr != EA_32BYTE) || compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
+ bool isSupported;
+
+ switch (attr)
+ {
+ case EA_32BYTE:
+ {
+ isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX);
+ break;
+ }
+
+ case EA_64BYTE:
+ {
+ isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F);
+ break;
+ }
+
+ case EA_8BYTE:
+ case EA_16BYTE:
+ {
+ assert((attr == EA_8BYTE) || (attr == EA_16BYTE));
+ isSupported = true;
+ break;
+ }
+
+ default:
+ {
+ unreached();
+ }
+ }
+
+ if (isSupported)
{
#if defined(FEATURE_SIMD)
emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
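An aside on the zeroing idiom above (background, not from the diff): VEX- and EVEX-encoded instructions zero the destination register's upper bits, so the 128-bit xorps-with-self idiom clears the entire ymm/zmm register; the isSupported switch only gates on whether the wider register file (AVX for 32 bytes, AVX512F for 64) exists at all. The same idiom in standalone intrinsics, for illustration:

#include <immintrin.h>

// When compiled with AVX/AVX-512 enabled, each of these becomes a single
// vxorps of the register with itself; one XOR zeroes the full register.
__m128 zeroXmm() { return _mm_setzero_ps(); }    // 16 bytes
__m256 zeroYmm() { return _mm256_setzero_ps(); } // 32 bytes (AVX)
__m512 zeroZmm() { return _mm512_setzero_ps(); } // 64 bytes (AVX512F)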
@@ -551,6 +581,18 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
break;
}

+ case TYP_SIMD64:
+ {
+ simd64_t constValue;
+ // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
+ constValue.v256[0] = vecCon->gtSimd32Val;
+ constValue.v256[1] = vecCon->gtSimd32Val;
+ CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue);
+
+ emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+ break;
+ }
#endif // FEATURE_SIMD

default:
@@ -5730,7 +5772,8 @@ void CodeGen::genCall(GenTreeCall* call)
// To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue
// VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens
// when there's preceding 256-bit AVX to legacy SSE transition penalty.
- if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && GetEmitter()->Contains256bitAVX())
+ // This applies to 512-bit AVX512 instructions as well.
+ if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX()))
{
assert(compiler->canUseVexEncoding());
instGen(INS_vzeroupper);
@@ -11026,7 +11069,7 @@ void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
bool emitVzeroUpper = false;
if (check256bitOnly)
{
- emitVzeroUpper = GetEmitter()->Contains256bitAVX();
+ emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX();
}
else
{
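Restating the VZEROUPPER policy above in one place (an illustrative simplification; names below are not the JIT's): 256-bit-or-wider AVX code leaves dirty upper ymm/zmm state, later legacy-SSE instructions then pay a transition penalty, and native callees reached via PInvoke may well run legacy SSE, hence the cleanup before exactly those calls.

// Illustrative-only sketch of the policy in genCall/genVzeroupperIfNeeded above.
bool shouldEmitVzeroupperBeforeCall(bool isPInvokeUserCall, bool methodUses256bitOrWiderAVX)
{
    // 128-bit VEX instructions zero upper bits as they go and never dirty
    // the state, which is why only 256-bit-or-wider usage triggers this.
    return isPInvokeUserCall && methodUses256bitOrWiderAVX;
}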
38 changes: 36 additions & 2 deletions src/coreclr/jit/compiler.cpp
@@ -2280,6 +2280,40 @@ void Compiler::compSetProcessor()
{
instructionSetFlags.AddInstructionSet(InstructionSet_Vector256);
}
+ // The x86-64-v4 feature level requires AVX512F, AVX512BW, AVX512CD, AVX512DQ and AVX512VL, and
+ // AVX512F/AVX512BW/AVX512CD/AVX512DQ/AVX512VL have historically shipped together.
+ // It is therefore unlikely that future CPUs will support just one of them, and it is
+ // not worth the additional complexity in the JIT to support that.
+ if (instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F) &&
+ instructionSetFlags.HasInstructionSet(InstructionSet_AVX512BW) &&
+ instructionSetFlags.HasInstructionSet(InstructionSet_AVX512DQ))
+ {
+ if (!DoJitStressEvexEncoding())
+ {
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F_VL);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW_VL);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ_VL);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD_VL);
+ #ifdef TARGET_AMD64
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F_X64);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F_VL_X64);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW_X64);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW_VL_X64);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD_X64);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD_VL_X64);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ_X64);
+ instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ_VL_X64);
+ #endif // TARGET_AMD64
+ }
+ else
+ {
+ instructionSetFlags.AddInstructionSet(InstructionSet_Vector512);
+ }
+ }
#elif defined(TARGET_ARM64)
if (instructionSetFlags.HasInstructionSet(InstructionSet_AdvSimd))
{
@@ -2297,14 +2331,14 @@ void Compiler::compSetProcessor()
if (canUseEvexEncoding())
{
codeGen->GetEmitter()->SetUseEvexEncoding(true);
- // TODO-XArch-AVX512: Revisit other flags to be set once avx512 instructions are added.
+ // TODO-XArch-AVX512 : Revisit other flags to be set once avx512 instructions are added.
}
if (canUseVexEncoding())
{
codeGen->GetEmitter()->SetUseVEXEncoding(true);
// Assume each JITted method does not contain AVX instruction at first
codeGen->GetEmitter()->SetContainsAVX(false);
- codeGen->GetEmitter()->SetContains256bitAVX(false);
+ codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false);
}
}
#endif // TARGET_XARCH
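The net effect of the compSetProcessor changes, stated plainly: at this commit AVX-512 is opt-in. Unless EVEX stress is requested the whole AVX-512 bundle is stripped back out, leaving shipping behavior unchanged; with it, the Vector512 ISA is surfaced. A hedged standalone restatement (simplified names, not the JIT's API):

// Illustrative only; mirrors the policy in Compiler::compSetProcessor above.
bool surfaceVector512(bool hasAvx512F, bool hasAvx512BW, bool hasAvx512DQ, bool stressEvexRequested)
{
    // The baseline bundle must be complete; these ISAs have always shipped
    // together, so partial combinations are not modeled.
    bool hasBaselineBundle = hasAvx512F && hasAvx512BW && hasAvx512DQ;
    return hasBaselineBundle && stressEvexRequested;
}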
74 changes: 70 additions & 4 deletions src/coreclr/jit/compiler.h
@@ -7629,7 +7629,8 @@ class Compiler
static bool varTypeNeedsPartialCalleeSave(var_types type)
{
assert(type != TYP_STRUCT);
- return (type == TYP_SIMD32);
+ assert((type < TYP_SIMD32) || (type == TYP_SIMD32) || (type == TYP_SIMD64));
+ return type >= TYP_SIMD32;
}
#elif defined(TARGET_ARM64)
static bool varTypeNeedsPartialCalleeSave(var_types type)
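Background for the xarch predicate above (ABI context, not from the diff): on Windows x64 only the low 128 bits of the callee-saved xmm registers survive a call; the upper bits of ymm, and now zmm, are call-clobbered, so the JIT itself must save and restore the upper halves of live wide SIMD values around calls. That is what partial callee save tracks, and why the predicate grows from exactly TYP_SIMD32 to every SIMD type of 32 bytes or more.

// Hedged sketch of the predicate's intent (the real version is the hunk above).
// simdByteSize is the SIMD type's size in bytes: 8, 12, 16, 32 or 64.
bool needsPartialCalleeSave(unsigned simdByteSize)
{
    // 16-byte values fit entirely within callee-saved xmm state; wider
    // values have call-clobbered upper bits needing JIT save/restore.
    return simdByteSize >= 32;
}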
@@ -8328,6 +8329,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#if defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
+ if (compOpportunisticallyDependsOn(InstructionSet_Vector512))
+ {
+ return SIMD_Vector512_Supported;
+ }
+
return SIMD_AVX2_Supported;
}

@@ -8443,12 +8449,26 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
CORINFO_CLASS_HANDLE Vector256ULongHandle;
CORINFO_CLASS_HANDLE Vector256NIntHandle;
CORINFO_CLASS_HANDLE Vector256NUIntHandle;

+ CORINFO_CLASS_HANDLE Vector512FloatHandle;
+ CORINFO_CLASS_HANDLE Vector512DoubleHandle;
+ CORINFO_CLASS_HANDLE Vector512IntHandle;
+ CORINFO_CLASS_HANDLE Vector512UShortHandle;
+ CORINFO_CLASS_HANDLE Vector512UByteHandle;
+ CORINFO_CLASS_HANDLE Vector512ShortHandle;
+ CORINFO_CLASS_HANDLE Vector512ByteHandle;
+ CORINFO_CLASS_HANDLE Vector512LongHandle;
+ CORINFO_CLASS_HANDLE Vector512UIntHandle;
+ CORINFO_CLASS_HANDLE Vector512ULongHandle;
+ CORINFO_CLASS_HANDLE Vector512NIntHandle;
+ CORINFO_CLASS_HANDLE Vector512NUIntHandle;
#endif // defined(TARGET_XARCH)
#endif // FEATURE_HW_INTRINSICS

CORINFO_CLASS_HANDLE CanonicalSimd8Handle;
CORINFO_CLASS_HANDLE CanonicalSimd16Handle;
CORINFO_CLASS_HANDLE CanonicalSimd32Handle;
+ CORINFO_CLASS_HANDLE CanonicalSimd64Handle;

SIMDHandlesCache()
{
@@ -8515,6 +8535,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

#if defined(TARGET_XARCH)
case TYP_SIMD32:
+ case TYP_SIMD64:
break;
#endif // TARGET_XARCH

@@ -8622,7 +8643,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#if defined(TARGET_XARCH)
case TYP_SIMD32:
return m_simdHandleCache->CanonicalSimd32Handle;
+ case TYP_SIMD64:
+ return m_simdHandleCache->CanonicalSimd64Handle;
#endif // TARGET_XARCH

default:
unreached();
}
@@ -8757,7 +8781,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
var_types getSIMDVectorType()
{
#if defined(TARGET_XARCH)
- if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
+ // TODO-XArch-AVX512 : Return TYP_SIMD64 once Vector<T> supports AVX512.
+ if (getSIMDSupportLevel() >= SIMD_AVX2_Supported)
{
return TYP_SIMD32;
}
@@ -8798,7 +8823,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
unsigned getSIMDVectorRegisterByteLength()
{
#if defined(TARGET_XARCH)
- if (getSIMDSupportLevel() == SIMD_AVX2_Supported)
+ // TODO-XArch-AVX512 : Return ZMM_REGSIZE_BYTES once Vector<T> supports AVX512.
+ if (getSIMDSupportLevel() >= SIMD_AVX2_Supported)
{
return YMM_REGSIZE_BYTES;
}
@@ -8829,6 +8855,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_AVX))
{
+ if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+ {
+ return ZMM_REGSIZE_BYTES;
+ }
return YMM_REGSIZE_BYTES;
}
else
@@ -8870,6 +8900,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
{
simdType = TYP_SIMD32;
}
+ else if (size == 64)
+ {
+ simdType = TYP_SIMD64;
+ }
#endif // TARGET_XARCH
else
{
@@ -8906,7 +8940,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
// otherwise cause the highest level of instruction set support to be reported to crossgen2.
// and this api is only ever used as an optimization or assert, so no reporting should
// ever happen.
- return YMM_REGSIZE_BYTES;
+ return ZMM_REGSIZE_BYTES;
}
#endif // defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
unsigned vectorRegSize = maxSIMDStructBytes();
@@ -9068,6 +9102,38 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
return opts.compSupportsISA.HasInstructionSet(isa);
}

+ #ifdef DEBUG
+ //------------------------------------------------------------------------
+ // IsBaselineVector512IsaSupportedDebugOnly - Does the target have isa support required for Vector512.
+ //
+ // Returns:
+ // `true` if AVX512F, AVX512BW and AVX512DQ are supported.
+ //
+ bool IsBaselineVector512IsaSupportedDebugOnly() const
+ {
+ #ifdef TARGET_AMD64
+ return (compIsaSupportedDebugOnly(InstructionSet_Vector512));
+ #else
+ return false;
+ #endif
+ }
+ #endif // DEBUG
+
+ //------------------------------------------------------------------------
+ // IsBaselineVector512IsaSupported - Does the target have isa support required for Vector512.
+ //
+ // Returns:
+ // `true` if AVX512F, AVX512BW and AVX512DQ are supported.
+ //
+ bool IsBaselineVector512IsaSupported() const
+ {
+ #ifdef TARGET_AMD64
+ return (compExactlyDependsOn(InstructionSet_Vector512));
+ #else
+ return false;
+ #endif
+ }
+
bool canUseVexEncoding() const
{
#ifdef TARGET_XARCH
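Summarizing the size plumbing across the compiler.h hunks (an illustrative reduction, not actual JIT code): the available ISA picks the widest SIMD register the JIT may use, while Vector<T> deliberately stays at 32 bytes for now, per the TODO-XArch-AVX512 notes above.

// Hedged sketch of the width selection implied by the hunks above.
unsigned maxSimdRegisterBytes(bool hasAvx, bool hasAvx512F)
{
    if (hasAvx)
    {
        return hasAvx512F ? 64  // zmm (ZMM_REGSIZE_BYTES)
                          : 32; // ymm (YMM_REGSIZE_BYTES)
    }
    return 16; // xmm (XMM_REGSIZE_BYTES)
}
// Note: Vector<T> still reports 32 bytes even when 64 is available; widening
// it is deferred until Vector<T> itself supports AVX-512.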
35 changes: 27 additions & 8 deletions src/coreclr/jit/emit.cpp
@@ -2614,15 +2614,11 @@ void emitter::emitSetFrameRangeArgs(int offsLo, int offsHi)
*/

const emitter::opSize emitter::emitSizeEncode[] = {
- emitter::OPSZ1, emitter::OPSZ2, OPSIZE_INVALID, emitter::OPSZ4, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID,
- emitter::OPSZ8, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID,
- OPSIZE_INVALID, emitter::OPSZ16, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID,
- OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID,
- OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, emitter::OPSZ32,
+ emitter::OPSZ1, emitter::OPSZ2, emitter::OPSZ4, emitter::OPSZ8, emitter::OPSZ16, emitter::OPSZ32, emitter::OPSZ64,
};

- const emitAttr emitter::emitSizeDecode[emitter::OPSZ_COUNT] = {EA_1BYTE, EA_2BYTE, EA_4BYTE,
- EA_8BYTE, EA_16BYTE, EA_32BYTE};
+ const emitAttr emitter::emitSizeDecode[emitter::OPSZ_COUNT] = {EA_1BYTE, EA_2BYTE, EA_4BYTE, EA_8BYTE,
+ EA_16BYTE, EA_32BYTE, EA_64BYTE};
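The tables above shrink from a sparse form indexed by byte size (mostly OPSIZE_INVALID filler) to dense seven-entry forms, which implies indexing by log2 of the size. A standalone sketch of that mapping, assuming the real emitEncodeSize/emitDecodeSize helpers differ in detail:

#include <cassert>

enum opSize { OPSZ1, OPSZ2, OPSZ4, OPSZ8, OPSZ16, OPSZ32, OPSZ64, OPSZ_COUNT };

// Dense-table index for a power-of-two operand size from 1 to 64 bytes.
opSize encodeSize(unsigned byteSize)
{
    assert((byteSize != 0) && ((byteSize & (byteSize - 1)) == 0) && (byteSize <= 64));
    unsigned index = 0;
    while ((1u << index) < byteSize)
    {
        index++;
    }
    return static_cast<opSize>(index); // 1 -> OPSZ1, ..., 64 -> OPSZ64
}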

/*****************************************************************************
*
@@ -6550,7 +6546,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,

coldCodeBlock = nullptr;

- // This restricts the data alignment to: 4, 8, 16, or 32 bytes
+ // This restricts the data alignment to: 4, 8, 16, 32 or 64 bytes
// Alignments greater than 32 would require VM support in ICorJitInfo::allocMem
uint32_t dataAlignment = emitConsDsc.alignment;
assert((dataSection::MIN_DATA_ALIGN <= dataAlignment) && (dataAlignment <= dataSection::MAX_DATA_ALIGN) &&
@@ -6631,6 +6627,10 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
{
allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN;
}
+ else if (dataAlignment == 64)
+ {
+ allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_64BYTE_ALIGN;
+ }
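Why the new 64-byte case matters (background, not from the diff): zmm-sized read-only constants request 64-byte-aligned data so that full-width aligned loads are legal and, with the typical 64-byte x86 cache line, never straddle a line. A trivial standalone check for illustration:

#include <cstdint>

// A 64-byte access straddles two 64-byte cache lines unless its address is
// 64-byte aligned.
bool splitsCacheLine(std::uintptr_t address)
{
    return (address % 64) != 0;
}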

CorJitAllocMemFlag allocMemFlag = static_cast<CorJitAllocMemFlag>(allocMemFlagCodeAlign | allocMemFlagDataAlign);

@@ -7965,6 +7965,25 @@ CORINFO_FIELD_HANDLE emitter::emitSimd32Const(simd32_t constValue)
UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD32);
return emitComp->eeFindJitDataOffs(cnum);
}

+ CORINFO_FIELD_HANDLE emitter::emitSimd64Const(simd64_t constValue)
+ {
+ // Access to inline data is 'abstracted' by a special type of static member
+ // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference
+ // to constant data, not a real static field.
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
+ unsigned cnsSize = 64;
+ unsigned cnsAlign = cnsSize;
+
+ if (emitComp->compCodeOpt() == Compiler::SMALL_CODE)
+ {
+ cnsAlign = dataSection::MIN_DATA_ALIGN;
+ }
+
+ UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD64);
+ return emitComp->eeFindJitDataOffs(cnum);
+ }
#endif // TARGET_XARCH
#endif // FEATURE_SIMD
