diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index de6e06fa883d0..e9eeb35a662d0 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -3222,6 +3222,7 @@ GenTree* Compiler::optVNConstantPropOnTree(BasicBlock* block, GenTree* tree) #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. { simd32_t value = vnStore->ConstantValue(vnCns); @@ -3231,6 +3232,7 @@ GenTree* Compiler::optVNConstantPropOnTree(BasicBlock* block, GenTree* tree) conValTree = vecCon; break; } + break; #endif // TARGET_XARCH #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 04352f6e1e30f..2b8d2da8518dd 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -946,7 +946,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genSSE2Intrinsic(GenTreeHWIntrinsic* node); void genSSE41Intrinsic(GenTreeHWIntrinsic* node); void genSSE42Intrinsic(GenTreeHWIntrinsic* node); - void genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node); + void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node); void genAESIntrinsic(GenTreeHWIntrinsic* node); void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node); void genFMAIntrinsic(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 204ce74108f1d..e837c87a5c001 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -504,7 +504,37 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre if (vecCon->IsZero()) { - if ((attr != EA_32BYTE) || compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)) + bool isSupported; + + switch (attr) + { + case EA_32BYTE: + { + isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX); + break; + } + + case EA_64BYTE: + { + isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F); + break; + } + + case EA_8BYTE: + case EA_16BYTE: + { + assert((attr == EA_8BYTE) || (attr == EA_16BYTE)); + isSupported = true; + break; + } + + default: + { + unreached(); + } + } + + if (isSupported) { #if defined(FEATURE_SIMD) emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); @@ -551,6 +581,18 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); break; } + + case TYP_SIMD64: + { + simd64_t constValue; + // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. + constValue.v256[0] = vecCon->gtSimd32Val; + constValue.v256[1] = vecCon->gtSimd32Val; + CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue); + + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + break; + } #endif // FEATURE_SIMD default: @@ -5730,7 +5772,8 @@ void CodeGen::genCall(GenTreeCall* call) // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens // when there's preceding 256-bit AVX to legacy SSE transition penalty. - if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && GetEmitter()->Contains256bitAVX()) + // This applies to 512bit AVX512 instructions as well. 
+ if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX())) { assert(compiler->canUseVexEncoding()); instGen(INS_vzeroupper); @@ -11026,7 +11069,7 @@ void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/) bool emitVzeroUpper = false; if (check256bitOnly) { - emitVzeroUpper = GetEmitter()->Contains256bitAVX(); + emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX(); } else { diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 491963c397546..1e454ef74668f 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2280,6 +2280,40 @@ void Compiler::compSetProcessor() { instructionSetFlags.AddInstructionSet(InstructionSet_Vector256); } + // x86-64-v4 feature level supports AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL and + // AVX512F/AVX512BW/AVX512CD/AVX512DQ/VX512VL have been shipped together historically. + // It is therefore unlikely that future CPUs only support "just one" and + // not worth the additional complexity in the JIT to support. + if (instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F) && + instructionSetFlags.HasInstructionSet(InstructionSet_AVX512BW) && + instructionSetFlags.HasInstructionSet(InstructionSet_AVX512DQ)) + { + if (!DoJitStressEvexEncoding()) + { + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F_VL); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW_VL); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ_VL); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD_VL); +#ifdef TARGET_AMD64 + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F_X64); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F_VL_X64); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW_X64); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW_VL_X64); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD_X64); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD_VL_X64); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ_X64); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ_VL_X64); +#endif // TARGET_AMD64 + } + else + { + instructionSetFlags.AddInstructionSet(InstructionSet_Vector512); + } + } #elif defined(TARGET_ARM64) if (instructionSetFlags.HasInstructionSet(InstructionSet_AdvSimd)) { @@ -2297,14 +2331,14 @@ void Compiler::compSetProcessor() if (canUseEvexEncoding()) { codeGen->GetEmitter()->SetUseEvexEncoding(true); - // TODO-XArch-AVX512: Revisit other flags to be set once avx512 instructions are added. + // TODO-XArch-AVX512 : Revisit other flags to be set once avx512 instructions are added. 
} if (canUseVexEncoding()) { codeGen->GetEmitter()->SetUseVEXEncoding(true); // Assume each JITted method does not contain AVX instruction at first codeGen->GetEmitter()->SetContainsAVX(false); - codeGen->GetEmitter()->SetContains256bitAVX(false); + codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false); } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index c991cc604b21b..81c553640ed8e 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7629,7 +7629,8 @@ class Compiler static bool varTypeNeedsPartialCalleeSave(var_types type) { assert(type != TYP_STRUCT); - return (type == TYP_SIMD32); + assert((type < TYP_SIMD32) || (type == TYP_SIMD32) || (type == TYP_SIMD64)); + return type >= TYP_SIMD32; } #elif defined(TARGET_ARM64) static bool varTypeNeedsPartialCalleeSave(var_types type) @@ -8328,6 +8329,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #if defined(TARGET_XARCH) if (compOpportunisticallyDependsOn(InstructionSet_AVX2)) { + if (compOpportunisticallyDependsOn(InstructionSet_Vector512)) + { + return SIMD_Vector512_Supported; + } + return SIMD_AVX2_Supported; } @@ -8443,12 +8449,26 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX CORINFO_CLASS_HANDLE Vector256ULongHandle; CORINFO_CLASS_HANDLE Vector256NIntHandle; CORINFO_CLASS_HANDLE Vector256NUIntHandle; + + CORINFO_CLASS_HANDLE Vector512FloatHandle; + CORINFO_CLASS_HANDLE Vector512DoubleHandle; + CORINFO_CLASS_HANDLE Vector512IntHandle; + CORINFO_CLASS_HANDLE Vector512UShortHandle; + CORINFO_CLASS_HANDLE Vector512UByteHandle; + CORINFO_CLASS_HANDLE Vector512ShortHandle; + CORINFO_CLASS_HANDLE Vector512ByteHandle; + CORINFO_CLASS_HANDLE Vector512LongHandle; + CORINFO_CLASS_HANDLE Vector512UIntHandle; + CORINFO_CLASS_HANDLE Vector512ULongHandle; + CORINFO_CLASS_HANDLE Vector512NIntHandle; + CORINFO_CLASS_HANDLE Vector512NUIntHandle; #endif // defined(TARGET_XARCH) #endif // FEATURE_HW_INTRINSICS CORINFO_CLASS_HANDLE CanonicalSimd8Handle; CORINFO_CLASS_HANDLE CanonicalSimd16Handle; CORINFO_CLASS_HANDLE CanonicalSimd32Handle; + CORINFO_CLASS_HANDLE CanonicalSimd64Handle; SIMDHandlesCache() { @@ -8515,6 +8535,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: break; #endif // TARGET_XARCH @@ -8622,7 +8643,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #if defined(TARGET_XARCH) case TYP_SIMD32: return m_simdHandleCache->CanonicalSimd32Handle; + case TYP_SIMD64: + return m_simdHandleCache->CanonicalSimd64Handle; #endif // TARGET_XARCH + default: unreached(); } @@ -8757,7 +8781,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX var_types getSIMDVectorType() { #if defined(TARGET_XARCH) - if (getSIMDSupportLevel() == SIMD_AVX2_Supported) + // TODO-XArch-AVX512 : Return TYP_SIMD64 once Vector supports AVX512. + if (getSIMDSupportLevel() >= SIMD_AVX2_Supported) { return TYP_SIMD32; } @@ -8798,7 +8823,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX unsigned getSIMDVectorRegisterByteLength() { #if defined(TARGET_XARCH) - if (getSIMDSupportLevel() == SIMD_AVX2_Supported) + // TODO-XArch-AVX512 : Return ZMM_REGSIZE_BYTES once Vector supports AVX512. 
+ if (getSIMDSupportLevel() >= SIMD_AVX2_Supported) { return YMM_REGSIZE_BYTES; } @@ -8829,6 +8855,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) if (compOpportunisticallyDependsOn(InstructionSet_AVX)) { + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + return ZMM_REGSIZE_BYTES; + } return YMM_REGSIZE_BYTES; } else @@ -8870,6 +8900,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX { simdType = TYP_SIMD32; } + else if (size == 64) + { + simdType = TYP_SIMD64; + } #endif // TARGET_XARCH else { @@ -8906,7 +8940,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // otherwise cause the highest level of instruction set support to be reported to crossgen2. // and this api is only ever used as an optimization or assert, so no reporting should // ever happen. - return YMM_REGSIZE_BYTES; + return ZMM_REGSIZE_BYTES; } #endif // defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) unsigned vectorRegSize = maxSIMDStructBytes(); @@ -9068,6 +9102,38 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX return opts.compSupportsISA.HasInstructionSet(isa); } +#ifdef DEBUG + //------------------------------------------------------------------------ + // IsBaselineVector512IsaSupportedDebugOnly - Does the target have isa support required for Vector512. + // + // Returns: + // `true` if AVX512F, AVX512BW and AVX512DQ are supported. + // + bool IsBaselineVector512IsaSupportedDebugOnly() const + { +#ifdef TARGET_AMD64 + return (compIsaSupportedDebugOnly(InstructionSet_Vector512)); +#else + return false; +#endif + } +#endif // DEBUG + + //------------------------------------------------------------------------ + // IsBaselineVector512IsaSupported - Does the target have isa support required for Vector512. + // + // Returns: + // `true` if AVX512F, AVX512BW and AVX512DQ are supported. 
+ // + bool IsBaselineVector512IsaSupported() const + { +#ifdef TARGET_AMD64 + return (compExactlyDependsOn(InstructionSet_Vector512)); +#else + return false; +#endif + } + bool canUseVexEncoding() const { #ifdef TARGET_XARCH diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index f831920c6b0cd..c28eebc6bbf4d 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -2614,15 +2614,11 @@ void emitter::emitSetFrameRangeArgs(int offsLo, int offsHi) */ const emitter::opSize emitter::emitSizeEncode[] = { - emitter::OPSZ1, emitter::OPSZ2, OPSIZE_INVALID, emitter::OPSZ4, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, - emitter::OPSZ8, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, - OPSIZE_INVALID, emitter::OPSZ16, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, - OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, - OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, emitter::OPSZ32, + emitter::OPSZ1, emitter::OPSZ2, emitter::OPSZ4, emitter::OPSZ8, emitter::OPSZ16, emitter::OPSZ32, emitter::OPSZ64, }; -const emitAttr emitter::emitSizeDecode[emitter::OPSZ_COUNT] = {EA_1BYTE, EA_2BYTE, EA_4BYTE, - EA_8BYTE, EA_16BYTE, EA_32BYTE}; +const emitAttr emitter::emitSizeDecode[emitter::OPSZ_COUNT] = {EA_1BYTE, EA_2BYTE, EA_4BYTE, EA_8BYTE, + EA_16BYTE, EA_32BYTE, EA_64BYTE}; /***************************************************************************** * @@ -6550,7 +6546,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, coldCodeBlock = nullptr; - // This restricts the data alignment to: 4, 8, 16, or 32 bytes + // This restricts the data alignment to: 4, 8, 16, 32 or 64 bytes // Alignments greater than 32 would require VM support in ICorJitInfo::allocMem uint32_t dataAlignment = emitConsDsc.alignment; assert((dataSection::MIN_DATA_ALIGN <= dataAlignment) && (dataAlignment <= dataSection::MAX_DATA_ALIGN) && @@ -6631,6 +6627,10 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN; } + else if (dataAlignment == 64) + { + allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_64BYTE_ALIGN; + } CorJitAllocMemFlag allocMemFlag = static_cast(allocMemFlagCodeAlign | allocMemFlagDataAlign); @@ -7965,6 +7965,25 @@ CORINFO_FIELD_HANDLE emitter::emitSimd32Const(simd32_t constValue) UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD32); return emitComp->eeFindJitDataOffs(cnum); } + +CORINFO_FIELD_HANDLE emitter::emitSimd64Const(simd64_t constValue) +{ + // Access to inline data is 'abstracted' by a special type of static member + // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference + // to constant data, not a real static field. 
+ CLANG_FORMAT_COMMENT_ANCHOR; + + unsigned cnsSize = 64; + unsigned cnsAlign = cnsSize; + + if (emitComp->compCodeOpt() == Compiler::SMALL_CODE) + { + cnsAlign = dataSection::MIN_DATA_ALIGN; + } + + UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD64); + return emitComp->eeFindJitDataOffs(cnum); +} #endif // TARGET_XARCH #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 3d01433f90e21..d517b4ac58f3d 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -517,7 +517,8 @@ class emitter OPSZ8 = 3, OPSZ16 = 4, OPSZ32 = 5, - OPSZ_COUNT = 6, + OPSZ64 = 6, + OPSZ_COUNT = 7, #ifdef TARGET_AMD64 OPSZP = OPSZ8, #else @@ -2067,6 +2068,7 @@ class emitter CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue); #if defined(TARGET_XARCH) CORINFO_FIELD_HANDLE emitSimd32Const(simd32_t constValue); + CORINFO_FIELD_HANDLE emitSimd64Const(simd64_t constValue); #endif // TARGET_XARCH #endif // FEATURE_SIMD regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src); @@ -2766,11 +2768,11 @@ class emitter struct dataSection { - // Note to use alignments greater than 32 requires modification in the VM + // Note to use alignments greater than 64 requires modification in the VM // to support larger alignments (see ICorJitInfo::allocMem) // const static unsigned MIN_DATA_ALIGN = 4; - const static unsigned MAX_DATA_ALIGN = 32; + const static unsigned MAX_DATA_ALIGN = 64; enum sectionType { @@ -3081,9 +3083,9 @@ inline emitAttr emitActualTypeSize(T type) /* static */ inline emitter::opSize emitter::emitEncodeSize(emitAttr size) { assert(size == EA_1BYTE || size == EA_2BYTE || size == EA_4BYTE || size == EA_8BYTE || size == EA_16BYTE || - size == EA_32BYTE); + size == EA_32BYTE || size == EA_64BYTE); - return emitSizeEncode[((int)size) - 1]; + return static_cast(genLog2(size)); } /* static */ inline emitAttr emitter::emitDecodeSize(emitter::opSize ensz) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index c357de5da79ae..313ee56b5524f 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1082,6 +1082,7 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const #define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL #define LBIT_IN_BYTE_EVEX_PREFIX 0x0000002000000000ULL +#define LPRIMEBIT_IN_BYTE_EVEX_PREFIX 0x0000004000000000ULL //------------------------------------------------------------------------ // AddEvexPrefix: Add default EVEX perfix with only LL' bits set. @@ -1107,12 +1108,16 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at code |= DEFAULT_BYTE_EVEX_PREFIX; - // TODO-XArch-AVX512: Add EA_64BYTE once ZMM is supported if (attr == EA_32BYTE) { // Set L bit to 1 in case of instructions that operate on 256-bits. code |= LBIT_IN_BYTE_EVEX_PREFIX; } + else if (attr == EA_64BYTE) + { + // Set L' bits to 11 in case of instructions that operate on 512-bits. + code |= LPRIMEBIT_IN_BYTE_EVEX_PREFIX; + } return code; } @@ -3840,8 +3845,8 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) // BT supports 16 bit operands and this code doesn't handle the necessary 66 prefix. 
assert(ins != INS_bt); - assert((attrSize == EA_4BYTE) || (attrSize == EA_PTRSIZE) // Only for x64 - || (attrSize == EA_16BYTE) || (attrSize == EA_32BYTE) // only for x64 + assert((attrSize == EA_4BYTE) || (attrSize == EA_PTRSIZE) // Only for x64 + || (attrSize == EA_16BYTE) || (attrSize == EA_32BYTE) || (attrSize == EA_64BYTE) // only for x64 || (ins == INS_movzx) || (ins == INS_movsx) // The prefetch instructions are always 3 bytes and have part of their modr/m byte hardcoded || isPrefetch(ins)); @@ -6186,7 +6191,7 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN emitAttr size = EA_SIZE(attr); - assert(size <= EA_32BYTE); + assert(size <= EA_64BYTE); noway_assert(emitVerifyEncodable(ins, size, dstReg, srcReg)); insFormat fmt = emitInsModeFormat(ins, IF_RRD_RRD); @@ -6229,7 +6234,7 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum emitAttr size = EA_SIZE(attr); - assert(size <= EA_32BYTE); + assert(size <= EA_64BYTE); noway_assert(emitVerifyEncodable(ins, size, reg1, reg2)); /* Special case: "XCHG" uses a different format */ @@ -7037,7 +7042,7 @@ void emitter::emitIns_R_C(instruction ins, emitAttr attr, regNumber reg, CORINFO emitAttr size = EA_SIZE(attr); - assert(size <= EA_32BYTE); + assert(size <= EA_64BYTE); noway_assert(emitVerifyEncodable(ins, size, reg)); UNATIVE_OFFSET sz; @@ -7776,7 +7781,7 @@ void emitter::emitIns_ARX_R( fmt = emitInsModeFormat(ins, IF_ARD_RRD); noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg)); - assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE)); + assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_64BYTE)); id->idReg1(reg); } @@ -9575,6 +9580,9 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) switch (EA_SIZE(attr)) { + case EA_64BYTE: + return emitZMMregName(reg); + case EA_32BYTE: return emitYMMregName(reg); @@ -9772,6 +9780,24 @@ const char* emitter::emitYMMregName(unsigned reg) return regNames[reg]; } +/***************************************************************************** + * + * Return a string that represents the given ZMM register. + */ + +const char* emitter::emitZMMregName(unsigned reg) +{ + static const char* const regNames[] = { +#define REGDEF(name, rnum, mask, sname) "z" sname, +#include "register.h" + }; + + assert(reg < REG_COUNT); + assert(reg < ArrLen(regNames)); + + return regNames[reg]; +} + /***************************************************************************** * * Display a static data member reference. 
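A brief aside on the new emitZMMregName added just above: like its XMM/YMM siblings, it builds its name table by redefining the REGDEF X-macro to glue a one-letter prefix onto each register's short name and then re-including register.h, so the same register list presumably yields the "xmm"/"ymm"/"zmm" spellings from a single source of truth. A minimal standalone C++ sketch of that pattern follows; the three REGDEF entries and the zmmNames/main identifiers are invented for illustration and are not the real register.h contents.

// Standalone sketch of the REGDEF X-macro pattern used by emitZMMregName.
// The register list below is made up; the JIT instead re-includes the real
// per-target list from "register.h" each time REGDEF is redefined.
#include <cstdio>

#define REGDEF(name, rnum, mask, sname) "z" sname,
static const char* const zmmNames[] = {
    REGDEF(XMM0, 0, 0x1, "mm0")
    REGDEF(XMM1, 1, 0x2, "mm1")
    REGDEF(XMM2, 2, 0x4, "mm2")
};
#undef REGDEF

int main()
{
    // "z" is pasted onto each short name via compile-time string literal
    // concatenation, so this prints: zmm0 zmm1 zmm2
    for (const char* name : zmmNames)
    {
        printf("%s ", name);
    }
    printf("\n");
    return 0;
}

Because the concatenation happens at compile time, the table costs nothing at run time and stays in sync with whatever registers register.h defines.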
@@ -10998,13 +11024,21 @@ void emitter::emitDispIns( } case IF_RWR_RRD_RRD_CNS: - assert(IsVexEncodedInstruction(ins)); + assert(IsVexOrEvexEncodedInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); printf("%s, ", emitRegName(id->idReg1(), attr)); printf("%s, ", emitRegName(id->idReg2(), attr)); switch (ins) { + case INS_vinsertf64x4: + case INS_vinsertf32x8: + case INS_vinserti64x4: + case INS_vinserti32x8: + { + attr = EA_32BYTE; + break; + } case INS_vinsertf128: case INS_vinserti128: { @@ -17720,6 +17754,10 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vextracti128: case INS_vinsertf128: case INS_vinserti128: + case INS_vinsertf64x4: + case INS_vinserti64x4: + case INS_vinsertf32x8: + case INS_vinserti32x8: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_3C; break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index b6759b9ae1a93..a081a162d3af6 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -285,6 +285,10 @@ bool IsWEvexOpcodeExtension(const instrDesc* id) case INS_vfnmsub231sd: case INS_unpcklpd: case INS_vpermilpdvar: + case INS_movdqu16: + case INS_movdqu64: + case INS_vinsertf64x4: + case INS_vinserti64x4: { return true; // W1 } @@ -398,6 +402,10 @@ bool IsWEvexOpcodeExtension(const instrDesc* id) case INS_vpdpbusds: case INS_vpdpwssds: case INS_vpermilpsvar: + case INS_movdqu8: + case INS_movdqu32: + case INS_vinsertf32x8: + case INS_vinserti32x8: { return false; // W0 } @@ -614,14 +622,14 @@ void SetContainsAVX(bool value) containsAVXInstruction = value; } -bool contains256bitAVXInstruction = false; -bool Contains256bitAVX() +bool contains256bitOrMoreAVXInstruction = false; +bool Contains256bitOrMoreAVX() { - return contains256bitAVXInstruction; + return contains256bitOrMoreAVXInstruction; } -void SetContains256bitAVX(bool value) +void SetContains256bitOrMoreAVX(bool value) { - contains256bitAVXInstruction = value; + contains256bitOrMoreAVXInstruction = value; } bool IsDstDstSrcAVXInstruction(instruction ins); @@ -661,6 +669,7 @@ void emitDispShift(instruction ins, int cnt = 0); const char* emitXMMregName(unsigned reg); const char* emitYMMregName(unsigned reg); +const char* emitZMMregName(unsigned reg); /************************************************************************/ /* Private members that deal with target-dependent instr. 
descriptors */ @@ -722,7 +731,7 @@ inline emitter::opSize emitEncodeScale(size_t scale) { assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); - return emitSizeEncode[scale - 1]; + return static_cast<emitter::opSize>(genLog2((unsigned int)scale)); } inline emitAttr emitDecodeScale(unsigned ensz) diff --git a/src/coreclr/jit/fgbasic.cpp index 581a5aa620bef..eac6859e1bec1 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -1193,6 +1193,7 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed case NI_POPCNT_PopCount: case NI_POPCNT_X64_PopCount: case NI_Vector256_Create: + case NI_Vector512_Create: case NI_Vector256_CreateScalar: case NI_Vector256_CreateScalarUnsafe: case NI_VectorT256_CreateBroadcast: diff --git a/src/coreclr/jit/gentree.cpp index bf8a6ef3b9268..5a84e860fbefc 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -3050,6 +3050,7 @@ unsigned Compiler::gtHashValue(GenTree* tree) { #if defined(FEATURE_SIMD) #if defined(TARGET_XARCH) + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. case TYP_SIMD32: { add = genTreeHashAdd(ulo32(add), vecCon->gtSimd32Val.u32[7]); @@ -7282,7 +7283,8 @@ GenTree* Compiler::gtNewAllBitsSetConNode(var_types type) case TYP_SIMD16: #if defined(TARGET_XARCH) case TYP_SIMD32: -#endif // TARGET_XARCH + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. +#endif // TARGET_XARCH { allBitsSet = gtNewVconNode(type); allBitsSet->AsVecCon()->gtSimd32Val.i64[0] = -1; @@ -7327,7 +7329,8 @@ GenTree* Compiler::gtNewZeroConNode(var_types type) case TYP_SIMD16: #if defined(TARGET_XARCH) case TYP_SIMD32: -#endif // TARGET_XARCH + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. +#endif // TARGET_XARCH { zero = gtNewVconNode(type); zero->AsVecCon()->gtSimd32Val = {}; @@ -7369,7 +7372,8 @@ GenTree* Compiler::gtNewOneConNode(var_types type, var_types simdBaseType /* = T case TYP_SIMD16: #if defined(TARGET_XARCH) case TYP_SIMD32: -#endif // TARGET_XARCH + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. +#endif // TARGET_XARCH { GenTreeVecCon* vecCon = gtNewVconNode(type); @@ -11586,6 +11590,7 @@ void Compiler::gtDispConst(GenTree* tree) #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
{ simd32_t simdVal = vecCon->gtSimd32Val; printf("<0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx>", simdVal.u64[0], simdVal.u64[1], @@ -17498,220 +17503,11 @@ bool GenTreeIntConCommon::AddrNeedsReloc(Compiler* comp) #endif // TARGET_X86 #if defined(FEATURE_HW_INTRINSICS) -//---------------------------------------------------------------------------------------------- -// IsHWIntrinsicCreateConstant: Determines if a HWIntrinsic node represents a vector constant -// -// Arguments: -// node - The node to check -// simd32Val - The vector constant being constructed -// -// Returns: -// true if node represents a constant; otherwise, false -bool GenTreeVecCon::IsHWIntrinsicCreateConstant(GenTreeHWIntrinsic* node, simd32_t& simd32Val) +unsigned GenTreeVecCon::ElementCount(unsigned simdSize, var_types simdBaseType) { - NamedIntrinsic intrinsic = node->GetHWIntrinsicId(); - var_types simdType = node->TypeGet(); - var_types simdBaseType = node->GetSimdBaseType(); - unsigned simdSize = node->GetSimdSize(); - - size_t argCnt = node->GetOperandCount(); - size_t cnsArgCnt = 0; - - switch (intrinsic) - { - case NI_Vector128_Create: - case NI_Vector128_CreateScalar: - case NI_Vector128_CreateScalarUnsafe: -#if defined(TARGET_XARCH) - case NI_Vector256_Create: - case NI_Vector256_CreateScalar: - case NI_Vector256_CreateScalarUnsafe: -#elif defined(TARGET_ARM64) - case NI_Vector64_Create: - case NI_Vector64_CreateScalar: - case NI_Vector64_CreateScalarUnsafe: -#endif - { - // Zero out the simd32Val - simd32Val = {}; - - // These intrinsics are meant to set the same value to every element. - if ((argCnt == 1) && HandleArgForHWIntrinsicCreate(node->Op(1), 0, simd32Val, simdBaseType)) - { -// CreateScalar leaves the upper bits as zero - -#if defined(TARGET_XARCH) - if ((intrinsic != NI_Vector128_CreateScalar) && (intrinsic != NI_Vector256_CreateScalar)) -#elif defined(TARGET_ARM64) - if ((intrinsic != NI_Vector64_CreateScalar) && (intrinsic != NI_Vector128_CreateScalar)) -#endif - { - // Now assign the rest of the arguments. 
- for (unsigned i = 1; i < simdSize / genTypeSize(simdBaseType); i++) - { - HandleArgForHWIntrinsicCreate(node->Op(1), i, simd32Val, simdBaseType); - } - } - - cnsArgCnt = 1; - } - else - { - for (unsigned i = 1; i <= argCnt; i++) - { - if (HandleArgForHWIntrinsicCreate(node->Op(i), i - 1, simd32Val, simdBaseType)) - { - cnsArgCnt++; - } - } - } - - assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(simdBaseType)))); - return argCnt == cnsArgCnt; - } - - default: - { - return false; - } - } + return simdSize / genTypeSize(simdBaseType); } - -//---------------------------------------------------------------------------------------------- -// HandleArgForHWIntrinsicCreate: Processes an argument for the GenTreeVecCon::IsHWIntrinsicCreateConstant method -// -// Arguments: -// arg - The argument to process -// argIdx - The index of the argument being processed -// simd32Val - The vector constant being constructed -// baseType - The base type of the vector constant -// -// Returns: -// true if arg was a constant; otherwise, false -bool GenTreeVecCon::HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, simd32_t& simd32Val, var_types baseType) -{ - switch (baseType) - { - case TYP_BYTE: - case TYP_UBYTE: - { - if (arg->IsCnsIntOrI()) - { - simd32Val.i8[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); - return true; - } - else - { - // We expect the constant to have been already zeroed - assert(simd32Val.i8[argIdx] == 0); - } - break; - } - - case TYP_SHORT: - case TYP_USHORT: - { - if (arg->IsCnsIntOrI()) - { - simd32Val.i16[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); - return true; - } - else - { - // We expect the constant to have been already zeroed - assert(simd32Val.i16[argIdx] == 0); - } - break; - } - - case TYP_INT: - case TYP_UINT: - { - if (arg->IsCnsIntOrI()) - { - simd32Val.i32[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); - return true; - } - else - { - // We expect the constant to have been already zeroed - assert(simd32Val.i32[argIdx] == 0); - } - break; - } - - case TYP_LONG: - case TYP_ULONG: - { -#if defined(TARGET_64BIT) - if (arg->IsCnsIntOrI()) - { - simd32Val.i64[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); - return true; - } -#else - if (arg->OperIsLong() && arg->AsOp()->gtOp1->IsCnsIntOrI() && arg->AsOp()->gtOp2->IsCnsIntOrI()) - { - // 32-bit targets will decompose GT_CNS_LNG into two GT_CNS_INT - // We need to reconstruct the 64-bit value in order to handle this - - INT64 gtLconVal = arg->AsOp()->gtOp2->AsIntCon()->gtIconVal; - gtLconVal <<= 32; - gtLconVal |= arg->AsOp()->gtOp1->AsIntCon()->gtIconVal; - - simd32Val.i64[argIdx] = gtLconVal; - return true; - } -#endif // TARGET_64BIT - else - { - // We expect the constant to have been already zeroed - assert(simd32Val.i64[argIdx] == 0); - } - break; - } - - case TYP_FLOAT: - { - if (arg->IsCnsFltOrDbl()) - { - simd32Val.f32[argIdx] = static_cast(arg->AsDblCon()->DconValue()); - return true; - } - else - { - // We expect the constant to have been already zeroed - // We check against the i32, rather than f32, to account for -0.0 - assert(simd32Val.i32[argIdx] == 0); - } - break; - } - - case TYP_DOUBLE: - { - if (arg->IsCnsFltOrDbl()) - { - simd32Val.f64[argIdx] = static_cast(arg->AsDblCon()->DconValue()); - return true; - } - else - { - // We expect the constant to have been already zeroed - // We check against the i64, rather than f64, to account for -0.0 - assert(simd32Val.i64[argIdx] == 0); - } - break; - } - - default: - { - unreached(); - } - } - - return false; -} -#endif // 
FEATURE_HW_INTRINSICS +#endif // FEATURE_HW_INTRINSICS*/ //------------------------------------------------------------------------ // IsFieldAddr: Is "this" a static or class field address? @@ -21276,7 +21072,11 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode( } #endif // TARGET_X86 - if (simdSize == 32) + if (simdSize == 64) + { + hwIntrinsicID = NI_Vector512_Create; + } + else if (simdSize == 32) { hwIntrinsicID = NI_Vector256_Create; } @@ -23925,6 +23725,8 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const case NI_SSE41_ConvertToVector128Int64: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX512F_BroadcastScalarToVector512: + case NI_AVX512BW_BroadcastScalarToVector512: case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index d44e6ad138b5c..9972b71d8b8c5 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -3350,157 +3350,6 @@ struct GenTreeStrCon : public GenTree #endif }; -// GenTreeVecCon -- vector constant (GT_CNS_VEC) -// -struct GenTreeVecCon : public GenTree -{ - union { - simd8_t gtSimd8Val; - simd12_t gtSimd12Val; - simd16_t gtSimd16Val; - simd32_t gtSimd32Val; - }; - -#if defined(FEATURE_HW_INTRINSICS) - static bool IsHWIntrinsicCreateConstant(GenTreeHWIntrinsic* node, simd32_t& simd32Val); - - static bool HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, simd32_t& simd32Val, var_types baseType); -#endif // FEATURE_HW_INTRINSICS - - bool IsAllBitsSet() const - { - switch (gtType) - { -#if defined(FEATURE_SIMD) - case TYP_SIMD8: - { - return (gtSimd8Val.u64[0] == 0xFFFFFFFFFFFFFFFF); - } - - case TYP_SIMD12: - { - return (gtSimd12Val.u32[0] == 0xFFFFFFFF) && (gtSimd12Val.u32[1] == 0xFFFFFFFF) && - (gtSimd12Val.u32[2] == 0xFFFFFFFF); - } - - case TYP_SIMD16: - { - return (gtSimd16Val.u64[0] == 0xFFFFFFFFFFFFFFFF) && (gtSimd16Val.u64[1] == 0xFFFFFFFFFFFFFFFF); - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - return (gtSimd32Val.u64[0] == 0xFFFFFFFFFFFFFFFF) && (gtSimd32Val.u64[1] == 0xFFFFFFFFFFFFFFFF) && - (gtSimd32Val.u64[2] == 0xFFFFFFFFFFFFFFFF) && (gtSimd32Val.u64[3] == 0xFFFFFFFFFFFFFFFF); - } -#endif // TARGET_XARCH -#endif // FEATURE_SIMD - - default: - { - unreached(); - } - } - } - - static bool Equals(const GenTreeVecCon* left, const GenTreeVecCon* right) - { - var_types gtType = left->TypeGet(); - - if (gtType != right->TypeGet()) - { - return false; - } - - switch (gtType) - { -#if defined(FEATURE_SIMD) - case TYP_SIMD8: - { - return (left->gtSimd8Val.u64[0] == right->gtSimd8Val.u64[0]); - } - - case TYP_SIMD12: - { - return (left->gtSimd12Val.u32[0] == right->gtSimd12Val.u32[0]) && - (left->gtSimd12Val.u32[1] == right->gtSimd12Val.u32[1]) && - (left->gtSimd12Val.u32[2] == right->gtSimd12Val.u32[2]); - } - - case TYP_SIMD16: - { - return (left->gtSimd16Val.u64[0] == right->gtSimd16Val.u64[0]) && - (left->gtSimd16Val.u64[1] == right->gtSimd16Val.u64[1]); - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - return (left->gtSimd32Val.u64[0] == right->gtSimd32Val.u64[0]) && - (left->gtSimd32Val.u64[1] == right->gtSimd32Val.u64[1]) && - (left->gtSimd32Val.u64[2] == right->gtSimd32Val.u64[2]) && - (left->gtSimd32Val.u64[3] == right->gtSimd32Val.u64[3]); - } -#endif // TARGET_XARCH -#endif // FEATURE_SIMD - - default: - { - unreached(); - } - } - } - - bool IsZero() const - { - switch (gtType) - { -#if defined(FEATURE_SIMD) - case 
TYP_SIMD8: - { - return (gtSimd8Val.u64[0] == 0x0000000000000000); - } - - case TYP_SIMD12: - { - return (gtSimd12Val.u32[0] == 0x00000000) && (gtSimd12Val.u32[1] == 0x00000000) && - (gtSimd12Val.u32[2] == 0x00000000); - } - - case TYP_SIMD16: - { - return (gtSimd16Val.u64[0] == 0x0000000000000000) && (gtSimd16Val.u64[1] == 0x0000000000000000); - } - -#if defined(TARGET_XARCH) - case TYP_SIMD32: - { - return (gtSimd32Val.u64[0] == 0x0000000000000000) && (gtSimd32Val.u64[1] == 0x0000000000000000) && - (gtSimd32Val.u64[2] == 0x0000000000000000) && (gtSimd32Val.u64[3] == 0x0000000000000000); - } -#endif // TARGET_XARCH -#endif // FEATURE_SIMD - - default: - { - unreached(); - } - } - } - - GenTreeVecCon(var_types type) : GenTree(GT_CNS_VEC, type) - { - assert(varTypeIsSIMD(type)); - } - -#if DEBUGGABLE_GENTREE - GenTreeVecCon() : GenTree() - { - } -#endif -}; - // Encapsulates the SSA info carried by local nodes. Most local nodes have simple 1-to-1 // relationships with their SSA refs. However, defs of promoted structs can represent // many SSA defs at the same time, and we need to efficiently encode that. @@ -6459,6 +6308,367 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic }; #endif // FEATURE_HW_INTRINSICS +// GenTreeVecCon -- vector constant (GT_CNS_VEC) +// +struct GenTreeVecCon : public GenTree +{ + union { + simd8_t gtSimd8Val; + simd12_t gtSimd12Val; + simd16_t gtSimd16Val; + simd32_t gtSimd32Val; + }; + +#if defined(FEATURE_HW_INTRINSICS) + static unsigned ElementCount(unsigned simdSize, var_types simdBaseType); + + template + static bool IsHWIntrinsicCreateConstant(GenTreeHWIntrinsic* node, simdTypename& simdVal) + { + NamedIntrinsic intrinsic = node->GetHWIntrinsicId(); + var_types simdType = node->TypeGet(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + size_t argCnt = node->GetOperandCount(); + size_t cnsArgCnt = 0; + + switch (intrinsic) + { + case NI_Vector128_Create: + case NI_Vector128_CreateScalar: + case NI_Vector128_CreateScalarUnsafe: +#if defined(TARGET_XARCH) + case NI_Vector256_Create: + case NI_Vector512_Create: + case NI_Vector256_CreateScalar: + case NI_Vector256_CreateScalarUnsafe: +#elif defined(TARGET_ARM64) + case NI_Vector64_Create: + case NI_Vector64_CreateScalar: + case NI_Vector64_CreateScalarUnsafe: +#endif + { + // Zero out the simdVal + simdVal = {}; + + // These intrinsics are meant to set the same value to every element. + if ((argCnt == 1) && HandleArgForHWIntrinsicCreate(node->Op(1), 0, simdVal, simdBaseType)) + { +// CreateScalar leaves the upper bits as zero + +#if defined(TARGET_XARCH) + if ((intrinsic != NI_Vector128_CreateScalar) && (intrinsic != NI_Vector256_CreateScalar)) +#elif defined(TARGET_ARM64) + if ((intrinsic != NI_Vector64_CreateScalar) && (intrinsic != NI_Vector128_CreateScalar)) +#endif + { + // Now assign the rest of the arguments. 
+ for (unsigned i = 1; i < ElementCount(simdSize, simdBaseType); i++) + { + HandleArgForHWIntrinsicCreate(node->Op(1), i, simdVal, simdBaseType); + } + } + + cnsArgCnt = 1; + } + else + { + for (unsigned i = 1; i <= argCnt; i++) + { + if (HandleArgForHWIntrinsicCreate(node->Op(i), i - 1, simdVal, simdBaseType)) + { + cnsArgCnt++; + } + } + } + + assert((argCnt == 1) || (argCnt == ElementCount(simdSize, simdBaseType))); + return argCnt == cnsArgCnt; + } + + default: + { + return false; + } + } + } + + //---------------------------------------------------------------------------------------------- + // HandleArgForHWIntrinsicCreate: Processes an argument for the GenTreeVecCon::IsHWIntrinsicCreateConstant method + // + // Arguments: + // arg - The argument to process + // argIdx - The index of the argument being processed + // simdVal - The vector constant being constructed + // baseType - The base type of the vector constant + // + // Returns: + // true if arg was a constant; otherwise, false + template + static bool HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, simdTypename& simdVal, var_types baseType) + { + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + if (arg->IsCnsIntOrI()) + { + simdVal.i8[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the constant to have been already zeroed + assert(simdVal.i8[argIdx] == 0); + } + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + if (arg->IsCnsIntOrI()) + { + simdVal.i16[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the constant to have been already zeroed + assert(simdVal.i16[argIdx] == 0); + } + break; + } + + case TYP_INT: + case TYP_UINT: + { + if (arg->IsCnsIntOrI()) + { + simdVal.i32[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the constant to have been already zeroed + assert(simdVal.i32[argIdx] == 0); + } + break; + } + + case TYP_LONG: + case TYP_ULONG: + { +#if defined(TARGET_64BIT) + if (arg->IsCnsIntOrI()) + { + simdVal.i64[argIdx] = static_cast(arg->AsIntCon()->gtIconVal); + return true; + } +#else + if (arg->OperIsLong() && arg->AsOp()->gtOp1->IsCnsIntOrI() && arg->AsOp()->gtOp2->IsCnsIntOrI()) + { + // 32-bit targets will decompose GT_CNS_LNG into two GT_CNS_INT + // We need to reconstruct the 64-bit value in order to handle this + + INT64 gtLconVal = arg->AsOp()->gtOp2->AsIntCon()->gtIconVal; + gtLconVal <<= 32; + gtLconVal |= arg->AsOp()->gtOp1->AsIntCon()->gtIconVal; + + simdVal.i64[argIdx] = gtLconVal; + return true; + } +#endif // TARGET_64BIT + else + { + // We expect the constant to have been already zeroed + assert(simdVal.i64[argIdx] == 0); + } + break; + } + + case TYP_FLOAT: + { + if (arg->IsCnsFltOrDbl()) + { + simdVal.f32[argIdx] = static_cast(arg->AsDblCon()->DconValue()); + return true; + } + else + { + // We expect the constant to have been already zeroed + // We check against the i32, rather than f32, to account for -0.0 + assert(simdVal.i32[argIdx] == 0); + } + break; + } + + case TYP_DOUBLE: + { + if (arg->IsCnsFltOrDbl()) + { + simdVal.f64[argIdx] = static_cast(arg->AsDblCon()->DconValue()); + return true; + } + else + { + // We expect the constant to have been already zeroed + // We check against the i64, rather than f64, to account for -0.0 + assert(simdVal.i64[argIdx] == 0); + } + break; + } + + default: + { + unreached(); + } + } + + return false; + } + +#endif // FEATURE_HW_INTRINSICS + + bool IsAllBitsSet() const + { + switch 
(gtType) + { +#if defined(FEATURE_SIMD) + case TYP_SIMD8: + { + return (gtSimd8Val.u64[0] == 0xFFFFFFFFFFFFFFFF); + } + + case TYP_SIMD12: + { + return (gtSimd12Val.u32[0] == 0xFFFFFFFF) && (gtSimd12Val.u32[1] == 0xFFFFFFFF) && + (gtSimd12Val.u32[2] == 0xFFFFFFFF); + } + + case TYP_SIMD16: + { + return (gtSimd16Val.u64[0] == 0xFFFFFFFFFFFFFFFF) && (gtSimd16Val.u64[1] == 0xFFFFFFFFFFFFFFFF); + } + +#if defined(TARGET_XARCH) + case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. + { + return (gtSimd32Val.u64[0] == 0xFFFFFFFFFFFFFFFF) && (gtSimd32Val.u64[1] == 0xFFFFFFFFFFFFFFFF) && + (gtSimd32Val.u64[2] == 0xFFFFFFFFFFFFFFFF) && (gtSimd32Val.u64[3] == 0xFFFFFFFFFFFFFFFF); + } +#endif // TARGET_XARCH +#endif // FEATURE_SIMD + + default: + { + unreached(); + } + } + } + + static bool Equals(const GenTreeVecCon* left, const GenTreeVecCon* right) + { + var_types gtType = left->TypeGet(); + + if (gtType != right->TypeGet()) + { + return false; + } + + switch (gtType) + { +#if defined(FEATURE_SIMD) + case TYP_SIMD8: + { + return (left->gtSimd8Val.u64[0] == right->gtSimd8Val.u64[0]); + } + + case TYP_SIMD12: + { + return (left->gtSimd12Val.u32[0] == right->gtSimd12Val.u32[0]) && + (left->gtSimd12Val.u32[1] == right->gtSimd12Val.u32[1]) && + (left->gtSimd12Val.u32[2] == right->gtSimd12Val.u32[2]); + } + + case TYP_SIMD16: + { + return (left->gtSimd16Val.u64[0] == right->gtSimd16Val.u64[0]) && + (left->gtSimd16Val.u64[1] == right->gtSimd16Val.u64[1]); + } + +#if defined(TARGET_XARCH) + case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. + { + return (left->gtSimd32Val.u64[0] == right->gtSimd32Val.u64[0]) && + (left->gtSimd32Val.u64[1] == right->gtSimd32Val.u64[1]) && + (left->gtSimd32Val.u64[2] == right->gtSimd32Val.u64[2]) && + (left->gtSimd32Val.u64[3] == right->gtSimd32Val.u64[3]); + } +#endif // TARGET_XARCH +#endif // FEATURE_SIMD + + default: + { + unreached(); + } + } + } + + bool IsZero() const + { + switch (gtType) + { +#if defined(FEATURE_SIMD) + case TYP_SIMD8: + { + return (gtSimd8Val.u64[0] == 0x0000000000000000); + } + + case TYP_SIMD12: + { + return (gtSimd12Val.u32[0] == 0x00000000) && (gtSimd12Val.u32[1] == 0x00000000) && + (gtSimd12Val.u32[2] == 0x00000000); + } + + case TYP_SIMD16: + { + return (gtSimd16Val.u64[0] == 0x0000000000000000) && (gtSimd16Val.u64[1] == 0x0000000000000000); + } + +#if defined(TARGET_XARCH) + case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. + { + return (gtSimd32Val.u64[0] == 0x0000000000000000) && (gtSimd32Val.u64[1] == 0x0000000000000000) && + (gtSimd32Val.u64[2] == 0x0000000000000000) && (gtSimd32Val.u64[3] == 0x0000000000000000); + } +#endif // TARGET_XARCH +#endif // FEATURE_SIMD + + default: + { + unreached(); + } + } + } + + GenTreeVecCon(var_types type) : GenTree(GT_CNS_VEC, type) + { + assert(varTypeIsSIMD(type)); + } + +#if DEBUGGABLE_GENTREE + GenTreeVecCon() : GenTree() + { + } +#endif +}; + // GenTreeIndexAddr: Given an array object and an index, checks that the index is within the bounds of the array if // necessary and produces the address of the value at that index of the array. 
// @@ -8772,6 +8982,7 @@ inline bool GenTree::IsVectorCreate() const case NI_Vector128_Create: #if defined(TARGET_XARCH) case NI_Vector256_Create: + case NI_Vector512_Create: #elif defined(TARGET_ARMARCH) case NI_Vector64_Create: #endif diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 9b8db1759571d..7e853358153e1 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -159,6 +159,38 @@ CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, Co assert(!"Didn't find a class handle for simdType"); } } + else if (simdType == TYP_SIMD64) + { + switch (simdBaseJitType) + { + case CORINFO_TYPE_FLOAT: + return m_simdHandleCache->Vector512FloatHandle; + case CORINFO_TYPE_DOUBLE: + return m_simdHandleCache->Vector512DoubleHandle; + case CORINFO_TYPE_INT: + return m_simdHandleCache->Vector512IntHandle; + case CORINFO_TYPE_USHORT: + return m_simdHandleCache->Vector512UShortHandle; + case CORINFO_TYPE_UBYTE: + return m_simdHandleCache->Vector512UByteHandle; + case CORINFO_TYPE_SHORT: + return m_simdHandleCache->Vector512ShortHandle; + case CORINFO_TYPE_BYTE: + return m_simdHandleCache->Vector512ByteHandle; + case CORINFO_TYPE_LONG: + return m_simdHandleCache->Vector512LongHandle; + case CORINFO_TYPE_UINT: + return m_simdHandleCache->Vector512UIntHandle; + case CORINFO_TYPE_ULONG: + return m_simdHandleCache->Vector512ULongHandle; + case CORINFO_TYPE_NATIVEINT: + return m_simdHandleCache->Vector512NIntHandle; + case CORINFO_TYPE_NATIVEUINT: + return m_simdHandleCache->Vector512NUIntHandle; + default: + assert(!"Didn't find a class handle for simdType"); + } + } #endif // TARGET_XARCH #ifdef TARGET_ARM64 else if (simdType == TYP_SIMD8) @@ -293,7 +325,8 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, return NI_Illegal; } - bool isIsaSupported = comp->compSupportsHWIntrinsic(isa); + bool isIsaSupported = comp->compSupportsHWIntrinsic(isa); + bool isBaselineVector512IsaUsedAndSupported = false; bool isHardwareAcceleratedProp = (strcmp(methodName, "get_IsHardwareAccelerated") == 0); #ifdef TARGET_XARCH @@ -311,6 +344,10 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, { isa = InstructionSet_AVX2; } + else if (strcmp(className, "Vector512") == 0) + { + isBaselineVector512IsaUsedAndSupported = comp->IsBaselineVector512IsaSupported(); + } } #endif @@ -344,7 +381,7 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, if (isIsaSupported) { - if (comp->compExactlyDependsOn(isa)) + if (comp->compExactlyDependsOn(isa) || isBaselineVector512IsaUsedAndSupported) { return NI_IsSupported_True; } @@ -392,6 +429,16 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, } } } + else if (isa == InstructionSet_Vector512) + { + // We support Vector512 intrinsics when AVX512F, AVX512BW, AVX512DQ are available. 
+ if (!comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + !comp->compOpportunisticallyDependsOn(InstructionSet_AVX512BW) && + !comp->compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) + { + return NI_Illegal; + } + } #elif defined(TARGET_ARM64) else if (isa == InstructionSet_Vector64) { @@ -1116,6 +1163,8 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, case NI_SSE41_ConvertToVector128Int64: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX512F_BroadcastScalarToVector512: + case NI_AVX512BW_BroadcastScalarToVector512: case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index cd5f6aa29ed8d..274821ef81b55 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -360,6 +360,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { case InstructionSet_Vector128: case InstructionSet_Vector256: + case InstructionSet_Vector512: genBaseIntrinsic(node); break; case InstructionSet_X86Base: @@ -384,7 +385,8 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; case InstructionSet_AVX: case InstructionSet_AVX2: - genAvxOrAvx2Intrinsic(node); + case InstructionSet_AVX512F: + genAvxFamilyIntrinsic(node); break; case InstructionSet_AES: genAESIntrinsic(node); @@ -474,6 +476,8 @@ void CodeGen::genHWIntrinsic_R_RM( else { if (varTypeIsIntegral(rmOp) && ((node->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector128) || + (node->GetHWIntrinsicId() == NI_AVX512F_BroadcastScalarToVector512) || + (node->GetHWIntrinsicId() == NI_AVX512BW_BroadcastScalarToVector512) || (node->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector256))) { // In lowering we had the special case of BroadcastScalarToVector(CreateScalarUnsafe(op1)) @@ -1117,6 +1121,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node) } case NI_Vector128_ToVector256Unsafe: + case NI_Vector256_ToVector512Unsafe: case NI_Vector256_GetLower: { if (op1->isContained() || op1->isUsedFromSpillTemp()) @@ -1519,12 +1524,12 @@ void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node) } //------------------------------------------------------------------------ -// genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node +// genAvxFamilyIntrinsic: Generates the code for an AVX/AVX2/AVX512 hardware intrinsic node // // Arguments: // node - The hardware intrinsic node // -void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) +void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) { NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); var_types baseType = node->GetSimdBaseType(); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 503c7fc80209f..894663a1575bc 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -225,11 +225,20 @@ HARDWARE_INTRINSIC(Vector256, StoreUnsafe, HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu8, INS_movdqu8, INS_movdqu16, INS_movdqu16, INS_movdqu32, INS_movdqu32, INS_movdqu64, INS_movdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WidenUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// ISA Function name SIMD size NumArg Instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// Vector512 Intrinsics +HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) + // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} @@ -713,6 +722,21 @@ HARDWARE_INTRINSIC(AVX2, SubtractSaturate, HARDWARE_INTRINSIC(AVX2, UnpackHigh, 32, 2, {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq, INS_punpckhqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, UnpackLow, 32, 2, {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq, INS_punpcklqdq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, Xor, 32, 2, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// ISA Function name SIMD size NumArg Instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// AVX512F Intrinsics +HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) + +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// ISA Function name SIMD size NumArg Instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// AVX512F Intrinsics +HARDWARE_INTRINSIC(AVX512BW, BroadcastScalarToVector512, 64, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 22303b4ad12e4..3b46d9ba9c7a0 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -148,6 +148,10 @@ static CORINFO_InstructionSet lookupInstructionSet(const char* className) { return InstructionSet_Vector256; } + else if (strncmp(className, "Vector512", 9) == 0) + { + return InstructionSet_Vector512; + } } else if (strcmp(className, "Fma") == 0) { @@ -506,12 +510,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, CORINFO_InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsic); - if ((isa == InstructionSet_Vector256) && !compExactlyDependsOn(InstructionSet_AVX)) - { - // We don't want to deal with TYP_SIMD32 if the compiler doesn't otherwise support the type. - return nullptr; - } - var_types simdBaseType = TYP_UNKNOWN; if (simdSize != 0) { @@ -891,6 +889,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_Create: case NI_Vector256_Create: + case NI_Vector512_Create: { if (sig->numArgs == 1) { @@ -943,7 +942,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } } - if (isConstant) + // TODO-XArch-AVX512 : Add this path for simd64 once vecCon supports simd64. + if (isConstant && (simdSize != 64)) { // Some of the below code assumes 16 or 32 byte SIMD types assert((simdSize == 16) || (simdSize == 32)); @@ -1276,7 +1276,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { simd32_t simd32Val = {}; - assert((simdSize == 16) || (simdSize == 32)); + assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64)); simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? 
CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; // We want to tightly pack the most significant byte of each short/ushort @@ -1406,6 +1406,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_get_Zero: case NI_Vector256_get_Zero: + case NI_Vector512_get_Zero: { assert(sig->numArgs == 0); retNode = gtNewZeroConNode(retType); @@ -2234,6 +2235,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_ToVector256: case NI_Vector128_ToVector256Unsafe: + case NI_Vector256_ToVector512Unsafe: case NI_Vector256_GetLower: { assert(sig->numArgs == 1); diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index da99ed05be826..c7759b57ca7d3 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -74,7 +74,7 @@ static bool ConvertToLowerCase(WCHAR* input, WCHAR* mask, int length) // // Arguments: // comp - Compiler object -// simdType - Vector type, either TYP_SIMD32 (xarch only) or TYP_SIMD16 +// simdType - Vector type, TYP_SIMD64 (xarch only), TYP_SIMD32 (xarch only) or TYP_SIMD16 // cns - Constant data // // Return Value: @@ -82,9 +82,11 @@ static bool ConvertToLowerCase(WCHAR* input, WCHAR* mask, int length) // static GenTreeVecCon* CreateConstVector(Compiler* comp, var_types simdType, WCHAR* cns) { -#if defined(TARGET_XARCH) - if (simdType == TYP_SIMD32) +#ifdef TARGET_XARCH + if (simdType >= TYP_SIMD32) { + assert((simdType == TYP_SIMD32) || (simdType == TYP_SIMD64)); + // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. simd32_t simd32Val = {}; GenTreeVecCon* vecCon = comp->gtNewVconNode(simdType); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 635907a6f77c8..5b21f54e3c8e6 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -157,36 +157,23 @@ const char* CodeGen::genSizeStr(emitAttr attr) static const char * const sizes[] = { - "", "byte ptr ", "word ptr ", - nullptr, "dword ptr ", - nullptr, - nullptr, - nullptr, "qword ptr ", - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, "xmmword ptr ", - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - "ymmword ptr" + "ymmword ptr", + "zmmword ptr" }; // clang-format on unsigned size = EA_SIZE(attr); - assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8 || size == 16 || size == 32); + assert(genMaxOneBit(size) && (size <= 64)); if (EA_ATTR(size) == attr) { - return sizes[size]; + return (size > 0) ? sizes[genLog2(size)] : ""; } else if (attr == EA_GCREF) { @@ -808,6 +795,12 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) simd32_t constValue = op->AsVecCon()->gtSimd32Val; return OperandDesc(emit->emitSimd32Const(constValue)); } + + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. 
+ { + simd32_t constValue = op->AsVecCon()->gtSimd32Val; + return OperandDesc(emit->emitSimd32Const(constValue)); + } #endif // TARGET_XARCH #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index bb5fc454ea439..228870ca0b96a 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -351,7 +351,8 @@ enum emitAttr : unsigned EA_8BYTE = 0x008, EA_16BYTE = 0x010, EA_32BYTE = 0x020, - EA_SIZE_MASK = 0x03F, + EA_64BYTE = 0x040, + EA_SIZE_MASK = 0x07F, #ifdef TARGET_64BIT EA_PTRSIZE = EA_8BYTE, @@ -359,14 +360,14 @@ enum emitAttr : unsigned EA_PTRSIZE = EA_4BYTE, #endif - EA_OFFSET_FLG = 0x040, + EA_OFFSET_FLG = 0x080, EA_OFFSET = EA_OFFSET_FLG | EA_PTRSIZE, /* size == 0 */ - EA_GCREF_FLG = 0x080, + EA_GCREF_FLG = 0x100, EA_GCREF = EA_GCREF_FLG | EA_PTRSIZE, /* size == -1 */ - EA_BYREF_FLG = 0x100, + EA_BYREF_FLG = 0x200, EA_BYREF = EA_BYREF_FLG | EA_PTRSIZE, /* size == -2 */ - EA_DSP_RELOC_FLG = 0x200, // Is the displacement of the instruction relocatable? - EA_CNS_RELOC_FLG = 0x400, // Is the immediate of the instruction relocatable? + EA_DSP_RELOC_FLG = 0x400, // Is the displacement of the instruction relocatable? + EA_CNS_RELOC_FLG = 0x800, // Is the immediate of the instruction relocatable? }; #define EA_ATTR(x) ((emitAttr)(x)) diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 13ed02d75c6ea..a9825a20c30c3 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -631,6 +631,23 @@ INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(FIRST_AVX512F_INSTRUCTION, "FIRST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values +INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values +INST3(LAST_AVX512F_INSTRUCTION, "LAST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) + +INST3(FIRST_AVX512BW_INSTRUCTION, "FIRST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(LAST_AVX512BW_INSTRUCTION, "LAST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) + +INST3(FIRST_AVX512DQ_INSTRUCTION, "FIRST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed single-precision floating point values +INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed doubleword integer values
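// Editor's note (not part of the patch): FIRST_*/LAST_* pseudo-entries like the ones added above
// carry no encoding; they bracket a group so emitter code can classify an instruction with a
// simple range check on the enum value, as is done for similar marker pairs elsewhere. A minimal
// standalone sketch of that pattern, using hypothetical names rather than the JIT's real enum:

#include <cassert>

enum instruction_sketch
{
    INS_someSse,                  // instructions outside the AVX-512 range
    INS_FIRST_AVX512_INSTRUCTION, // marker: no encoding, never emitted
    INS_movdqu32_sketch,
    INS_vinsertf64x4_sketch,
    INS_vinserti32x8_sketch,
    INS_LAST_AVX512_INSTRUCTION,  // marker: closes the range
    INS_someOther,
};

// True when `ins` sits strictly between the two markers, i.e. it is one of the AVX-512 forms.
static bool IsAvx512InstructionSketch(instruction_sketch ins)
{
    return (ins > INS_FIRST_AVX512_INSTRUCTION) && (ins < INS_LAST_AVX512_INSTRUCTION);
}

int main()
{
    assert(IsAvx512InstructionSketch(INS_vinserti32x8_sketch));
    assert(!IsAvx512InstructionSketch(INS_someSse));
    return 0;
}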
+INST3(LAST_AVX512DQ_INSTRUCTION, "LAST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) + INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Scalar instructions in SSE4.2 diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 5e3fd02434ae5..054734114cf77 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -3794,6 +3794,7 @@ void Compiler::lvaSortByRefCount() case TYP_SIMD16: #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: #endif // TARGET_XARCH #endif // FEATURE_SIMD case TYP_STRUCT: diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 80c459ffc5e98..72f223f758be0 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1443,7 +1443,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) assert(varTypeIsArithmetic(simdBaseType)); assert(simdSize != 0); - bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val); + bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val); bool isCreateScalar = (intrinsicId == NI_Vector64_CreateScalar) || (intrinsicId == NI_Vector128_CreateScalar); size_t argCnt = node->GetOperandCount(); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index a3cd206ec471c..761e9f09b0b26 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1035,6 +1035,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Vector128_Create: case NI_Vector256_Create: + case NI_Vector512_Create: case NI_Vector128_CreateScalar: case NI_Vector256_CreateScalar: { @@ -1899,7 +1900,7 @@ void Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) } //---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicCreate: Lowers a Vector128 or Vector256 Create call +// Lowering::LowerHWIntrinsicCreate: Lowers a Vector128 or Vector256 or Vector512 Create call // // Arguments: // node - The hardware intrinsic node. @@ -1911,7 +1912,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); var_types simdBaseType = node->GetSimdBaseType(); unsigned simdSize = node->GetSimdSize(); - simd32_t simd32Val = {}; + simd64_t simd64Val = {}; if ((simdSize == 8) && (simdType == TYP_DOUBLE)) { @@ -1933,13 +1934,22 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) GenTree* tmp2 = nullptr; GenTree* tmp3 = nullptr; - bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val); + bool isConstant = false; + // TODO-XArch-AVX512: Keep only one path once GenTreeVecCon supports gtSimd64Val. 
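// Editor's note (not part of the patch): until GenTreeVecCon grows a gtSimd64Val field, the
// constant path below can only consume a 64-byte value through its two 32-byte halves
// (v256[0] and v256[1]): the low half becomes a Vector256 constant and the high half is
// inserted at index 1. A standalone sketch of that halving, with local stand-in types
// rather than the JIT's simd32_t/simd64_t:

#include <cstdint>
#include <cstring>
#include <cstdio>

struct Simd32Sketch { uint64_t u64[4]; };            // 32 bytes
struct Simd64Sketch { Simd32Sketch v256[2]; };        // 64 bytes as two 32-byte halves

// Build a 64-byte constant from eight 64-bit lanes; lane 0 is the least significant element.
static Simd64Sketch MakeConstant(const uint64_t (&lanes)[8])
{
    Simd64Sketch value;
    std::memcpy(&value.v256[0], &lanes[0], sizeof(Simd32Sketch)); // feeds the low Vector256 constant
    std::memcpy(&value.v256[1], &lanes[4], sizeof(Simd32Sketch)); // inserted as the high half (index 1)
    return value;
}

int main()
{
    uint64_t lanes[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    Simd64Sketch value = MakeConstant(lanes);
    printf("low half starts at %llu, high half starts at %llu\n",
           (unsigned long long)value.v256[0].u64[0],
           (unsigned long long)value.v256[1].u64[0]);
    return 0;
}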
+ if (simdSize != 64) + { + isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd64Val.v256[0]); + } + else + { + isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd64Val); + } bool isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar); size_t argCnt = node->GetOperandCount(); if (isConstant) { - assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32)); + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32) || (simdSize == 64)); for (GenTree* arg : node->Operands()) { @@ -1953,24 +1963,69 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) BlockRange().Remove(arg); } - GenTreeVecCon* vecCon = comp->gtNewVconNode(simdType); + if (simdSize != 64) + { + GenTreeVecCon* vecCon = comp->gtNewVconNode(simdType); - vecCon->gtSimd32Val = simd32Val; - BlockRange().InsertBefore(node, vecCon); + vecCon->gtSimd32Val = simd64Val.v256[0]; + BlockRange().InsertBefore(node, vecCon); - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) - { - use.ReplaceWith(vecCon); + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + use.ReplaceWith(vecCon); + } + else + { + vecCon->SetUnusedValue(); + } + + BlockRange().Remove(node); + + return LowerNode(vecCon); } else { - vecCon->SetUnusedValue(); - } + assert(comp->IsBaselineVector512IsaSupportedDebugOnly()); - BlockRange().Remove(node); + // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. + // We will be constructing the following parts: + // /--* op1 T + // +--* ... T + // lo = * HWINTRINSIC simd32 T Create + // /--* ... T + // +--* opN T + // hi = * HWINTRINSIC simd32 T Create + // +--* lo simd32 + // tmp1 = * HWINTRINSIC simd64 T ToVector512Unsafe + // idx = CNS_INT int 1 + // /--* tmp1 simd64 + // +--* hi simd32 + // +--* idx int + // node = * HWINTRINSIC simd64 T InsertVector256 + + GenTreeVecCon* vecCon0 = comp->gtNewVconNode(TYP_SIMD32); + vecCon0->gtSimd32Val = simd64Val.v256[0]; + BlockRange().InsertBefore(node, vecCon0); + LowerNode(vecCon0); + GenTreeVecCon* vecCon1 = comp->gtNewVconNode(TYP_SIMD32); + vecCon1->gtSimd32Val = simd64Val.v256[1]; + BlockRange().InsertAfter(vecCon0, vecCon1); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, vecCon0, NI_Vector256_ToVector512Unsafe, simdBaseJitType, + 32); + BlockRange().InsertAfter(vecCon1, tmp1); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(tmp1, idx); - return LowerNode(vecCon); + node->ResetHWIntrinsicId(NI_AVX512F_InsertVector256, comp, tmp1, vecCon1, idx); + + LowerNode(vecCon1); + LowerNode(idx); + LowerNode(tmp1); + return LowerNode(node); + } } else if (argCnt == 1) { @@ -2124,6 +2179,55 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) return LowerNode(node); } + // We have the following (where simd is simd16, simd32 or simd64): + // /--* op1 T + // node = * HWINTRINSIC simd T Create + + if (intrinsicId == NI_Vector512_Create) + { + assert(comp->IsBaselineVector512IsaSupportedDebugOnly()); + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd32 T CreateScalarUnsafe + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd64 T BroadcastScalarToVector512 + + // This is roughly the following managed code: + // var tmp1 = Vector256.CreateScalarUnsafe(op1); + // return Avx512.BroadcastScalarToVector512(tmp1); + + tmp1 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16); + LowerNode(tmp1); + switch 
(simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512BW)); + node->ResetHWIntrinsicId(NI_AVX512BW_BroadcastScalarToVector512, tmp1); + break; + } + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: + case TYP_FLOAT: + case TYP_DOUBLE: + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); + break; + } + default: + { + unreached(); + } + } + return LowerNode(node); + } + // We have the following (where simd is simd16 or simd32): // /--* op1 T // node = * HWINTRINSIC simd T Create @@ -2501,13 +2605,71 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) GenTree* op2 = node->Op(2); + // TODO-XArch-AVX512 : Merge the NI_Vector512_Create and NI_Vector256_Create paths below. // We have the following (where simd is simd16 or simd32): // /--* op1 T // +--* ... T // +--* opN T // node = * HWINTRINSIC simd T Create + if (intrinsicId == NI_Vector512_Create) + { + assert(comp->IsBaselineVector512IsaSupportedDebugOnly()); - if (intrinsicId == NI_Vector256_Create) + // We will be constructing the following parts: + // /--* op1 T + // +--* ... T + // lo = * HWINTRINSIC simd32 T Create + // /--* ... T + // +--* opN T + // hi = * HWINTRINSIC simd32 T Create + // idx = CNS_INT int 1 + // /--* lo simd64 + // +--* hi simd32 + // +--* idx int + // node = * HWINTRINSIC simd64 T InsertVector256 + + // This is roughly the following managed code: + // ... + // var lo = Vector256.Create(op1, ...); + // var hi = Vector256.Create(..., opN); + // return Avx512F.InsertVector512(lo, hi, 0x01); + + // Each Vector256.Create call gets half the operands. That is: + // lo = Vector256.Create(op1, op2); + // hi = Vector256.Create(op3, op4); + // -or- + // lo = Vector256.Create(op1, ..., op4); + // hi = Vector256.Create(op5, ..., op8); + // -or- + // lo = Vector256.Create(op1, ..., op8); + // hi = Vector256.Create(op9, ..., op16); + // -or- + // lo = Vector256.Create(op1, ..., op16); + // hi = Vector256.Create(op17, ..., op32); + + size_t halfArgCnt = argCnt / 2; + assert((halfArgCnt * 2) == argCnt); + + GenTree* lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node->GetOperandArray(), halfArgCnt, + NI_Vector256_Create, simdBaseJitType, 32); + BlockRange().InsertAfter(node->Op(halfArgCnt), lo); + + GenTree* hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node->GetOperandArray(halfArgCnt), halfArgCnt, + NI_Vector256_Create, simdBaseJitType, 32); + BlockRange().InsertAfter(node->Op(argCnt), hi); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(hi, idx); + + assert(argCnt >= 7); + node->ResetHWIntrinsicId(NI_AVX512F_InsertVector256, comp, lo, hi, idx); + + LowerNode(lo); + LowerNode(hi); + + return LowerNode(node); + } + else if (intrinsicId == NI_Vector256_Create) { assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); @@ -6594,6 +6756,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX512F_BroadcastScalarToVector512: + case NI_AVX512BW_BroadcastScalarToVector512: { if (!parentNode->OperIsMemoryLoad()) { @@ -6930,6 +7094,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX512F_BroadcastScalarToVector512: + case 
NI_AVX512BW_BroadcastScalarToVector512: { if (node->OperIsMemoryLoad()) { @@ -7410,6 +7576,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX2_InsertVector128: case NI_AVX2_MultipleSumAbsoluteDifferences: case NI_AVX2_Permute2x128: + case NI_AVX512F_InsertVector256: case NI_PCLMULQDQ_CarrylessMultiply: { if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional)) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 0be86b638e8d1..c355d2af16dbe 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -733,16 +733,11 @@ LinearScan::LinearScan(Compiler* theCompiler) availableRegs[i] = &availableDoubleRegs; } #ifdef FEATURE_SIMD -#if defined(TARGET_XARCH) - else if ((thisType >= TYP_SIMD8) && (thisType <= TYP_SIMD32)) -#else - else if ((thisType >= TYP_SIMD8) && (thisType <= TYP_SIMD16)) -#endif // TARGET_XARCH - + else if (varTypeIsSIMD(thisType)) { availableRegs[i] = &availableDoubleRegs; } -#endif +#endif // FEATURE_SIMD else { availableRegs[i] = &availableIntRegs; @@ -1602,6 +1597,7 @@ bool LinearScan::isRegCandidate(LclVarDsc* varDsc) case TYP_SIMD16: #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: #endif // TARGET_XARCH { return !varDsc->lvPromoted; @@ -5085,6 +5081,13 @@ void LinearScan::allocateRegisters() { allocate = false; } +#if defined(TARGET_XARCH) + else if (lclVarInterval->registerType == TYP_SIMD64) + { + allocate = false; + lclVarInterval->isPartiallySpilled = true; + } +#endif // TARGET_XARCH else { lclVarInterval->isPartiallySpilled = true; diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index b600b73a0ab77..9a8b8539aa278 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2166,6 +2166,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou case NI_Vector128_AsVector3: case NI_Vector128_ToVector256: case NI_Vector128_ToVector256Unsafe: + case NI_Vector256_ToVector512Unsafe: case NI_Vector256_GetLower: { assert(numArgs == 1); @@ -2859,22 +2860,32 @@ int LinearScan::BuildMul(GenTree* tree) } //------------------------------------------------------------------------------ -// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set -// Contains256bitAVX flag when SIMD vector size is 32 bytes +// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, +// set SetContains256bitOrMoreAVX flag when SIMD vector size is 32 or 64 bytes. 
// // Arguments: -// isFloatingPointType - true if it is floating point type // sizeOfSIMDVector - SIMD Vector size // void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/) { - if (compiler->canUseVexEncoding()) + if (!compiler->canUseVexEncoding()) { - compiler->compExactlyDependsOn(InstructionSet_AVX); - compiler->GetEmitter()->SetContainsAVX(true); - if (sizeOfSIMDVector == 32) + return; + } + + compiler->compExactlyDependsOn(InstructionSet_AVX); + compiler->GetEmitter()->SetContainsAVX(true); + if (sizeOfSIMDVector == 32) + { + compiler->GetEmitter()->SetContains256bitOrMoreAVX(true); + return; + } + + if (compiler->canUseEvexEncoding()) + { + if (compiler->compExactlyDependsOn(InstructionSet_AVX512F) && (sizeOfSIMDVector == 64)) { - compiler->GetEmitter()->SetContains256bitAVX(true); + compiler->GetEmitter()->SetContains256bitOrMoreAVX(true); } } } diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 71f709db3dcc9..3de6ce1d41191 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -10779,9 +10779,17 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) return node; } +#if defined(TARGET_XARCH) + // TODO-XArch-AVX512: Enable for simd64 once GenTreeVecCon supports gtSimd64Val. + if (node->TypeGet() == TYP_SIMD64) + { + return node; + } +#endif // TARGET_XARCH + simd32_t simd32Val = {}; - if (GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val)) + if (GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val)) { GenTreeVecCon* vecCon = gtNewVconNode(node->TypeGet()); diff --git a/src/coreclr/jit/morphblock.cpp b/src/coreclr/jit/morphblock.cpp index deb4322d5997e..272c31edca076 100644 --- a/src/coreclr/jit/morphblock.cpp +++ b/src/coreclr/jit/morphblock.cpp @@ -598,6 +598,7 @@ void MorphInitBlockHelper::TryInitFieldByField() case TYP_SIMD16: #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: #endif // TARGET_XARCH #endif // FEATURE_SIMD { diff --git a/src/coreclr/jit/optcse.cpp b/src/coreclr/jit/optcse.cpp index ab65df387e343..6a810d0b9d357 100644 --- a/src/coreclr/jit/optcse.cpp +++ b/src/coreclr/jit/optcse.cpp @@ -2655,10 +2655,10 @@ class CSE_Heuristic // int spillSimdRegInProlog = 1; -// If we have a SIMD32 that is live across a call we have even higher spill costs -// #if defined(TARGET_XARCH) - if (candidate->Expr()->TypeIs(TYP_SIMD32)) + // If we have a SIMD32 that is live across a call we have even higher spill costs + // + if (candidate->Expr()->TypeIs(TYP_SIMD32, TYP_SIMD64)) { // Additionally for a simd32 CSE candidate we assume that and second spilled/restore will be needed. 
// (to hold the upper half of the simd32 register that isn't preserved across the call) diff --git a/src/coreclr/jit/regset.h b/src/coreclr/jit/regset.h index 9c1a1041eecf8..ef93565c43e95 100644 --- a/src/coreclr/jit/regset.h +++ b/src/coreclr/jit/regset.h @@ -222,7 +222,7 @@ class RegSet { #if defined(FEATURE_SIMD) #if defined(TARGET_XARCH) - TEMP_MAX_SIZE = YMM_REGSIZE_BYTES, + TEMP_MAX_SIZE = ZMM_REGSIZE_BYTES, #elif defined(TARGET_ARM64) TEMP_MAX_SIZE = FP_REGSIZE_BYTES, #endif // defined(TARGET_XARCH) || defined(TARGET_ARM64) diff --git a/src/coreclr/jit/scopeinfo.cpp b/src/coreclr/jit/scopeinfo.cpp index de3895dedc9d5..dd7eecb5a4d0c 100644 --- a/src/coreclr/jit/scopeinfo.cpp +++ b/src/coreclr/jit/scopeinfo.cpp @@ -291,6 +291,7 @@ void CodeGenInterface::siVarLoc::siFillStackVarLoc( case TYP_SIMD16: #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: #endif // TARGET_XARCH #endif // FEATURE_SIMD #ifdef TARGET_64BIT @@ -427,6 +428,7 @@ void CodeGenInterface::siVarLoc::siFillRegisterVarLoc( case TYP_SIMD16: #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: #endif // TARGET_XARCH { this->vlType = VLT_REG_FP; diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index 323300a90426b..599d3c66a7f93 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -83,11 +83,15 @@ int Compiler::getSIMDTypeAlignment(var_types simdType) assert((size == 12) || (size == 16)); return 16; } - else + else if (size == 32) { - assert(size == 32); return 32; } + else + { + assert(size == 64); + return 64; + } #elif defined(TARGET_ARM64) // preferred alignment for 64-bit vectors is 8-bytes. // For everything else, 16-bytes. @@ -422,12 +426,87 @@ CorInfoType Compiler::getBaseJitTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeH const size_t Vector64SizeBytes = 64 / 8; const size_t Vector128SizeBytes = 128 / 8; const size_t Vector256SizeBytes = 256 / 8; + const size_t Vector512SizeBytes = 512 / 8; #if defined(TARGET_XARCH) + static_assert_no_msg(ZMM_REGSIZE_BYTES == Vector512SizeBytes); static_assert_no_msg(YMM_REGSIZE_BYTES == Vector256SizeBytes); static_assert_no_msg(XMM_REGSIZE_BYTES == Vector128SizeBytes); - if (typeHnd == m_simdHandleCache->Vector256FloatHandle) + if (typeHnd == m_simdHandleCache->Vector512FloatHandle) + { + simdBaseJitType = CORINFO_TYPE_FLOAT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512DoubleHandle) + { + simdBaseJitType = CORINFO_TYPE_DOUBLE; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512IntHandle) + { + simdBaseJitType = CORINFO_TYPE_INT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512UIntHandle) + { + simdBaseJitType = CORINFO_TYPE_UINT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512ShortHandle) + { + simdBaseJitType = CORINFO_TYPE_SHORT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512UShortHandle) + { + simdBaseJitType = CORINFO_TYPE_USHORT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512ByteHandle) + { + simdBaseJitType = CORINFO_TYPE_BYTE; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512UByteHandle) + { + simdBaseJitType = 
CORINFO_TYPE_UBYTE; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512LongHandle) + { + simdBaseJitType = CORINFO_TYPE_LONG; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512ULongHandle) + { + simdBaseJitType = CORINFO_TYPE_ULONG; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512NIntHandle) + { + simdBaseJitType = CORINFO_TYPE_NATIVEINT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512NUIntHandle) + { + simdBaseJitType = CORINFO_TYPE_NATIVEUINT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + + else if (typeHnd == m_simdHandleCache->Vector256FloatHandle) { simdBaseJitType = CORINFO_TYPE_FLOAT; size = Vector256SizeBytes; @@ -663,7 +742,77 @@ CorInfoType Compiler::getBaseJitTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeH getClassNameFromMetadata(baseTypeHnd, nullptr)); #if defined(TARGET_XARCH) - if (strcmp(className, "Vector256`1") == 0) + if (strcmp(className, "Vector512`1") == 0) + { + size = Vector512SizeBytes; + switch (type) + { + case CORINFO_TYPE_FLOAT: + m_simdHandleCache->Vector512FloatHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_FLOAT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_DOUBLE: + m_simdHandleCache->Vector512DoubleHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_DOUBLE; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_INT: + m_simdHandleCache->Vector512IntHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_INT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_UINT: + m_simdHandleCache->Vector512UIntHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_UINT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_SHORT: + m_simdHandleCache->Vector512ShortHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_SHORT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_USHORT: + m_simdHandleCache->Vector512UShortHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_USHORT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_LONG: + m_simdHandleCache->Vector512LongHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_LONG; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_ULONG: + m_simdHandleCache->Vector512ULongHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_ULONG; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_UBYTE: + m_simdHandleCache->Vector512UByteHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_UBYTE; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_BYTE: + m_simdHandleCache->Vector512ByteHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_BYTE; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_NATIVEINT: + m_simdHandleCache->Vector512NIntHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_NATIVEINT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_NATIVEUINT: + m_simdHandleCache->Vector512NUIntHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_NATIVEUINT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + 
break; + + default: + JITDUMP(" Unknown Hardware Intrinsic SIMD Type Vector512\n"); + } + } + else if (strcmp(className, "Vector256`1") == 0) { size = Vector256SizeBytes; switch (type) @@ -888,6 +1037,11 @@ CorInfoType Compiler::getBaseJitTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeH { simdBaseJitType = CORINFO_TYPE_UNDEF; } + if (size == ZMM_REGSIZE_BYTES && (simdBaseJitType != CORINFO_TYPE_UNDEF) && + !compExactlyDependsOn(InstructionSet_AVX512F)) + { + simdBaseJitType = CORINFO_TYPE_UNDEF; + } #endif // TARGET_XARCH } #endif // FEATURE_HW_INTRINSICS @@ -916,6 +1070,9 @@ CorInfoType Compiler::getBaseJitTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeH case 32: pCanonicalHnd = &m_simdHandleCache->CanonicalSimd32Handle; break; + case 64: + pCanonicalHnd = &m_simdHandleCache->CanonicalSimd64Handle; + break; default: unreached(); } diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index c1a0dd629ed5f..127f223e378a4 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -32,7 +32,11 @@ enum SIMDLevel // AVX2 - Hardware has AVX and AVX2 instruction set. // Vector length is 256-bit and SIMD instructions are VEX-256 encoded. // Floating-point instructions are VEX-128 encoded. - SIMD_AVX2_Supported = 3 + SIMD_AVX2_Supported = 3, + + // Vector512 - Hardware has AVX, AVX2 and AVX512F instruction set. + // Floating-point instructions are EVEX encoded. + SIMD_Vector512_Supported = 4 #endif }; @@ -149,6 +153,35 @@ struct simd32_t } }; +struct simd64_t +{ + union { + float f32[16]; + double f64[8]; + int8_t i8[64]; + int16_t i16[32]; + int32_t i32[16]; + int64_t i64[8]; + uint8_t u8[64]; + uint16_t u16[32]; + uint32_t u32[16]; + uint64_t u64[8]; + simd8_t v64[8]; + simd16_t v128[4]; + simd32_t v256[2]; + }; + + bool operator==(const simd64_t& other) const + { + return (v256[0] == other.v256[0]) && (v256[1] == other.v256[1]); + } + + bool operator!=(const simd64_t& other) const + { + return (v256[0] != other.v256[0]) || (v256[1] != other.v256[1]); + } +}; + template TBase EvaluateUnaryScalarSpecialized(genTreeOps oper, TBase arg0) { diff --git a/src/coreclr/jit/simdashwintrinsic.h b/src/coreclr/jit/simdashwintrinsic.h index 556e937afe9ba..7bce4330ae6ad 100644 --- a/src/coreclr/jit/simdashwintrinsic.h +++ b/src/coreclr/jit/simdashwintrinsic.h @@ -14,6 +14,7 @@ enum class SimdAsHWIntrinsicClassId Vector4, VectorT128, VectorT256, + VectorT512, }; enum class SimdAsHWIntrinsicFlag : unsigned int diff --git a/src/coreclr/jit/simdashwintrinsiclistxarch.h b/src/coreclr/jit/simdashwintrinsiclistxarch.h index 2afa558e123fe..397483a98c12b 100644 --- a/src/coreclr/jit/simdashwintrinsiclistxarch.h +++ b/src/coreclr/jit/simdashwintrinsiclistxarch.h @@ -387,6 +387,13 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256, WidenUpper, SIMD_AS_HWINTRINSIC_ID(VectorT256, WithElement, 3, {NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Xor, 2, {NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor}, SimdAsHWIntrinsicFlag::None) +// 
************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* +// ISA ID Name NumArg Instructions Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* +// Vector Intrinsics +SIMD_AS_HWINTRINSIC_ID(VectorT512, get_Zero, 0, {NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero}, SimdAsHWIntrinsicFlag::None) + #undef SIMD_AS_HWINTRINSIC_NM #undef SIMD_AS_HWINTRINSIC_ID diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index cbe18a610f3ab..634c20c0ce8f0 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ b/src/coreclr/jit/simdcodegenxarch.cpp @@ -406,7 +406,7 @@ void CodeGen::genSimdUpperSave(GenTreeIntrinsic* node) assert(node->gtIntrinsicName == NI_SIMD_UpperSave); GenTree* op1 = node->gtGetOp1(); - assert(op1->IsLocal() && (op1->TypeGet() == TYP_SIMD32)); + assert(op1->IsLocal() && op1->TypeIs(TYP_SIMD32, TYP_SIMD64)); regNumber tgtReg = node->GetRegNum(); regNumber op1Reg = genConsumeReg(op1); @@ -414,6 +414,8 @@ void CodeGen::genSimdUpperSave(GenTreeIntrinsic* node) if (tgtReg != REG_NA) { + // We should never save to register for zmm. + assert(op1->TypeGet() == TYP_SIMD32); GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tgtReg, op1Reg, 0x01); genProduceReg(node); } @@ -425,10 +427,19 @@ void CodeGen::genSimdUpperSave(GenTreeIntrinsic* node) LclVarDsc* varDsc = compiler->lvaGetDesc(varNum); assert(varDsc->lvOnFrame); - // We want to store this to the upper 16 bytes of this localVar's home. - int offs = 16; + if (op1->TypeGet() == TYP_SIMD32) + { + // We want to store this to the upper 16 bytes of this localVar's home. + int offs = 16; - GetEmitter()->emitIns_S_R_I(INS_vextractf128, EA_32BYTE, varNum, offs, op1Reg, 0x01); + GetEmitter()->emitIns_S_R_I(INS_vextractf128, EA_32BYTE, varNum, offs, op1Reg, 0x01); + } + else + { + assert(op1->TypeGet() == TYP_SIMD64); + // We will save the whole 64 bytes for zmm. 
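// Editor's note (not part of the patch): the upper-save placement differs by register width.
// For a ymm local only the upper 16 bytes are written to the local's home slot, at offset 16,
// via vextractf128; for a zmm local the code stores all 64 bytes at offset 0 with movups.
// A standalone sketch of that placement decision (stand-in types, not JIT code):

#include <cassert>
#include <cstdio>

struct UpperSaveSlot
{
    int offset; // byte offset within the local's home location
    int size;   // number of bytes written
};

// simdSize is the width in bytes of the live SIMD local (32 for ymm, 64 for zmm).
static UpperSaveSlot GetUpperSaveSlot(int simdSize)
{
    assert((simdSize == 32) || (simdSize == 64));
    if (simdSize == 32)
    {
        return {16, 16}; // upper 128-bit lane only, stored into the second 16 bytes
    }
    return {0, 64};      // whole 64-byte value for zmm
}

int main()
{
    UpperSaveSlot ymm = GetUpperSaveSlot(32);
    UpperSaveSlot zmm = GetUpperSaveSlot(64);
    printf("ymm: offs=%d size=%d, zmm: offs=%d size=%d\n", ymm.offset, ymm.size, zmm.offset, zmm.size);
    return 0;
}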
+ GetEmitter()->emitIns_S_R(INS_movups, EA_64BYTE, op1Reg, varNum, 0); + } } } @@ -452,7 +463,7 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) assert(node->gtIntrinsicName == NI_SIMD_UpperRestore); GenTree* op1 = node->gtGetOp1(); - assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32); + assert(op1->IsLocal() && op1->TypeIs(TYP_SIMD32, TYP_SIMD64)); regNumber srcReg = node->GetRegNum(); regNumber lclVarReg = genConsumeReg(op1); @@ -460,6 +471,8 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) if (srcReg != REG_NA) { + // We should never save to register for zmm. + assert(op1->TypeGet() == TYP_SIMD32); GetEmitter()->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, lclVarReg, srcReg, 0x01); } else @@ -468,9 +481,18 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) unsigned varNum = op1->AsLclVarCommon()->GetLclNum(); LclVarDsc* varDsc = compiler->lvaGetDesc(varNum); assert(varDsc->lvOnFrame); - // We will load this from the upper 16 bytes of this localVar's home. - int offs = 16; - GetEmitter()->emitIns_R_R_S_I(INS_vinsertf128, EA_32BYTE, lclVarReg, lclVarReg, varNum, offs, 0x01); + if (op1->TypeGet() == TYP_SIMD32) + { + // We will load this from the upper 16 bytes of this localVar's home. + int offs = 16; + GetEmitter()->emitIns_R_R_S_I(INS_vinsertf128, EA_32BYTE, lclVarReg, lclVarReg, varNum, offs, 0x01); + } + else + { + assert(op1->TypeGet() == TYP_SIMD64); + // We will restore the whole 64 bytes for zmm. + GetEmitter()->emitIns_R_S(INS_movups, EA_64BYTE, lclVarReg, varNum, 0); + } } } diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 64af2659bd592..ac3f0ca7e8c02 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -101,6 +101,7 @@ #define REGSIZE_BYTES 8 // number of bytes in one register #define XMM_REGSIZE_BYTES 16 // XMM register size in bytes #define YMM_REGSIZE_BYTES 32 // YMM register size in bytes + #define ZMM_REGSIZE_BYTES 64 // ZMM register size in bytes #define CODE_ALIGN 1 // code alignment requirement #define STACK_ALIGN 16 // stack alignment requirement diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h index 09c6b6b0b04ef..dffd6adf2efb0 100644 --- a/src/coreclr/jit/targetx86.h +++ b/src/coreclr/jit/targetx86.h @@ -102,6 +102,7 @@ #define XMM_REGSIZE_BYTES 16 // XMM register size in bytes #define YMM_REGSIZE_BYTES 32 // YMM register size in bytes + #define ZMM_REGSIZE_BYTES 64 // ZMM register size in bytes #define REGNUM_BITS 6 // number of bits in a REG_* diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index 4265c2a0daa82..b9140aa601f33 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -57,12 +57,12 @@ DEF_TP(BLK ,"blk" , TYP_BLK, TI_ERROR, 0, 0, 0, 1, 4, VTF_ANY) / DEF_TP(LCLBLK ,"lclBlk" , TYP_LCLBLK, TI_ERROR, 0, 0, 0, 1, 4, VTF_ANY) // preallocated memory for locspace #ifdef FEATURE_SIMD -// Amd64: The size and alignment of SIMD vector varies at JIT time based on whether target arch supports AVX or SSE2. 
DEF_TP(SIMD8 ,"simd8" , TYP_SIMD8, TI_STRUCT, 8, 8, 8, 2, 8, VTF_S|VTF_VEC) DEF_TP(SIMD12 ,"simd12" , TYP_SIMD12, TI_STRUCT,12,16, 16, 4,16, VTF_S|VTF_VEC) DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, TI_STRUCT,16,16, 16, 4,16, VTF_S|VTF_VEC) #if defined(TARGET_XARCH) DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, TI_STRUCT,32,32, 32, 8,16, VTF_S|VTF_VEC) +DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, TI_STRUCT,64,64, 64, 16,16, VTF_S|VTF_VEC) #endif // TARGET_XARCH #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/utils.cpp b/src/coreclr/jit/utils.cpp index 5d177ba592be6..2823eb84f03df 100644 --- a/src/coreclr/jit/utils.cpp +++ b/src/coreclr/jit/utils.cpp @@ -244,13 +244,21 @@ const char* getRegNameFloat(regNumber reg, var_types type) #ifdef FEATURE_SIMD static const char* regNamesYMM[] = { #define REGDEF(name, rnum, mask, sname) "y" sname, +#include "register.h" + }; + static const char* regNamesZMM[] = { +#define REGDEF(name, rnum, mask, sname) "z" sname, #include "register.h" }; #endif // FEATURE_SIMD assert((unsigned)reg < ArrLen(regNamesFloat)); #if defined(FEATURE_SIMD) && defined(TARGET_XARCH) - if (type == TYP_SIMD32) + if (type == TYP_SIMD64) + { + return regNamesZMM[reg]; + } + else if (type == TYP_SIMD32) { return regNamesYMM[reg]; } diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 91b1a5b4be9ee..39bd8898548e5 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -445,6 +445,7 @@ ValueNumStore::ValueNumStore(Compiler* comp, CompAllocator alloc) , m_simd12CnsMap(nullptr) , m_simd16CnsMap(nullptr) , m_simd32CnsMap(nullptr) + , m_simd64CnsMap(nullptr) #endif // FEATURE_SIMD , m_VNFunc0Map(nullptr) , m_VNFunc1Map(nullptr) @@ -1700,6 +1701,12 @@ ValueNumStore::Chunk::Chunk(CompAllocator alloc, ValueNum* pNextBaseVN, var_type m_defs = new (alloc) Alloc::Type[ChunkSize]; break; } + + case TYP_SIMD64: + { + m_defs = new (alloc) Alloc::Type[ChunkSize]; + break; + } #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -1859,6 +1866,11 @@ ValueNum ValueNumStore::VNForSimd32Con(simd32_t cnsVal) { return VnForConst(cnsVal, GetSimd32CnsMap(), TYP_SIMD32); } + +ValueNum ValueNumStore::VNForSimd64Con(simd64_t cnsVal) +{ + return VnForConst(cnsVal, GetSimd64CnsMap(), TYP_SIMD64); +} #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -1963,6 +1975,11 @@ ValueNum ValueNumStore::VNZeroForType(var_types typ) { return VNForSimd32Con({}); } + + case TYP_SIMD64: + { + return VNForSimd64Con({}); + } #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -3374,6 +3391,16 @@ simd32_t ValueNumStore::GetConstantSimd32(ValueNum argVN) return ConstantValue(argVN); } + +// Given a simd64 constant value number return its value as a simd32. 
+// +simd64_t ValueNumStore::GetConstantSimd64(ValueNum argVN) +{ + assert(IsVNConstant(argVN)); + assert(TypeOfVN(argVN) == TYP_SIMD64); + + return ConstantValue(argVN); +} #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -8167,6 +8194,16 @@ void ValueNumStore::vnDump(Compiler* comp, ValueNum vn, bool isPtr) cnsVal.u64[2], cnsVal.u64[3]); break; } + + case TYP_SIMD64: + { + simd64_t cnsVal = GetConstantSimd64(vn); + printf( + "Simd64Cns[0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx]", + cnsVal.u64[0], cnsVal.u64[1], cnsVal.u64[2], cnsVal.u64[3], cnsVal.u64[4], cnsVal.u64[5], + cnsVal.u64[6], cnsVal.u64[7]); + break; + } #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -9665,6 +9702,7 @@ void Compiler::fgValueNumberTreeConst(GenTree* tree) #if defined(TARGET_XARCH) case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. tree->gtVNPair.SetBoth(vnStore->VNForSimd32Con(tree->AsVecCon()->gtSimd32Val)); break; #endif // TARGET_XARCH diff --git a/src/coreclr/jit/valuenum.h b/src/coreclr/jit/valuenum.h index 62a5bd22ad002..44fc0d503c992 100644 --- a/src/coreclr/jit/valuenum.h +++ b/src/coreclr/jit/valuenum.h @@ -354,6 +354,7 @@ class ValueNumStore simd16_t GetConstantSimd16(ValueNum argVN); #if defined(TARGET_XARCH) simd32_t GetConstantSimd32(ValueNum argVN); + simd64_t GetConstantSimd64(ValueNum argVN); #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -438,6 +439,7 @@ class ValueNumStore ValueNum VNForSimd16Con(simd16_t cnsVal); #if defined(TARGET_XARCH) ValueNum VNForSimd32Con(simd32_t cnsVal); + ValueNum VNForSimd64Con(simd64_t cnsVal); #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -1643,6 +1645,50 @@ class ValueNumStore } return m_simd32CnsMap; } + + struct Simd64PrimitiveKeyFuncs : public JitKeyFuncsDefEquals + { + static bool Equals(simd64_t x, simd64_t y) + { + return x == y; + } + + static unsigned GetHashCode(const simd64_t val) + { + unsigned hash = 0; + + hash = static_cast(hash ^ val.u32[0]); + hash = static_cast(hash ^ val.u32[1]); + hash = static_cast(hash ^ val.u32[2]); + hash = static_cast(hash ^ val.u32[3]); + hash = static_cast(hash ^ val.u32[4]); + hash = static_cast(hash ^ val.u32[5]); + hash = static_cast(hash ^ val.u32[6]); + hash = static_cast(hash ^ val.u32[7]); + hash = static_cast(hash ^ val.u32[8]); + hash = static_cast(hash ^ val.u32[9]); + hash = static_cast(hash ^ val.u32[10]); + hash = static_cast(hash ^ val.u32[11]); + hash = static_cast(hash ^ val.u32[12]); + hash = static_cast(hash ^ val.u32[13]); + hash = static_cast(hash ^ val.u32[14]); + hash = static_cast(hash ^ val.u32[15]); + + return hash; + } + }; + + typedef VNMap Simd64ToValueNumMap; + Simd64ToValueNumMap* m_simd64CnsMap; + Simd64ToValueNumMap* GetSimd64CnsMap() + { + if (m_simd64CnsMap == nullptr) + { + m_simd64CnsMap = new (m_alloc) Simd64ToValueNumMap(m_alloc); + } + return m_simd64CnsMap; + } + #endif // FEATURE_SIMD template @@ -1790,6 +1836,13 @@ struct ValueNumStore::VarTypConv typedef simd32_t Type; typedef simd32_t Lang; }; + +template <> +struct ValueNumStore::VarTypConv +{ + typedef simd64_t Type; + typedef simd64_t Lang; +}; #endif // TARGET_XARCH #endif // FEATURE_SIMD @@ -1859,6 +1912,13 @@ FORCEINLINE simd32_t ValueNumStore::SafeGetConstantValue(Chunk* c, uns assert(c->m_typ == TYP_SIMD32); return reinterpret_cast::Lang*>(c->m_defs)[offset]; } + +template <> +FORCEINLINE simd64_t ValueNumStore::SafeGetConstantValue(Chunk* c, unsigned offset) +{ + assert(c->m_typ == TYP_SIMD64); + return 
reinterpret_cast::Lang*>(c->m_defs)[offset]; +} #endif // TARGET_XARCH template <> @@ -1917,6 +1977,20 @@ FORCEINLINE simd32_t ValueNumStore::ConstantValueInternal(ValueNum vn return SafeGetConstantValue(c, offset); } + +template <> +FORCEINLINE simd64_t ValueNumStore::ConstantValueInternal(ValueNum vn DEBUGARG(bool coerce)) +{ + Chunk* c = m_chunks.GetNoExpand(GetChunkNum(vn)); + assert(c->m_attribs == CEA_Const); + + unsigned offset = ChunkOffset(vn); + + assert(c->m_typ == TYP_SIMD64); + assert(!coerce); + + return SafeGetConstantValue(c, offset); +} #endif // TARGET_XARCH #endif // FEATURE_SIMD diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index 387028a6f84e9..8e888542fecfd 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -11949,7 +11949,11 @@ void CEEJitInfo::allocMem (AllocMemArgs *pArgs) S_SIZE_T totalSize = S_SIZE_T(codeSize); size_t roDataAlignment = sizeof(void*); - if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN)!= 0) + if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_RODATA_64BYTE_ALIGN)!= 0) + { + roDataAlignment = 64; + } + else if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN)!= 0) { roDataAlignment = 32; } diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index 68783c754fe6a..e11acfa265352 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -154,8 +154,8 @@ - - + +