diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 313ee56b5524f..bdd03f7fd8de8 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -5843,7 +5843,13 @@ bool emitter::IsMovInstruction(instruction ins)
         case INS_movaps:
         case INS_movd:
         case INS_movdqa:
+        case INS_movdqa32:
+        case INS_movdqa64:
         case INS_movdqu:
+        case INS_movdqu8:
+        case INS_movdqu16:
+        case INS_movdqu32:
+        case INS_movdqu64:
         case INS_movsdsse2:
         case INS_movss:
         case INS_movsx:
@@ -5927,8 +5933,23 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
         case INS_movupd:
         case INS_movups:
         {
-            // non EA_32BYTE moves clear the upper bits under VEX encoding
-            hasSideEffect = UseVEXEncoding() && (size != EA_32BYTE);
+            // TODO-XArch-AVX512 : Handle merge/mask scenarios once k-mask support is added for these.
+            // Non-EA_32BYTE moves clear the upper bits under VEX encoding; non-EA_64BYTE moves do so under EVEX.
+            if (UseVEXEncoding())
+            {
+                if (UseEvexEncoding())
+                {
+                    hasSideEffect = (size != EA_64BYTE);
+                }
+                else
+                {
+                    hasSideEffect = (size != EA_32BYTE);
+                }
+            }
+            else
+            {
+                hasSideEffect = false;
+            }
             break;
         }
 
@@ -5963,6 +5984,20 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
             break;
         }
 
+        case INS_movdqa32:
+        case INS_movdqa64:
+        case INS_movdqu8:
+        case INS_movdqu16:
+        case INS_movdqu32:
+        case INS_movdqu64:
+        {
+            // These EVEX instructions merge/mask based on a k-register.
+            // TODO-XArch-AVX512 : Handle merge/mask scenarios once k-mask support is added for these.
+            assert(UseEvexEncoding());
+            hasSideEffect = (size != EA_64BYTE);
+            break;
+        }
+
         case INS_movsxd:
         {
             // Sign-extends the source
@@ -6152,7 +6187,13 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN
         case INS_movapd:
         case INS_movaps:
         case INS_movdqa:
+        case INS_movdqa32:
+        case INS_movdqa64:
         case INS_movdqu:
+        case INS_movdqu8:
+        case INS_movdqu16:
+        case INS_movdqu32:
+        case INS_movdqu64:
         case INS_movsdsse2:
         case INS_movss:
         case INS_movupd:
@@ -17350,7 +17391,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
 
         case INS_movdqa:
+        case INS_movdqa32:
+        case INS_movdqa64:
         case INS_movdqu:
+        case INS_movdqu8:
+        case INS_movdqu16:
+        case INS_movdqu32:
+        case INS_movdqu64:
         case INS_movaps:
         case INS_movups:
         case INS_movapd:
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index a081a162d3af6..66a4a30602226 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -285,6 +285,7 @@ bool IsWEvexOpcodeExtension(const instrDesc* id)
        case INS_vfnmsub231sd:
        case INS_unpcklpd:
        case INS_vpermilpdvar:
+       case INS_movdqa64:
        case INS_movdqu16:
        case INS_movdqu64:
        case INS_vinsertf64x4:
@@ -402,6 +403,7 @@ bool IsWEvexOpcodeExtension(const instrDesc* id)
        case INS_vpdpbusds:
        case INS_vpdpwssds:
        case INS_vpermilpsvar:
+       case INS_movdqa32:
        case INS_movdqu8:
        case INS_movdqu32:
        case INS_vinsertf32x8:
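Reviewer note: the subtle part of the emitter change is `HasSideEffect`. Once EVEX is in play the architectural register is 512 bits wide, so even a 128- or 256-bit move zeroes the bits above the copied width and can no longer be dropped as a redundant self-move; under plain VEX the threshold is 256 bits, and legacy SSE encodings leave the upper bits untouched. A minimal sketch of that decision under those assumptions — the names below are stand-ins, not the emitter's actual API:

```cpp
#include <cassert>

// Hypothetical stand-ins for the emitter's operand sizes and encoding queries;
// only the shape of the decision mirrors the diff, the names do not.
enum EmitAttrSketch
{
    EA_16BYTE_S = 16, // xmm-sized operand
    EA_32BYTE_S = 32, // ymm-sized operand
    EA_64BYTE_S = 64  // zmm-sized operand
};

// A register-to-register move "has a side effect" when it zeroes bits the copy
// itself does not write: EVEX-encoded moves narrower than 64 bytes zero up to
// bit 511, VEX-encoded moves narrower than 32 bytes zero up to bit 255, and
// legacy SSE moves preserve the upper bits entirely.
bool MoveHasUpperBitSideEffect(EmitAttrSketch size, bool useVex, bool useEvex)
{
    assert(!useEvex || useVex); // EVEX support implies VEX support

    if (!useVex)
    {
        return false; // legacy SSE: upper bits untouched
    }
    return useEvex ? (size != EA_64BYTE_S) : (size != EA_32BYTE_S);
}
```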
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 5a84e860fbefc..a373e66df7d7b 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -18930,6 +18930,7 @@ bool GenTree::isContainableHWIntrinsic() const
         case NI_SSE2_LoadAlignedVector128:
         case NI_SSE2_LoadScalarVector128:
         case NI_AVX_LoadAlignedVector256:
+        case NI_AVX512F_LoadAlignedVector512:
         {
             // These loads are contained as part of a HWIntrinsic operation
             return true;
@@ -21555,7 +21556,12 @@ GenTree* Compiler::gtNewSimdLoadAlignedNode(
 
     NamedIntrinsic intrinsic = NI_Illegal;
 
-    if (simdSize == 32)
+    if (simdSize == 64)
+    {
+        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+        intrinsic = NI_AVX512F_LoadAlignedVector512;
+    }
+    else if (simdSize == 32)
     {
         assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
         intrinsic = NI_AVX_LoadAlignedVector256;
@@ -21616,7 +21622,15 @@ GenTree* Compiler::gtNewSimdLoadNonTemporalNode(
     // We don't guarantee a non-temporal load will actually occur, so fallback
     // to regular aligned loads if the required ISA isn't supported.
 
-    if (simdSize == 32)
+    if (simdSize == 64)
+    {
+        if (compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+        {
+            intrinsic     = NI_AVX512F_LoadAlignedVector512NonTemporal;
+            isNonTemporal = true;
+        }
+    }
+    else if (simdSize == 32)
     {
         if (compOpportunisticallyDependsOn(InstructionSet_AVX2))
         {
@@ -22877,7 +22891,12 @@ GenTree* Compiler::gtNewSimdStoreAlignedNode(
 
     NamedIntrinsic intrinsic = NI_Illegal;
 
-    if (simdSize == 32)
+    if (simdSize == 64)
+    {
+        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+        intrinsic = NI_AVX512F_StoreAligned;
+    }
+    else if (simdSize == 32)
     {
         assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
         intrinsic = NI_AVX_StoreAligned;
@@ -22934,7 +22953,12 @@ GenTree* Compiler::gtNewSimdStoreNonTemporalNode(
 
     NamedIntrinsic intrinsic = NI_Illegal;
 
-    if (simdSize == 32)
+    if (simdSize == 64)
+    {
+        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+        intrinsic = NI_AVX512F_StoreAlignedNonTemporal;
+    }
+    else if (simdSize == 32)
     {
         assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
         intrinsic = NI_AVX_StoreAlignedNonTemporal;
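The non-temporal path deserves a comment: `LoadAlignedNonTemporal` is only a hint, so the node constructor checks the ISA opportunistically and silently degrades to a plain aligned load when `vmovntdqa` is unavailable. A rough model of that dispatch — the enum and function here are invented for the sketch and are not JIT code:

```cpp
// Illustrative model of the size/ISA dispatch in gtNewSimdLoadNonTemporalNode.
enum class AlignedLoadKind
{
    Vector128,      // SSE/SSE2 aligned load (16 bytes)
    Vector256,      // AVX aligned load (32 bytes)
    Vector512,      // AVX512F aligned load (64 bytes)
    NonTemporal256, // AVX2 vmovntdqa ymm
    NonTemporal512  // AVX512F vmovntdqa zmm
};

AlignedLoadKind SelectNonTemporalLoad(unsigned simdSize, bool hasAvx2, bool hasAvx512f, bool& isNonTemporal)
{
    isNonTemporal = false;

    if (simdSize == 64)
    {
        if (hasAvx512f)
        {
            isNonTemporal = true;
            return AlignedLoadKind::NonTemporal512;
        }
        return AlignedLoadKind::Vector512; // hint dropped: plain aligned load
    }

    if (simdSize == 32)
    {
        if (hasAvx2)
        {
            isNonTemporal = true;
            return AlignedLoadKind::NonTemporal256;
        }
        return AlignedLoadKind::Vector256; // hint dropped: plain aligned load
    }

    return AlignedLoadKind::Vector128; // 16-byte path (SSE4.1 movntdqa case elided)
}
```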
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index 894663a1575bc..d581ddc5a5711 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -238,6 +238,14 @@ HARDWARE_INTRINSIC(Vector256, Xor,
 //  Vector512 Intrinsics
 HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(Vector512, Load, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, LoadUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, Store, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, StoreAligned, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
+HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
 
 // ***************************************************************************************************************************************************************************************************************
 //  ISA          Function name                                   SIMD size  NumArg  Instructions                                                          Category  Flags
@@ -729,6 +737,11 @@ HARDWARE_INTRINSIC(AVX2, Xor,
 //  AVX512F Intrinsics
 HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa64, INS_movdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512NonTemporal, 64, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX512F, StoreAligned, 64, 2, {INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa64, INS_movdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(AVX512F, StoreAlignedNonTemporal, 64, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
+
 // ***************************************************************************************************************************************************************************************************************
 //  ISA          Function name                                   SIMD size  NumArg  Instructions                                                          Category  Flags
 // ***************************************************************************************************************************************************************************************************************
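A note on the instruction choices in the `LoadAlignedVector512`/`StoreAligned` rows: EVEX has no untyped `vmovdqa`, only element-size-tagged forms whose tag matters once a k-mask selects lanes, so the table uses `movdqa32` for every integer base type of 32 bits or less (AVX-512 has no aligned byte/word move) and `movdqa64` for the 64-bit ones, while float/double keep `movaps`/`movapd`. A compact model of that mapping, with hypothetical names:

```cpp
// Compact model of the per-base-type instruction selection in the rows above;
// the enums are invented for this sketch.
enum class BaseType { Byte, SByte, Short, UShort, Int, UInt, Long, ULong, Float, Double };
enum class AlignedMoveIns { movdqa32, movdqa64, movaps, movapd };

AlignedMoveIns SelectAlignedMove(BaseType t)
{
    switch (t)
    {
        case BaseType::Long:
        case BaseType::ULong:
            return AlignedMoveIns::movdqa64;
        case BaseType::Float:
            return AlignedMoveIns::movaps;
        case BaseType::Double:
            return AlignedMoveIns::movapd;
        default:
            // All 8/16/32-bit integer base types share movdqa32: without a
            // k-mask, the element-size tag does not change the move's behavior.
            return AlignedMoveIns::movdqa32;
    }
}
```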
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index 3b46d9ba9c7a0..802a3bd97e9b9 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -1665,8 +1665,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
         case NI_AVX_LoadVector256:
         case NI_Vector128_Load:
         case NI_Vector256_Load:
+        case NI_Vector512_Load:
         case NI_Vector128_LoadUnsafe:
         case NI_Vector256_LoadUnsafe:
+        case NI_Vector512_LoadUnsafe:
         {
             if (sig->numArgs == 2)
             {
@@ -1698,6 +1700,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_LoadAligned:
         case NI_Vector256_LoadAligned:
+        case NI_Vector512_LoadAligned:
         {
             assert(sig->numArgs == 1);
 
@@ -1716,6 +1719,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_LoadAlignedNonTemporal:
         case NI_Vector256_LoadAlignedNonTemporal:
+        case NI_Vector512_LoadAlignedNonTemporal:
         {
             assert(sig->numArgs == 1);
 
@@ -2086,8 +2090,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_Store:
         case NI_Vector256_Store:
+        case NI_Vector512_Store:
         case NI_Vector128_StoreUnsafe:
         case NI_Vector256_StoreUnsafe:
+        case NI_Vector512_StoreUnsafe:
         {
             assert(retType == TYP_VOID);
             var_types simdType = getSIMDTypeForSize(simdSize);
@@ -2130,6 +2136,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_StoreAligned:
         case NI_Vector256_StoreAligned:
+        case NI_Vector512_StoreAligned:
         {
             assert(sig->numArgs == 2);
             assert(retType == TYP_VOID);
@@ -2155,6 +2162,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_StoreAlignedNonTemporal:
         case NI_Vector256_StoreAlignedNonTemporal:
+        case NI_Vector512_StoreAlignedNonTemporal:
         {
             assert(sig->numArgs == 2);
             assert(retType == TYP_VOID);
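These importer cases are mechanical, but the `sig->numArgs == 2` path they share is worth spelling out: the two-argument `LoadUnsafe`/`StoreUnsafe` overloads take an element offset that, presumably, gets scaled to bytes and folded into the address before the 64-byte load or store node is built. A sketch of that arithmetic (the helper below is illustrative, not the importer's actual code):

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical model of the address computed for
// Vector512.LoadUnsafe(ref T source, nuint elementOffset): scale the element
// index by the element size and add it to the base address, so the offset can
// become part of the load's address mode.
const void* ComputeLoadAddress(const void* base, size_t elementOffset, size_t elementSize)
{
    return static_cast<const uint8_t*>(base) + elementOffset * elementSize;
}
```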
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index a9825a20c30c3..a7d02c5c2af9f 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -632,20 +632,22 @@ INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE,
 INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(FIRST_AVX512F_INSTRUCTION, "FIRST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None)
-INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None)
-INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
+INST3(movdqa32, "movdqa32", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None)
+INST3(movdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | INS_FLAGS_None)
+INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None)
+INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | INS_FLAGS_None)
+INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
+INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
 INST3(LAST_AVX512F_INSTRUCTION, "LAST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(FIRST_AVX512BW_INSTRUCTION, "FIRST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None)
-INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None)
+INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | INS_FLAGS_None)
+INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | INS_FLAGS_None)
 INST3(LAST_AVX512BW_INSTRUCTION, "LAST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(FIRST_AVX512DQ_INSTRUCTION, "FIRST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
+INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed single-precision floating point values
+INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed doubleword integer values
 INST3(LAST_AVX512DQ_INSTRUCTION, "LAST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 761e9f09b0b26..3a92611fa4cb2 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -6920,6 +6920,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
             case NI_SSE_LoadAlignedVector128:
             case NI_SSE2_LoadAlignedVector128:
             case NI_AVX_LoadAlignedVector256:
+            case NI_AVX512F_LoadAlignedVector512:
             {
                 return supportsAlignedSIMDLoads;
             }
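Finally, the lowering change is what lets the new aligned load fold into its consumer instead of occupying a register of its own. A standalone illustration of the same fold at the intrinsics level (ordinary C++ compiled with AVX-512 enabled, e.g. -mavx512f; the function name is invented):

```cpp
#include <immintrin.h>

// With optimization enabled, a compiler folds the aligned load into the add's
// memory operand (vaddps zmm0, zmm1, zmmword ptr [mem]) rather than emitting a
// separate vmovaps. Containment of NI_AVX512F_LoadAlignedVector512 performs the
// same fold on Vector512 IR, which is why both isContainableHWIntrinsic and
// IsContainableHWIntrinsicOp learn about the new intrinsic in this diff.
__m512 AddFromAligned(__m512 acc, const float* p /* must be 64-byte aligned */)
{
    __m512 v = _mm512_load_ps(p); // aligned 64-byte load
    return _mm512_add_ps(acc, v); // load is foldable into vaddps' memory operand
}
```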