From d08430573c93c67e1a0856a4736fda646b85510b Mon Sep 17 00:00:00 2001
From: Deepak Rajendrakumaran
Date: Mon, 6 Mar 2023 09:19:25 -0800
Subject: [PATCH 1/3] Load(), LoadUnsafe(), LoadAligned(), LoadAlignedNonTemporal()

---
 src/coreclr/jit/emitxarch.cpp          | 36 ++++++++++++++++++++++++--
 src/coreclr/jit/emitxarch.h            |  2 ++
 src/coreclr/jit/gentree.cpp            | 18 +++++++++++--
 src/coreclr/jit/hwintrinsiclistxarch.h |  7 +++++
 src/coreclr/jit/hwintrinsicxarch.cpp   |  4 +++
 src/coreclr/jit/instrsxarch.h          | 18 +++++++------
 src/coreclr/jit/lowerxarch.cpp         |  1 +
 7 files changed, 74 insertions(+), 12 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 313ee56b5524f..c2209f63c4e8c 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -5855,6 +5855,12 @@ bool emitter::IsMovInstruction(instruction ins)
         }
 
 #if defined(TARGET_AMD64)
+        case INS_movdqa32:
+        case INS_movdqa64:
+        case INS_movdqu8:
+        case INS_movdqu16:
+        case INS_movdqu32:
+        case INS_movdqu64:
         case INS_movq:
         case INS_movsxd:
         {
@@ -5927,8 +5933,9 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
         case INS_movupd:
         case INS_movups:
         {
-            // non EA_32BYTE moves clear the upper bits under VEX encoding
-            hasSideEffect = UseVEXEncoding() && (size != EA_32BYTE);
+            // TODO-XArch-AVX512 : Handle merge/mask scenarios once k-mask support is added for these.
+            // Non-EA_32BYTE and non-EA_64BYTE moves clear the upper bits under VEX and EVEX encodings, respectively.
+            hasSideEffect = (UseVEXEncoding() && (size <= EA_32BYTE)) || (UseEvexEncoding() && (size <= EA_64BYTE));
             break;
         }
 
@@ -5963,6 +5970,19 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
             break;
         }
 
+        case INS_movdqa32:
+        case INS_movdqa64:
+        case INS_movdqu8:
+        case INS_movdqu16:
+        case INS_movdqu32:
+        case INS_movdqu64:
+        {
+            // These EVEX instructions merge/mask based on the k-register.
+            // TODO-XArch-AVX512 : Handle merge/mask scenarios once k-mask support is added for these.
+ hasSideEffect = UseEvexEncoding() && (size < EA_64BYTE); + break; + } + case INS_movsxd: { // Sign-extends the source @@ -6152,7 +6172,13 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN case INS_movapd: case INS_movaps: case INS_movdqa: + case INS_movdqa32: + case INS_movdqa64: case INS_movdqu: + case INS_movdqu8: + case INS_movdqu16: + case INS_movdqu32: + case INS_movdqu64: case INS_movsdsse2: case INS_movss: case INS_movupd: @@ -17350,7 +17376,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case INS_movdqa: + case INS_movdqa32: + case INS_movdqa64: case INS_movdqu: + case INS_movdqu8: + case INS_movdqu16: + case INS_movdqu32: + case INS_movdqu64: case INS_movaps: case INS_movups: case INS_movapd: diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index a081a162d3af6..66a4a30602226 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -285,6 +285,7 @@ bool IsWEvexOpcodeExtension(const instrDesc* id) case INS_vfnmsub231sd: case INS_unpcklpd: case INS_vpermilpdvar: + case INS_movdqa64: case INS_movdqu16: case INS_movdqu64: case INS_vinsertf64x4: @@ -402,6 +403,7 @@ bool IsWEvexOpcodeExtension(const instrDesc* id) case INS_vpdpbusds: case INS_vpdpwssds: case INS_vpermilpsvar: + case INS_movdqa32: case INS_movdqu8: case INS_movdqu32: case INS_vinsertf32x8: diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 5a84e860fbefc..34844fca2e8dc 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -18930,6 +18930,7 @@ bool GenTree::isContainableHWIntrinsic() const case NI_SSE2_LoadAlignedVector128: case NI_SSE2_LoadScalarVector128: case NI_AVX_LoadAlignedVector256: + case NI_AVX512F_LoadAlignedVector512: { // These loads are contained as part of a HWIntrinsic operation return true; @@ -21555,7 +21556,12 @@ GenTree* Compiler::gtNewSimdLoadAlignedNode( NamedIntrinsic intrinsic = NI_Illegal; - if (simdSize == 32) + if (simdSize == 64) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + intrinsic = NI_AVX512F_LoadAlignedVector512; + } + else if (simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_LoadAlignedVector256; @@ -21616,7 +21622,15 @@ GenTree* Compiler::gtNewSimdLoadNonTemporalNode( // We don't guarantee a non-temporal load will actually occur, so fallback // to regular aligned loads if the required ISA isn't supported. 
- if (simdSize == 32) + if (simdSize == 64) + { + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + intrinsic = NI_AVX512F_LoadAlignedVector512NonTemporal; + isNonTemporal = true; + } + } + else if (simdSize == 32) { if (compOpportunisticallyDependsOn(InstructionSet_AVX2)) { diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 894663a1575bc..4a8ed7fdec715 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -238,6 +238,11 @@ HARDWARE_INTRINSIC(Vector256, Xor, // Vector512 Intrinsics HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector512, Load, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector512, LoadUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -729,6 +734,8 @@ HARDWARE_INTRINSIC(AVX2, Xor, // AVX512F Intrinsics HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa64, INS_movdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) 
+HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512NonTemporal, 64, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 3b46d9ba9c7a0..2f5392ce6d029 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1665,8 +1665,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_AVX_LoadVector256: case NI_Vector128_Load: case NI_Vector256_Load: + case NI_Vector512_Load: case NI_Vector128_LoadUnsafe: case NI_Vector256_LoadUnsafe: + case NI_Vector512_LoadUnsafe: { if (sig->numArgs == 2) { @@ -1698,6 +1700,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_LoadAligned: case NI_Vector256_LoadAligned: + case NI_Vector512_LoadAligned: { assert(sig->numArgs == 1); @@ -1716,6 +1719,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_LoadAlignedNonTemporal: case NI_Vector256_LoadAlignedNonTemporal: + case NI_Vector512_LoadAlignedNonTemporal: { assert(sig->numArgs == 1); diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index a9825a20c30c3..a7d02c5c2af9f 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -632,20 +632,22 @@ INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(FIRST_AVX512F_INSTRUCTION, "FIRST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) -INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) -INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values -INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values +INST3(movdqa32, "movdqa32", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None) +INST3(movdqa64, "movdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_64Bit | INS_FLAGS_None) +INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit | INS_FLAGS_None) +INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_64Bit | INS_FLAGS_None) +INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values 
+INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, Input_64Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
 INST3(LAST_AVX512F_INSTRUCTION, "LAST_AVX512F_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(FIRST_AVX512BW_INSTRUCTION, "FIRST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None)
-INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None)
+INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_8Bit | INS_FLAGS_None)
+INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_16Bit | INS_FLAGS_None)
 INST3(LAST_AVX512BW_INSTRUCTION, "LAST_AVX512BW_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(FIRST_AVX512DQ_INSTRUCTION, "FIRST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values
-INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values
+INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed single-precision floating point values
+INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed doubleword integer values
 INST3(LAST_AVX512DQ_INSTRUCTION, "LAST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 761e9f09b0b26..3a92611fa4cb2 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -6920,6 +6920,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
             case NI_SSE_LoadAlignedVector128:
             case NI_SSE2_LoadAlignedVector128:
             case NI_AVX_LoadAlignedVector256:
+            case NI_AVX512F_LoadAlignedVector512:
             {
                 return supportsAlignedSIMDLoads;
             }

From 83baea1ad47ccf1a2fd55a78457233e6f16add04 Mon Sep 17 00:00:00 2001
From: Deepak Rajendrakumaran
Date: Tue, 7 Mar 2023 11:34:07 -0800
Subject: [PATCH 2/3] Store(), StoreUnsafe(), StoreAligned(), StoreAlignedNonTemporal()

---
 src/coreclr/jit/gentree.cpp            | 14 ++++++++++++--
 src/coreclr/jit/hwintrinsiclistxarch.h | 16 +++++++++++-----
 src/coreclr/jit/hwintrinsicxarch.cpp   |  4 ++++
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 34844fca2e8dc..a373e66df7d7b 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -22891,7 +22891,12 @@ GenTree* Compiler::gtNewSimdStoreAlignedNode(
 
     NamedIntrinsic intrinsic = NI_Illegal;
 
-    if (simdSize == 32)
+    if (simdSize == 64)
+    {
+        assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+        intrinsic = NI_AVX512F_StoreAligned;
+    }
+    else if 
(simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_StoreAligned; @@ -22948,7 +22953,12 @@ GenTree* Compiler::gtNewSimdStoreNonTemporalNode( NamedIntrinsic intrinsic = NI_Illegal; - if (simdSize == 32) + if (simdSize == 64) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + intrinsic = NI_AVX512F_StoreAlignedNonTemporal; + } + else if (simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); intrinsic = NI_AVX_StoreAlignedNonTemporal; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 4a8ed7fdec715..d581ddc5a5711 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -238,11 +238,14 @@ HARDWARE_INTRINSIC(Vector256, Xor, // Vector512 Intrinsics HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(Vector512, Load, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector512, LoadUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) - +HARDWARE_INTRINSIC(Vector512, Load, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, LoadAligned, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, LoadAlignedNonTemporal, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, LoadUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, Store, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, StoreAligned, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -736,6 +739,9 @@ HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512, 64, 1, {INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa64, INS_movdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(AVX512F, LoadAlignedVector512NonTemporal, 64, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX512F, StoreAligned, 64, 2, {INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa32, INS_movdqa64, INS_movdqa64, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) +HARDWARE_INTRINSIC(AVX512F, StoreAlignedNonTemporal, 64, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 2f5392ce6d029..802a3bd97e9b9 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2090,8 +2090,10 @@ GenTree* 
Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_Store:
         case NI_Vector256_Store:
+        case NI_Vector512_Store:
         case NI_Vector128_StoreUnsafe:
         case NI_Vector256_StoreUnsafe:
+        case NI_Vector512_StoreUnsafe:
         {
             assert(retType == TYP_VOID);
             var_types simdType = getSIMDTypeForSize(simdSize);
@@ -2134,6 +2136,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_StoreAligned:
         case NI_Vector256_StoreAligned:
+        case NI_Vector512_StoreAligned:
         {
             assert(sig->numArgs == 2);
             assert(retType == TYP_VOID);
@@ -2159,6 +2162,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_StoreAlignedNonTemporal:
         case NI_Vector256_StoreAlignedNonTemporal:
+        case NI_Vector512_StoreAlignedNonTemporal:
         {
             assert(sig->numArgs == 2);
             assert(retType == TYP_VOID);

From a50908a0d40ff371bae0b8ee4a6ac088a0744ea4 Mon Sep 17 00:00:00 2001
From: Deepak Rajendrakumaran
Date: Tue, 7 Mar 2023 12:33:27 -0800
Subject: [PATCH 3/3] Fix 'HasSideEffect()' check used for RedundantMov

---
 src/coreclr/jit/emitxarch.cpp | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index c2209f63c4e8c..bdd03f7fd8de8 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -5843,7 +5843,13 @@ bool emitter::IsMovInstruction(instruction ins)
         case INS_movaps:
         case INS_movd:
         case INS_movdqa:
+        case INS_movdqa32:
+        case INS_movdqa64:
         case INS_movdqu:
+        case INS_movdqu8:
+        case INS_movdqu16:
+        case INS_movdqu32:
+        case INS_movdqu64:
         case INS_movsdsse2:
         case INS_movss:
         case INS_movsx:
@@ -5855,12 +5861,6 @@ bool emitter::IsMovInstruction(instruction ins)
         }
 
 #if defined(TARGET_AMD64)
-        case INS_movdqa32:
-        case INS_movdqa64:
-        case INS_movdqu8:
-        case INS_movdqu16:
-        case INS_movdqu32:
-        case INS_movdqu64:
         case INS_movq:
         case INS_movsxd:
         {
@@ -5935,7 +5935,21 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
         {
             // TODO-XArch-AVX512 : Handle merge/mask scenarios once k-mask support is added for these.
             // Non-EA_32BYTE and non-EA_64BYTE moves clear the upper bits under VEX and EVEX encodings, respectively.
-            hasSideEffect = (UseVEXEncoding() && (size <= EA_32BYTE)) || (UseEvexEncoding() && (size <= EA_64BYTE));
+            if (UseVEXEncoding())
+            {
+                if (UseEvexEncoding())
+                {
+                    hasSideEffect = (size != EA_64BYTE);
+                }
+                else
+                {
+                    hasSideEffect = (size != EA_32BYTE);
+                }
+            }
+            else
+            {
+                hasSideEffect = false;
+            }
             break;
         }
 
@@ -5979,7 +5993,8 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size)
         {
            // These EVEX instructions merge/mask based on the k-register.
            // TODO-XArch-AVX512 : Handle merge/mask scenarios once k-mask support is added for these.
-            hasSideEffect = UseEvexEncoding() && (size < EA_64BYTE);
+            assert(UseEvexEncoding());
+            hasSideEffect = (size != EA_64BYTE);
             break;
         }
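
Note on the rule patch 3 converges on for HasSideEffect(): under the legacy SSE encoding a same-register SIMD move writes nothing beyond its operand, while a VEX- or EVEX-encoded move implicitly zeroes everything above the operand (up to bit 255 and bit 511, respectively), so only a full-width move is free of side effects and safe to elide. The sketch below is a minimal standalone restatement of that rule, not the JIT's actual code; the function name and the bool parameters are hypothetical stand-ins for the emitter's UseVEXEncoding()/UseEvexEncoding() queries.

#include <cstdio>

// A same-register SIMD mov can only be elided as redundant when its encoding
// does not implicitly zero the bits above the operand.
static bool SimdMovHasSideEffect(bool useVex, bool useEvex, unsigned sizeInBytes)
{
    if (!useVex)
    {
        return false; // legacy SSE encodings leave the upper bits untouched
    }
    if (useEvex)
    {
        return sizeInBytes != 64; // only a full 512-bit move writes every bit
    }
    return sizeInBytes != 32; // only a full 256-bit move writes every bit
}

int main()
{
    // Patch 1's combined expression wrongly reported a side effect for a
    // full-width 32-byte VEX move; the corrected rule from patch 3 does not.
    printf("%d\n", (int)SimdMovHasSideEffect(true, false, 32)); // prints 0
    printf("%d\n", (int)SimdMovHasSideEffect(true, true, 32));  // prints 1
    return 0;
}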
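
The per-lane-type columns in the new LoadAlignedVector512/StoreAligned rows pick a different aligned move per element type: movdqa32 for 8/16/32-bit integer lanes, movdqa64 for 64-bit integer lanes, and movaps/movapd for float/double. The EVEX-specific movdqa32/movdqa64 forms exist alongside plain movdqa because EVEX masking is per element, so the instruction itself has to carry an element width even for a full-register move. A compact way to read those rows follows; this is a hypothetical helper for illustration only, since the JIT drives the choice from the intrinsic table, not a switch.

// Maps a Vector512 lane type to the aligned move the tables above select.
enum class Lane { I8, U8, I16, U16, I32, U32, I64, U64, F32, F64 };

static const char* AlignedMov512(Lane lane)
{
    switch (lane)
    {
        case Lane::F32: return "movaps";   // float column
        case Lane::F64: return "movapd";   // double column
        case Lane::I64:
        case Lane::U64: return "movdqa64"; // long/ulong columns
        default:        return "movdqa32"; // all 8/16/32-bit integer columns
    }
}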
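
Relatedly, the emitxarch.h hunks in patch 1 sort the new moves into the two IsWEvexOpcodeExtension buckets: movdqa64, movdqu16, and movdqu64 join the EVEX.W=1 list, while movdqa32, movdqu8, and movdqu32 join the EVEX.W=0 list. The element width is not in the opcode bytes themselves (the INST3 rows show all of these sharing the 0x6F/0x7F encodings) but in the prefix bits, W included. A condensed view, again with a hypothetical helper name:

#include <cstring>

// EVEX.W=1 marks the 16/64-bit-element forms of these moves;
// the 8/32-bit-element forms use EVEX.W=0.
static bool IsWEvexSet(const char* ins)
{
    static const char* const w1[] = {"movdqa64", "movdqu16", "movdqu64"};
    for (const char* name : w1)
    {
        if (strcmp(ins, name) == 0)
        {
            return true; // EVEX.W = 1
        }
    }
    return false; // movdqa32, movdqu8, movdqu32: EVEX.W = 0
}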