From f61cbd2070fa95d45dcde4cf598535ac8e0e601f Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 18 Jan 2023 07:48:54 -0800 Subject: [PATCH 01/12] Add `TYP_MASK` and `Vector512.ExtractMostSignificantBits`. --- src/coreclr/jit/emitxarch.cpp | 107 ++++++++++++++++++-- src/coreclr/jit/emitxarch.h | 7 ++ src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 43 ++++++++ src/coreclr/jit/hwintrinsiclistxarch.h | 4 + src/coreclr/jit/hwintrinsicxarch.cpp | 16 +++ src/coreclr/jit/instr.cpp | 3 +- src/coreclr/jit/instrsxarch.h | 17 ++++ src/coreclr/jit/lsra.cpp | 6 ++ src/coreclr/jit/lsra.h | 11 ++ src/coreclr/jit/lsrabuild.cpp | 11 ++ src/coreclr/jit/lsraxarch.cpp | 10 ++ src/coreclr/jit/register.h | 26 ++++- src/coreclr/jit/target.h | 5 + src/coreclr/jit/targetamd64.h | 5 + src/coreclr/jit/targetx86.h | 6 ++ src/coreclr/jit/typelist.h | 3 +- src/coreclr/jit/vartype.h | 12 ++- src/coreclr/vm/threadsuspend.cpp | 2 +- 18 files changed, 279 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 313ee56b5524f..ab7cc3051a2f7 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -34,8 +34,15 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } +bool emitter::IsKInstruction(instruction ins) +{ + return (ins >= INS_FIRST_K_INSTRUCTION) && (ins <= INS_LAST_K_INSTRUCTION); +} + //------------------------------------------------------------------------ -// IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse instruction. +// IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse or K (opmask) instruction. +// Technically, K instructions would be considered under the VEX encoding umbrella, but due to +// the instruction table encoding had to be pulled out with the rest of the `INST5` definitions. // // Arguments: // ins - The instruction to check. 
@@ -46,7 +53,7 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) bool emitter::IsAvx512OrPriorInstruction(instruction ins) { // TODO-XArch-AVX512: Fix check once AVX512 instructions are added. - return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION); + return ((ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION)) || IsKInstruction(ins); } bool emitter::IsAVXOnlyInstruction(instruction ins) @@ -154,7 +161,7 @@ regNumber emitter::getSseShiftRegNumber(instruction ins) bool emitter::IsVexEncodedInstruction(instruction ins) const { - return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); + return UseVEXEncoding() && (IsSSEOrAVXInstruction(ins) || IsKInstruction(ins)); } //------------------------------------------------------------------------ @@ -263,6 +270,11 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2. case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2. 
+ case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: + // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, // movdqu16 etc) // For implementation speed, I have set it up so the standing instruction will default to the 32-bit operand @@ -1248,6 +1260,8 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) case INS_vpgatherqq: case INS_vgatherdpd: case INS_vgatherqpd: + case INS_vpmovw2m: + case INS_vpmovq2m: return true; default: break; @@ -1307,7 +1321,7 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) // so we never need it if ((ins != INS_push) && (ins != INS_pop) && (ins != INS_movq) && (ins != INS_movzx) && (ins != INS_push_hide) && (ins != INS_pop_hide) && (ins != INS_ret) && (ins != INS_call) && (ins != INS_tail_i_jmp) && - !((ins >= INS_i_jmp) && (ins <= INS_l_jg))) + !((ins >= INS_i_jmp) && (ins <= INS_l_jg)) && (ins != INS_kmovb) && (ins != INS_kmovw) && (ins != INS_kmovd)) { return true; } @@ -3477,7 +3491,16 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) // If Byte 4 (which is 0xFF00) is zero, that's where the RM encoding goes. // Otherwise, it will be placed after the 4 byte encoding, making the total 5 bytes. // This would probably be better expressed as a different format or something? 
- code_t code = insCodeRM(ins); + code_t code; + if (IsKInstruction(ins)) + { + code = insCodeRR(ins); + code = AddVexPrefix(ins, code, EA_SIZE(id->idOpSize())); + } + else + { + code = insCodeRM(ins); + } UNATIVE_OFFSET sz = emitGetAdjustedSize(id, code); @@ -5850,6 +5873,10 @@ bool emitter::IsMovInstruction(instruction ins) case INS_movupd: case INS_movups: case INS_movzx: + case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: { return true; } @@ -5971,6 +5998,15 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) } #endif // TARGET_AMD64 + case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: + { + hasSideEffect = true; + break; + } + default: { unreached(); @@ -6182,6 +6218,12 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN } #endif // TARGET_AMD64 + case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: + break; + default: { unreached(); @@ -9578,6 +9620,11 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) #ifdef TARGET_AMD64 char suffix = '\0'; + if (isMaskReg(reg)) + { + return emitKregName(reg); + } + switch (EA_SIZE(attr)) { case EA_64BYTE: @@ -9798,6 +9845,24 @@ const char* emitter::emitZMMregName(unsigned reg) return regNames[reg]; } +/***************************************************************************** + * + * Return a string that represents the given K register. + */ + +const char* emitter::emitKregName(unsigned reg) +{ + static const char* const regNames[] = { +#define REGDEF(name, rnum, mask, sname) sname, +#include "register.h" + }; + + assert(reg < REG_COUNT); + assert(reg < ArrLen(regNames)); + + return regNames[reg]; +} + /***************************************************************************** * * Display a static data member reference. 
@@ -13802,7 +13867,16 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert((ins != INS_movd) || (isFloatReg(reg1) != isFloatReg(reg2))); - if ((ins != INS_movd) || isFloatReg(reg1)) + if (IsKInstruction(ins)) + { + code = insCodeRR(ins); + if (isGeneralRegister(reg1)) + { + // kmov r, k form, flip last byte of opcode from 0x92 to 0x93 + code |= 0x01; + } + } + else if ((ins != INS_movd) || isFloatReg(reg1)) { code = insCodeRM(ins); } @@ -18103,6 +18177,27 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } #endif + + case INS_vpmovb2m: + case INS_vpmovw2m: + case INS_vpmovd2m: + case INS_vpmovq2m: + { + result.insLatency += PERFSCORE_LATENCY_1C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + } + + case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: + { + result.insLatency += PERFSCORE_LATENCY_3C; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + break; + } + default: // unhandled instruction insFmt combination perfScoreUnhandledInstruction(id, &result); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index a081a162d3af6..c9b7fa138263f 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -23,6 +23,11 @@ inline static bool isDoubleReg(regNumber reg) return isFloatReg(reg); } +inline static bool isMaskReg(regNumber reg) +{ + return (reg >= REG_MASK_FIRST && reg <= REG_MASK_LAST); +} + /************************************************************************/ /* Routines that compute the size of / encode instructions */ /************************************************************************/ @@ -96,6 +101,7 @@ static bool IsAvx512OnlyInstruction(instruction ins); static bool IsFMAInstruction(instruction ins); static bool IsAVXVNNIInstruction(instruction ins); static bool IsBMIInstruction(instruction ins); +static bool IsKInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber 
getSseShiftRegNumber(instruction ins); @@ -670,6 +676,7 @@ void emitDispShift(instruction ins, int cnt = 0); const char* emitXMMregName(unsigned reg); const char* emitYMMregName(unsigned reg); const char* emitZMMregName(unsigned reg); +const char* emitKregName(unsigned reg); /************************************************************************/ /* Private members that deal with target-dependent instr. descriptors */ diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 274821ef81b55..a601248c30933 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1671,6 +1671,49 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_MoveMaskSpecial: + { + op1Reg = op1->GetRegNum(); + regNumber maskReg = node->ExtractTempReg(RBM_ALLMASK); + + instruction maskIns; + instruction kmovIns; + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + maskIns = INS_vpmovb2m; + kmovIns = INS_kmovq; + break; + case TYP_SHORT: + case TYP_USHORT: + maskIns = INS_vpmovw2m; + kmovIns = INS_kmovd; + break; + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + maskIns = INS_vpmovd2m; + kmovIns = INS_kmovw; + break; + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + maskIns = INS_vpmovq2m; + kmovIns = INS_kmovb; + break; + default: + unreached(); + } + + assert(emitter::isMaskReg(maskReg)); + + emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg); + emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE); + break; + } + default: unreached(); break; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 894663a1575bc..3410211079745 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -239,6 +239,8 @@ HARDWARE_INTRINSIC(Vector256, Xor, HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} @@ -871,6 +873,8 @@ HARDWARE_INTRINSIC(SSE2, UCOMISD, HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX512F, MoveMaskSpecial, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) + #endif // FEATURE_HW_INTRINSIC #undef HARDWARE_INTRINSIC diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 3b46d9ba9c7a0..b0891a34fe6a5 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1249,6 +1249,22 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector512_ExtractMostSignificantBits: + { + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + compOpportunisticallyDependsOn(InstructionSet_AVX512BW) && + compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) + { + var_types simdType = getSIMDTypeForSize(simdSize); + + op1 = impSIMDPopStack(simdType); + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AVX512F_MoveMaskSpec, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + } + break; + } + case NI_Vector128_ExtractMostSignificantBits: case NI_Vector256_ExtractMostSignificantBits: { diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 5b21f54e3c8e6..ee60a7ce1e5bc 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -101,7 +101,8 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) static char buf[4][TEMP_BUFFER_LEN]; const char* retbuf; - if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins)) + if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) && + !GetEmitter()->IsKInstruction(ins)) { sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName); retbuf = buf[curBuf]; diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index a9825a20c30c3..1b484cf27d583 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -67,6 +67,18 @@ INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, // See comment around quarter way through this 
file for more information. INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, INS_FLAGS_None ) +INST5(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) + +// id nm um mr mi rm a4 rr tt flags +// TODO-XARCH-AVX512 add the proper W bit switch +INST5(kmovb, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST5(kmovw, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK2(0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST5(kmovd, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST5(kmovq, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) + +INST5(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) + + // id nm um mr mi rm a4 tt flags INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) @@ -648,6 +660,11 @@ INST3(vinsertf32x8, "insertf32x8", IUM_WR, BAD_CODE, BAD_CODE, INST3(vinserti32x8, "inserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values INST3(LAST_AVX512DQ_INSTRUCTION, "LAST_AVX512DQ_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(vpmovb2m, "vpmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_8Bit) +INST3(vpmovw2m, "vpmovw2m", IUM_WR, BAD_CODE, 
BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_16Bit) +INST3(vpmovd2m, "vpmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_32Bit) +INST3(vpmovq2m, "vpmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_64Bit) + INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Scalar instructions in SSE4.2 diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index c355d2af16dbe..10356c4ca3c27 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -698,6 +698,7 @@ LinearScan::LinearScan(Compiler* theCompiler) availableFloatRegs = RBM_ALLFLOAT; availableDoubleRegs = RBM_ALLDOUBLE; + availableMaskRegs = RBM_NONE; #if defined(TARGET_AMD64) || defined(TARGET_ARM64) if (compiler->opts.compDbgEnC) @@ -718,6 +719,7 @@ LinearScan::LinearScan(Compiler* theCompiler) { availableFloatRegs |= RBM_HIGHFLOAT; availableDoubleRegs |= RBM_HIGHFLOAT; + availableMaskRegs |= RBM_K1; } #endif @@ -737,6 +739,10 @@ LinearScan::LinearScan(Compiler* theCompiler) { availableRegs[i] = &availableDoubleRegs; } + else if (thisType == TYP_MASK) + { + availableRegs[i] = &availableMaskRegs; + } #endif // FEATURE_SIMD else { diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index d28a8d521d632..633edddbdda8a 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -35,6 +35,7 @@ const unsigned int RegisterTypeCount = 2; typedef var_types RegisterType; #define IntRegisterType TYP_INT #define FloatRegisterType TYP_FLOAT +#define MaskRegisterType TYP_MASK //------------------------------------------------------------------------ // regType: Return the RegisterType to use for a given type @@ -486,6 +487,12 @@ class RegRecord : public Referenceable { registerType = FloatRegisterType; } +#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) + else if (emitter::isMaskReg(reg)) + { + registerType = MaskRegisterType; 
+ } +#endif else { // The constructor defaults to IntRegisterType @@ -1090,6 +1097,9 @@ class LinearScan : public LinearScanInterface RefPosition* defineNewInternalTemp(GenTree* tree, RegisterType regType, regMaskTP candidates); RefPosition* buildInternalIntRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); RefPosition* buildInternalFloatRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); +#if defined(FEATURE_SIMD) + RefPosition* buildInternalMaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); +#endif void buildInternalRegisterUses(); void writeLocalReg(GenTreeLclVar* lclNode, unsigned varNum, regNumber reg); @@ -1598,6 +1608,7 @@ class LinearScan : public LinearScanInterface PhasedVar availableIntRegs; PhasedVar availableFloatRegs; PhasedVar availableDoubleRegs; + PhasedVar availableMaskRegs; PhasedVar* availableRegs[TYP_COUNT]; // Register mask of argument registers currently occupied because we saw a diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 14f142d7908fa..418deefa1705c 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1393,6 +1393,17 @@ RefPosition* LinearScan::buildInternalFloatRegisterDefForNode(GenTree* tree, reg return defRefPosition; } +#if defined(FEATURE_SIMD) +RefPosition* LinearScan::buildInternalMaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands) +{ + // The candidate set should contain only float registers. + assert((internalCands & ~availableMaskRegs) == RBM_NONE); + + RefPosition* defRefPosition = defineNewInternalTemp(tree, MaskRegisterType, internalCands); + return defRefPosition; +} +#endif + //------------------------------------------------------------------------ // buildInternalRegisterUses - adds use positions for internal // registers required for tree node. 
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 9a8b8539aa278..f4aa43ee68b88 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2490,6 +2490,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou break; } + case NI_AVX512F_MoveMaskSpecial: + { + srcCount += BuildOperandUses(op1); + buildInternalMaskRegisterDefForNode(intrinsicTree); + setInternalRegsDelayFree = true; + + buildUses = false; + break; + } + default: { assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END)); diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index ca90673e85adf..9ec15818da56c 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -69,9 +69,18 @@ REGALIAS(EDI, RDI) #ifdef TARGET_AMD64 #define XMMBASE 16 #define XMMMASK(x) ((__int64)(1) << ((x)+XMMBASE)) + +#define KBASE 48 +#define KMASK(x) ((__int64)(1) << ((x)+KBASE)) + #else // !TARGET_AMD64 #define XMMBASE 8 #define XMMMASK(x) ((__int32)(1) << ((x)+XMMBASE)) + +#define KBASE 16 +#define KMASK(x) ((__int32)(1) << ((x)+KBASE)) + + #endif // !TARGET_AMD64 REGDEF(XMM0, 0+XMMBASE, XMMMASK(0), "mm0" ) @@ -83,9 +92,7 @@ REGDEF(XMM5, 5+XMMBASE, XMMMASK(5), "mm5" ) REGDEF(XMM6, 6+XMMBASE, XMMMASK(6), "mm6" ) REGDEF(XMM7, 7+XMMBASE, XMMMASK(7), "mm7" ) -#ifdef TARGET_X86 -REGDEF(STK, 8+XMMBASE, 0x0000, "STK" ) -#else // !TARGET_X86 +#ifdef TARGET_AMD64 REGDEF(XMM8, 8+XMMBASE, XMMMASK(8), "mm8" ) REGDEF(XMM9, 9+XMMBASE, XMMMASK(9), "mm9" ) REGDEF(XMM10, 10+XMMBASE, XMMMASK(10), "mm10" ) @@ -113,9 +120,18 @@ REGDEF(XMM29, 29+XMMBASE, XMMMASK(29), "mm29" ) REGDEF(XMM30, 30+XMMBASE, XMMMASK(30), "mm30" ) REGDEF(XMM31, 31+XMMBASE, XMMMASK(31), "mm31" ) -REGDEF(STK, 32+XMMBASE, 0x0000, "STK" ) +#endif // !TARGET_AMD64 + +REGDEF(K0, 0+KBASE, KMASK(0), "k0" ) +REGDEF(K1, 1+KBASE, KMASK(1), "k1" ) +REGDEF(K2, 2+KBASE, KMASK(2), "k2" ) +REGDEF(K3, 3+KBASE, KMASK(3), "k3" ) +REGDEF(K4, 
4+KBASE, KMASK(4), "k4" ) +REGDEF(K5, 5+KBASE, KMASK(5), "k5" ) +REGDEF(K6, 6+KBASE, KMASK(6), "k6" ) +REGDEF(K7, 7+KBASE, KMASK(7), "k7" ) -#endif // !TARGET_X86 +REGDEF(STK, 8+KBASE, 0x0000, "STK" ) #elif defined(TARGET_ARM) #include "registerarm.h" diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 8baf645453adf..a838814164f88 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -682,6 +682,11 @@ inline bool isFloatRegType(var_types type) return varTypeUsesFloatReg(type); } +inline bool isMaskReg(var_types type) +{ + return varTypeIsMask(type); +} + // If the WINDOWS_AMD64_ABI is defined make sure that TARGET_AMD64 is also defined. #if defined(WINDOWS_AMD64_ABI) #if !defined(TARGET_AMD64) diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index ac3f0ca7e8c02..b50fab0ba2f3b 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -91,6 +91,11 @@ #define REG_FP_LAST REG_XMM31 #define FIRST_FP_ARGREG REG_XMM0 + #define REG_MASK_FIRST REG_K0 + #define REG_MASK_LAST REG_K7 + + #define RBM_ALLMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7) + #ifdef UNIX_AMD64_ABI #define LAST_FP_ARGREG REG_XMM7 #else // !UNIX_AMD64_ABI diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h index dffd6adf2efb0..f716327298123 100644 --- a/src/coreclr/jit/targetx86.h +++ b/src/coreclr/jit/targetx86.h @@ -74,6 +74,10 @@ #define REG_FP_FIRST REG_XMM0 #define REG_FP_LAST REG_XMM7 + + #define REG_MASK_FIRST REG_K0 + #define REG_MASK_LAST REG_K7 + #define FIRST_FP_ARGREG REG_XMM0 #define LAST_FP_ARGREG REG_XMM3 #define REG_FLTARG_0 REG_XMM0 @@ -91,6 +95,8 @@ #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7) #define RBM_ALLDOUBLE RBM_ALLFLOAT + #define RBM_ALLMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7) + // TODO-CQ: Currently we are following the x86 ABI for SSE2 registers.
// This should be reconsidered. #define RBM_FLT_CALLEE_SAVED RBM_NONE diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index b9140aa601f33..89adbd3dc613c 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -64,6 +64,7 @@ DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, TI_STRUCT,16,16, 16, 4,16, VTF_S|VTF_ DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, TI_STRUCT,32,32, 32, 8,16, VTF_S|VTF_VEC) DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, TI_STRUCT,64,64, 64, 16,16, VTF_S|VTF_VEC) #endif // TARGET_XARCH +DEF_TP(MASK ,"mask" , TYP_MASK, TI_STRUCT,8, 8, 8, 2,8, VTF_S) #endif // FEATURE_SIMD DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, TI_ERROR, 0, 0, 0, 0, 0, VTF_ANY) @@ -75,4 +76,4 @@ DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, TI_ERROR, 0, 0, 0, 0, 0, VTF_ANY) #undef PS #undef PST #undef VTF_I32 -#undef VTF_I64 +#undef VTF_I64 \ No newline at end of file diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h index a51dbff3b82ac..1764739e20aa8 100644 --- a/src/coreclr/jit/vartype.h +++ b/src/coreclr/jit/vartype.h @@ -70,7 +70,17 @@ inline bool varTypeIsSIMD(T vt) #else // Always return false if FEATURE_SIMD is not enabled return false; -#endif // !FEATURE_SIMD +#endif +} + +template +inline bool varTypeIsMask(T vt) +{ +#ifdef FEATURE_SIMD + return (TypeGet(vt) == TYP_MASK); +#else // FEATURE_SIMD + return false; +#endif // !FEATURE_ } template diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 89f2f9d33f7e2..31e381e18472e 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -3050,7 +3050,7 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT //////////////////////////////////////////////////// // Now redirect the thread to the helper function - + SetIP(pCurrentThreadCtx, (PCODE)pTgt); #ifdef TARGET_ARM From ff0ac7562ecb6cb4ef0e16c75b57f7dbf1e96822 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Mon, 27 Feb 2023 13:07:15 -0800 Subject: 
[PATCH 02/12] Rebase / rename error fix. --- src/coreclr/jit/hwintrinsicxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index b0891a34fe6a5..acca5e2308295 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1259,7 +1259,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(simdType); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AVX512F_MoveMaskSpec, simdBaseJitType, simdSize, + retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AVX512F_MoveMaskSpecial, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); } break; From e663eb4511b469e06ca7d153f2151d65b05de3c0 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 1 Mar 2023 20:26:43 -0800 Subject: [PATCH 03/12] Review edits. --- src/coreclr/jit/lsra.cpp | 7 +++++-- src/coreclr/jit/lsrabuild.cpp | 5 ++--- src/coreclr/jit/target.h | 2 ++ src/coreclr/jit/vartype.h | 2 ++ 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 10356c4ca3c27..73c1b5bdc1bdd 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -698,7 +698,9 @@ LinearScan::LinearScan(Compiler* theCompiler) availableFloatRegs = RBM_ALLFLOAT; availableDoubleRegs = RBM_ALLDOUBLE; - availableMaskRegs = RBM_NONE; +#if defined(TARGET_XARCH) + availableMaskRegs = RBM_K1; +#endif #if defined(TARGET_AMD64) || defined(TARGET_ARM64) if (compiler->opts.compDbgEnC) @@ -719,7 +721,6 @@ LinearScan::LinearScan(Compiler* theCompiler) { availableFloatRegs |= RBM_HIGHFLOAT; availableDoubleRegs |= RBM_HIGHFLOAT; - availableMaskRegs |= RBM_K1; } #endif @@ -739,10 +740,12 @@ LinearScan::LinearScan(Compiler* theCompiler) { availableRegs[i] = &availableDoubleRegs; } +#ifdef TARGET_XARCH else if (thisType == TYP_MASK) { availableRegs[i] = &availableMaskRegs; } +#endif // TARGET_XARCH #endif // 
FEATURE_SIMD else { diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 418deefa1705c..18e03679d9cf2 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1393,14 +1393,13 @@ RefPosition* LinearScan::buildInternalFloatRegisterDefForNode(GenTree* tree, reg return defRefPosition; } -#if defined(FEATURE_SIMD) +#if defined(FEATURE_SIMD) && defined(TARGET_XARCH) RefPosition* LinearScan::buildInternalMaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands) { // The candidate set should contain only float registers. assert((internalCands & ~availableMaskRegs) == RBM_NONE); - RefPosition* defRefPosition = defineNewInternalTemp(tree, MaskRegisterType, internalCands); - return defRefPosition; + return defineNewInternalTemp(tree, MaskRegisterType, internalCands); } #endif diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index a838814164f88..a912d6f669755 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -682,10 +682,12 @@ inline bool isFloatRegType(var_types type) return varTypeUsesFloatReg(type); } +#if defined(TARGET_XARCH) inline bool isMaskReg(var_types type) { return varTypeIsMask(type); } +#endif // If the WINDOWS_AMD64_ABI is defined make sure that TARGET_AMD64 is also defined. #if defined(WINDOWS_AMD64_ABI) diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h index 1764739e20aa8..d3ace4c6bf0a1 100644 --- a/src/coreclr/jit/vartype.h +++ b/src/coreclr/jit/vartype.h @@ -73,6 +73,7 @@ inline bool varTypeIsSIMD(T vt) #endif } +#ifdef TARGET_XARCH template inline bool varTypeIsMask(T vt) { @@ -82,6 +83,7 @@ inline bool varTypeIsMask(T vt) return false; #endif // !FEATURE_ } +#endif template inline bool varTypeIsIntegral(T vt) From dcbc6a978db839489e590ef308f419ca3d189c6f Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Thu, 2 Mar 2023 06:41:25 -0800 Subject: [PATCH 04/12] Formatting. 
--- src/coreclr/jit/lsra.cpp | 2 +- src/coreclr/jit/lsrabuild.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 73c1b5bdc1bdd..0859544c3c3dd 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -699,7 +699,7 @@ LinearScan::LinearScan(Compiler* theCompiler) availableFloatRegs = RBM_ALLFLOAT; availableDoubleRegs = RBM_ALLDOUBLE; #if defined(TARGET_XARCH) - availableMaskRegs = RBM_K1; + availableMaskRegs = RBM_K1; #endif #if defined(TARGET_AMD64) || defined(TARGET_ARM64) diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 18e03679d9cf2..a6fbce9b40cb4 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1393,7 +1393,7 @@ RefPosition* LinearScan::buildInternalFloatRegisterDefForNode(GenTree* tree, reg return defRefPosition; } -#if defined(FEATURE_SIMD) && defined(TARGET_XARCH) +#if defined(FEATURE_SIMD) && defined(TARGET_XARCH) RefPosition* LinearScan::buildInternalMaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands) { // The candidate set should contain only float registers. From 40a46ebd29d01bd961e9e04dc143bce59d9e008c Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Mon, 6 Mar 2023 09:20:41 -0800 Subject: [PATCH 05/12] Review edits. 
--- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index a601248c30933..19979720fc154 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1707,7 +1707,7 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) unreached(); } - assert(emitter::isMaskReg(maskReg)); + assert(emitter::isMaskReg(maskReg) && maskReg == REG_K1); emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg); emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE); From 2468b10ef9ea001dba57bcdf9ff2b881e5c5c353 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Tue, 7 Mar 2023 05:18:27 -0800 Subject: [PATCH 06/12] Review cleanup. --- src/coreclr/jit/emitxarch.cpp | 20 +------------------- src/coreclr/jit/emitxarch.h | 1 - src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 2 ++ src/coreclr/jit/lsra.h | 4 +++- src/coreclr/jit/register.h | 2 +- src/coreclr/jit/target.h | 2 -- src/coreclr/jit/typelist.h | 2 +- src/coreclr/jit/vartype.h | 6 ++---- src/coreclr/vm/threadsuspend.cpp | 1 - 9 files changed, 10 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ab7cc3051a2f7..bdf4e3414da27 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -9622,7 +9622,7 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) if (isMaskReg(reg)) { - return emitKregName(reg); + return rn; } switch (EA_SIZE(attr)) @@ -9845,24 +9845,6 @@ const char* emitter::emitZMMregName(unsigned reg) return regNames[reg]; } -/***************************************************************************** - * - * Return a string that represents the given K register. 
- */ - -const char* emitter::emitKregName(unsigned reg) -{ - static const char* const regNames[] = { -#define REGDEF(name, rnum, mask, sname) sname, -#include "register.h" - }; - - assert(reg < REG_COUNT); - assert(reg < ArrLen(regNames)); - - return regNames[reg]; -} - /***************************************************************************** * * Display a static data member reference. diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index c9b7fa138263f..16c642b0262df 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -676,7 +676,6 @@ void emitDispShift(instruction ins, int cnt = 0); const char* emitXMMregName(unsigned reg); const char* emitYMMregName(unsigned reg); const char* emitZMMregName(unsigned reg); -const char* emitKregName(unsigned reg); /************************************************************************/ /* Private members that deal with target-dependent instr. descriptors */ diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 19979720fc154..5bacee515d947 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1707,6 +1707,8 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) unreached(); } + // TODO-XARCH-AVX512 remove REG_K1 check when all K registers possible for + // allocation. 
assert(emitter::isMaskReg(maskReg) && maskReg == REG_K1); emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg); diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 633edddbdda8a..43483b10d8471 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -487,7 +487,7 @@ class RegRecord : public Referenceable { registerType = FloatRegisterType; } -#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) +#if defined(TARGET_XARCH) else if (emitter::isMaskReg(reg)) { registerType = MaskRegisterType; @@ -1608,7 +1608,9 @@ class LinearScan : public LinearScanInterface PhasedVar availableIntRegs; PhasedVar availableFloatRegs; PhasedVar availableDoubleRegs; +#if defined(TARGET_XARCH) PhasedVar availableMaskRegs; +#endif PhasedVar* availableRegs[TYP_COUNT]; // Register mask of argument registers currently occupied because we saw a diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index 9ec15818da56c..7f2a0d47570f9 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -131,7 +131,7 @@ REGDEF(K5, 5+KBASE, KMASK(5), "k5" ) REGDEF(K6, 6+KBASE, KMASK(6), "k6" ) REGDEF(K7, 7+KBASE, KMASK(7), "k7" ) -REGDEF(STK, 8+KBASE, 0x0000, "STK" ) +REGDEF(STK, 8+KBASE, 0x0000, "STK" ) #elif defined(TARGET_ARM) #include "registerarm.h" diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index a912d6f669755..a838814164f88 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -682,12 +682,10 @@ inline bool isFloatRegType(var_types type) return varTypeUsesFloatReg(type); } -#if defined(TARGET_XARCH) inline bool isMaskReg(var_types type) { return varTypeIsMask(type); } -#endif // If the WINDOWS_AMD64_ABI is defined make sure that TARGET_AMD64 is also defined. 
#if defined(WINDOWS_AMD64_ABI) diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index 89adbd3dc613c..cad267f060b92 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -63,8 +63,8 @@ DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, TI_STRUCT,16,16, 16, 4,16, VTF_S|VTF_ #if defined(TARGET_XARCH) DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, TI_STRUCT,32,32, 32, 8,16, VTF_S|VTF_VEC) DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, TI_STRUCT,64,64, 64, 16,16, VTF_S|VTF_VEC) +DEF_TP(MASK ,"mask" , TYP_MASK, TI_STRUCT, 8, 8, 8, 2, 8, VTF_ANY) #endif // TARGET_XARCH -DEF_TP(MASK ,"mask" , TYP_MASK, TI_STRUCT,8, 8, 8, 2,8, VTF_S) #endif // FEATURE_SIMD DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, TI_ERROR, 0, 0, 0, 0, 0, VTF_ANY) diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h index d3ace4c6bf0a1..5c19c997d8a38 100644 --- a/src/coreclr/jit/vartype.h +++ b/src/coreclr/jit/vartype.h @@ -73,17 +73,15 @@ inline bool varTypeIsSIMD(T vt) #endif } -#ifdef TARGET_XARCH template inline bool varTypeIsMask(T vt) { -#ifdef FEATURE_SIMD +#if defined (TARGET_XARCH) && defined(FEATURE_SIMD) return (TypeGet(vt) == TYP_MASK); #else // FEATURE_SIMD return false; -#endif // !FEATURE_ +#endif } -#endif template inline bool varTypeIsIntegral(T vt) diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 31e381e18472e..1434e94bef4fe 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -3050,7 +3050,6 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT //////////////////////////////////////////////////// // Now redirect the thread to the helper function - SetIP(pCurrentThreadCtx, (PCODE)pTgt); #ifdef TARGET_ARM From 6340763f64a54fcb033670e8753a39d1de6c1287 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Tue, 7 Mar 2023 06:08:06 -0800 Subject: [PATCH 07/12] Build fixes. 
--- src/coreclr/jit/lsra.h | 12 ++++++------ src/coreclr/jit/vartype.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 43483b10d8471..3f337a1eece90 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -487,7 +487,7 @@ class RegRecord : public Referenceable { registerType = FloatRegisterType; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) else if (emitter::isMaskReg(reg)) { registerType = MaskRegisterType; @@ -1605,11 +1605,11 @@ class LinearScan : public LinearScanInterface // A temporary VarToRegMap used during the resolution of critical edges. VarToRegMap sharedCriticalVarToRegMap; - PhasedVar availableIntRegs; - PhasedVar availableFloatRegs; - PhasedVar availableDoubleRegs; -#if defined(TARGET_XARCH) - PhasedVar availableMaskRegs; + PhasedVar availableIntRegs; + PhasedVar availableFloatRegs; + PhasedVar availableDoubleRegs; +#if defined(TARGET_XARCH) + PhasedVar availableMaskRegs; #endif PhasedVar* availableRegs[TYP_COUNT]; diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h index 5c19c997d8a38..097e05d13f29f 100644 --- a/src/coreclr/jit/vartype.h +++ b/src/coreclr/jit/vartype.h @@ -76,11 +76,11 @@ inline bool varTypeIsSIMD(T vt) template inline bool varTypeIsMask(T vt) { -#if defined (TARGET_XARCH) && defined(FEATURE_SIMD) +#if defined(TARGET_XARCH) && defined(FEATURE_SIMD) return (TypeGet(vt) == TYP_MASK); -#else // FEATURE_SIMD +#else // FEATURE_SIMD return false; -#endif +#endif } template From 41c4b14921bb98239dc73c75338cd0cf62850b11 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Tue, 7 Mar 2023 21:22:35 -0800 Subject: [PATCH 08/12] Address throughput issues pertaining to `availableRegCount`. 
--- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 3 +++ src/coreclr/jit/hwintrinsicxarch.cpp | 4 +--- src/coreclr/jit/lsra.cpp | 19 +++++++++++++++-- src/coreclr/jit/lsra.h | 23 +++++++++++++-------- src/coreclr/jit/target.h | 2 +- src/coreclr/jit/targetamd64.h | 4 +++- src/coreclr/jit/targetx86.h | 3 ++- 7 files changed, 41 insertions(+), 17 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 5bacee515d947..3b176bc81f594 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1679,6 +1679,9 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) instruction maskIns; instruction kmovIns; + // TODO-XARCH-AVX512 note that this type/kmov combination assumes 512-bit vector types but would change + // if used for other vector lengths, i.e., TYPE_BYTE requires kmovq for for 512-bit vector, but kmovd + // for 256-bit vector. switch (baseType) { case TYP_BYTE: diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index acca5e2308295..a06910cffffa0 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1251,9 +1251,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector512_ExtractMostSignificantBits: { - if (compOpportunisticallyDependsOn(InstructionSet_AVX512F) && - compOpportunisticallyDependsOn(InstructionSet_AVX512BW) && - compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) + if (IsBaselineVector512IsaSupported()) { var_types simdType = getSIMDTypeForSize(simdSize); diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 0859544c3c3dd..222bb7d398c7e 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -636,10 +636,12 @@ LinearScan::LinearScan(Compiler* theCompiler) , refPositions(theCompiler->getAllocator(CMK_LSRA_RefPosition)) , listNodePool(theCompiler) { +#if defined(TARGET_XARCH) + 
availableRegCount = ACTUAL_REG_COUNT; + #if defined(TARGET_AMD64) rbmAllFloat = compiler->rbmAllFloat; rbmFltCalleeTrash = compiler->rbmFltCalleeTrash; - availableRegCount = ACTUAL_REG_COUNT; if (!compiler->DoJitStressEvexEncoding()) { @@ -647,6 +649,16 @@ LinearScan::LinearScan(Compiler* theCompiler) } #endif // TARGET_AMD64 +#if defined(TARGET_XARCH) + if (!compiler->DoJitStressEvexEncoding()) + { + availableRegCount -= CNT_MASK_REGS; + } +#endif + +#endif // TARGET_XARCH + + regSelector = new (theCompiler, CMK_LSRA) RegisterSelection(this); firstColdLoc = MaxLocation; @@ -699,7 +711,10 @@ LinearScan::LinearScan(Compiler* theCompiler) availableFloatRegs = RBM_ALLFLOAT; availableDoubleRegs = RBM_ALLDOUBLE; #if defined(TARGET_XARCH) - availableMaskRegs = RBM_K1; + if (compiler->DoJitStressEvexEncoding()) + { + availableMaskRegs = RBM_ALLMASK; + } #endif #if defined(TARGET_AMD64) || defined(TARGET_ARM64) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 3f337a1eece90..92ed3b73717e9 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -483,21 +483,21 @@ class RegRecord : public Referenceable } else #endif - if (emitter::isFloatReg(reg)) + if (emitter::isGeneralRegister(reg)) + { + assert(registerType == IntRegisterType); + } + else if (emitter::isFloatReg(reg)) { registerType = FloatRegisterType; } #if defined(TARGET_XARCH) && defined(FEATURE_SIMD) - else if (emitter::isMaskReg(reg)) + else { + assert(emitter::isMaskReg(reg)); registerType = MaskRegisterType; } #endif - else - { - // The constructor defaults to IntRegisterType - assert(emitter::isGeneralRegister(reg) && registerType == IntRegisterType); - } regNum = reg; isCalleeSave = ((RBM_CALLEE_SAVED & genRegMask(reg)) != 0); } @@ -1951,10 +1951,11 @@ class LinearScan : public LinearScanInterface #endif // FEATURE_ARG_SPLIT int BuildLclHeap(GenTree* tree); +#if defined(TARGET_XARCH) + #if defined(TARGET_AMD64) regMaskTP rbmAllFloat; regMaskTP rbmFltCalleeTrash; - unsigned 
availableRegCount; regMaskTP get_RBM_ALLFLOAT() const { @@ -1964,11 +1965,15 @@ class LinearScan : public LinearScanInterface { return this->rbmFltCalleeTrash; } +#endif // TARGET_AMD64 + + unsigned availableRegCount; + unsigned get_AVAILABLE_REG_COUNT() const { return this->availableRegCount; } -#endif // TARGET_AMD64 +#endif // TARGET_XARCH //------------------------------------------------------------------------ // calleeSaveRegs: Get the set of callee-save registers of the given RegisterType diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index a838814164f88..22e3c106a4ae2 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -190,7 +190,7 @@ enum _regMask_enum : unsigned #error Unsupported target architecture #endif -#if defined(TARGET_AMD64) +#if defined(TARGET_XARCH) // AVAILABLE_REG_COUNT is defined to be dynamic, based on whether AVX-512 high registers are available. #define AVAILABLE_REG_COUNT get_AVAILABLE_REG_COUNT() #else diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index b50fab0ba2f3b..6c56df1106923 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -94,7 +94,9 @@ #define REG_MASK_FIRST REG_K0 #define REG_MASK_LAST REG_K7 - #define RBM_ALLMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7) + #define RBM_ALLMASK RBM_K1 + + #define CNT_MASK_REGS 8 #ifdef UNIX_AMD64_ABI #define LAST_FP_ARGREG REG_XMM7 diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h index f716327298123..4c2d41ef9ae66 100644 --- a/src/coreclr/jit/targetx86.h +++ b/src/coreclr/jit/targetx86.h @@ -77,6 +77,7 @@ #define REG_MASK_FIRST REG_K0 #define REG_MASK_LAST REG_K7 + #define CNT_MASK_REGS 8 #define FIRST_FP_ARGREG REG_XMM0 #define LAST_FP_ARGREG REG_XMM3 @@ -95,7 +96,7 @@ #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7) #define RBM_ALLDOUBLE RBM_ALLFLOAT - #define RBM_ALLMASK (REG_K1 | REG_K2 | 
REG_K3 | REG_K4 | REG_K5 | REG_K6 | REG_K7) + #define RBM_ALLMASK REG_K1 // TODO-CQ: Currently we are following the x86 ABI for SSE2 registers. // This should be reconsidered. From 3fb7f783f301e49ea17ef191ba46a786720b0ef2 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Thu, 9 Mar 2023 15:38:27 -0800 Subject: [PATCH 09/12] kmov RR refactor. --- src/coreclr/jit/emit.cpp | 3 ++- src/coreclr/jit/emitxarch.cpp | 37 ++++++++++++++++++++++------------- src/coreclr/jit/instrsxarch.h | 24 +++++++++++------------ src/coreclr/jit/lsra.cpp | 15 ++------------ src/coreclr/jit/lsra.h | 4 ++-- src/coreclr/jit/targetx86.h | 2 ++ 6 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index c28eebc6bbf4d..a20026c803a83 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -8294,12 +8294,13 @@ void emitter::emitDispDataSec(dataSecDsc* section, BYTE* dst) i += j; break; + case 64: case 32: case 16: case 8: assert((data->dsSize % 8) == 0); printf("\tdq\t%016llXh", *reinterpret_cast(&data->dsCont[i])); - for (j = 8; j < 32; j += 8) + for (j = 8; j < 64; j += 8) { if (i + j >= data->dsSize) break; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index bdf4e3414da27..d0a1a542a1bcb 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -53,7 +53,7 @@ bool emitter::IsKInstruction(instruction ins) bool emitter::IsAvx512OrPriorInstruction(instruction ins) { // TODO-XArch-AVX512: Fix check once AVX512 instructions are added. 
- return ((ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION)) || IsKInstruction(ins); + return ((ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION)); } bool emitter::IsAVXOnlyInstruction(instruction ins) @@ -161,7 +161,7 @@ regNumber emitter::getSseShiftRegNumber(instruction ins) bool emitter::IsVexEncodedInstruction(instruction ins) const { - return UseVEXEncoding() && (IsSSEOrAVXInstruction(ins) || IsKInstruction(ins)); + return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); } //------------------------------------------------------------------------ @@ -1308,6 +1308,7 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) case INS_shlx: case INS_sarx: case INS_shrx: + case INS_kmovq: // kmovq always takes W1 bit, regardless of form. return true; default: return false; @@ -1321,7 +1322,7 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) // so we never need it if ((ins != INS_push) && (ins != INS_pop) && (ins != INS_movq) && (ins != INS_movzx) && (ins != INS_push_hide) && (ins != INS_pop_hide) && (ins != INS_ret) && (ins != INS_call) && (ins != INS_tail_i_jmp) && - !((ins >= INS_i_jmp) && (ins <= INS_l_jg)) && (ins != INS_kmovb) && (ins != INS_kmovw) && (ins != INS_kmovd)) + !((ins >= INS_i_jmp) && (ins <= INS_l_jg))) { return true; } @@ -3491,16 +3492,11 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) // If Byte 4 (which is 0xFF00) is zero, that's where the RM encoding goes. // Otherwise, it will be placed after the 4 byte encoding, making the total 5 bytes. // This would probably be better expressed as a different format or something? 
- code_t code; + code_t code = insCodeRM(ins); if (IsKInstruction(ins)) { - code = insCodeRR(ins); code = AddVexPrefix(ins, code, EA_SIZE(id->idOpSize())); } - else - { - code = insCodeRM(ins); - } UNATIVE_OFFSET sz = emitGetAdjustedSize(id, code); @@ -3514,7 +3510,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) emitAttr size = EA_SIZE(attr); if ((TakesRexWPrefix(ins, size) && ((ins != INS_xor) || (reg1 != reg2))) || IsExtendedReg(reg1, attr) || - IsExtendedReg(reg2, attr)) + IsExtendedReg(reg2, attr) || (ins == INS_kmovd && isMaskReg(reg1) && isMaskReg(reg2))) { sz += emitGetRexPrefixSize(ins); includeRexPrefixSize = false; @@ -13849,13 +13845,26 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert((ins != INS_movd) || (isFloatReg(reg1) != isFloatReg(reg2))); - if (IsKInstruction(ins)) + if (ins == INS_kmovb || ins == INS_kmovw || ins == INS_kmovd || ins == INS_kmovq) { - code = insCodeRR(ins); + assert(!(isGeneralRegister(reg1) && isGeneralRegister(reg2))); + + code = insCodeRM(ins); if (isGeneralRegister(reg1)) { - // kmov r, k form, flip last byte of opcode from 0x92 to 0x93 - code |= 0x01; + // kmov r, k form, flip last byte of opcode from 0x90 to 0x92 + code |= 0x03; + } + else if (isGeneralRegister(reg2)) + { + // kmov r, k form, flip last byte of opcode from 0x90 to 0x92 + code |= 0x02; + } + + // kmovd RR form requires W bit + if (!(code & 0x02) && ins == INS_kmovd) + { + AddRexWPrefix(id, code); } } else if ((ins != INS_movd) || isFloatReg(reg1)) diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 1b484cf27d583..4c7724854571d 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -67,18 +67,6 @@ INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, // See comment around quarter way through this file for more information. 
INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, INS_FLAGS_None ) -INST5(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) - -// id nm um mr mi rm a4 rr tt flags -// TODO-XARCH-AVX512 add the proper W bit switch -INST5(kmovb, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) -INST5(kmovw, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK2(0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) -INST5(kmovd, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) -INST5(kmovq, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) - -INST5(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) - - // id nm um mr mi rm a4 tt flags INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) @@ -639,6 +627,18 @@ INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) + +// id nm um mr mi rm tt flags +// TODO-XARCH-AVX512 add the proper W bit switch +INST3(kmovb, "kmovb", IUM_WR, PACK3(0x66, 0x0F, 0x91), BAD_CODE, PACK3(0x66, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST3(kmovw, "kmovw", IUM_WR, PACK2(0x0F, 0x91), BAD_CODE, 
PACK2(0x0F, 0x90), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST3(kmovd, "kmovd", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST3(kmovq, "kmovq", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_Has_Wbit ) + +INST3(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) + + INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 222bb7d398c7e..05da557d2f57a 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -642,23 +642,15 @@ LinearScan::LinearScan(Compiler* theCompiler) #if defined(TARGET_AMD64) rbmAllFloat = compiler->rbmAllFloat; rbmFltCalleeTrash = compiler->rbmFltCalleeTrash; +#endif if (!compiler->DoJitStressEvexEncoding()) { availableRegCount -= CNT_HIGHFLOAT; - } -#endif // TARGET_AMD64 - -#if defined(TARGET_XARCH) - if (!compiler->DoJitStressEvexEncoding()) - { availableRegCount -= CNT_MASK_REGS; } -#endif - #endif // TARGET_XARCH - regSelector = new (theCompiler, CMK_LSRA) RegisterSelection(this); firstColdLoc = MaxLocation; @@ -711,10 +703,7 @@ LinearScan::LinearScan(Compiler* theCompiler) availableFloatRegs = RBM_ALLFLOAT; availableDoubleRegs = RBM_ALLDOUBLE; #if defined(TARGET_XARCH) - if (compiler->DoJitStressEvexEncoding()) - { - availableMaskRegs = RBM_ALLMASK; - } + availableMaskRegs = RBM_ALLMASK; #endif #if defined(TARGET_AMD64) || defined(TARGET_ARM64) diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 92ed3b73717e9..6c8b961a44c7d 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -492,7 +492,7 @@ class RegRecord : public Referenceable registerType = FloatRegisterType; } #if 
defined(TARGET_XARCH) && defined(FEATURE_SIMD) - else + else { assert(emitter::isMaskReg(reg)); registerType = MaskRegisterType; @@ -1967,7 +1967,7 @@ class LinearScan : public LinearScanInterface } #endif // TARGET_AMD64 - unsigned availableRegCount; + unsigned availableRegCount; unsigned get_AVAILABLE_REG_COUNT() const { diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h index 4c2d41ef9ae66..0499abe7d8149 100644 --- a/src/coreclr/jit/targetx86.h +++ b/src/coreclr/jit/targetx86.h @@ -98,6 +98,8 @@ #define RBM_ALLMASK REG_K1 + #define CNT_HIGHFLOAT 0 + // TODO-CQ: Currently we are following the x86 ABI for SSE2 registers. // This should be reconsidered. #define RBM_FLT_CALLEE_SAVED RBM_NONE From 1c19de31b7741b39071f28ea222e70e9dc3ac004 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Fri, 10 Mar 2023 10:45:04 -0800 Subject: [PATCH 10/12] Split kmov into kmov_msk and kmov_gpr. --- src/coreclr/jit/emitxarch.cpp | 92 +++++++++++++-------- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 8 +- src/coreclr/jit/instrsxarch.h | 14 +++- 3 files changed, 70 insertions(+), 44 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index d0a1a542a1bcb..8957839ea14a4 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -270,10 +270,14 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2. case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2. 
- case INS_kmovb: - case INS_kmovw: - case INS_kmovd: - case INS_kmovq: + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, // movdqu16 etc) @@ -1308,7 +1312,9 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) case INS_shlx: case INS_sarx: case INS_shrx: - case INS_kmovq: // kmovq always takes W1 bit, regardless of form. + case INS_kmovq_msk: + case INS_kmovq_gpr: + case INS_kmovd_msk: return true; default: return false; @@ -3510,7 +3516,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) emitAttr size = EA_SIZE(attr); if ((TakesRexWPrefix(ins, size) && ((ins != INS_xor) || (reg1 != reg2))) || IsExtendedReg(reg1, attr) || - IsExtendedReg(reg2, attr) || (ins == INS_kmovd && isMaskReg(reg1) && isMaskReg(reg2))) + IsExtendedReg(reg2, attr)) { sz += emitGetRexPrefixSize(ins); includeRexPrefixSize = false; @@ -5869,10 +5875,14 @@ bool emitter::IsMovInstruction(instruction ins) case INS_movupd: case INS_movups: case INS_movzx: - case INS_kmovb: - case INS_kmovw: - case INS_kmovd: - case INS_kmovq: + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: { return true; } @@ -5994,10 +6004,14 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) } #endif // TARGET_AMD64 - case INS_kmovb: - case INS_kmovw: - case INS_kmovd: - case INS_kmovq: + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: { hasSideEffect = true; break; @@ -6214,11 +6228,24 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN } #endif // TARGET_AMD64 - case INS_kmovb: - case 
INS_kmovw: - case INS_kmovd: - case INS_kmovq: + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + { + assert((isMaskReg(dstReg) || isMaskReg(srcReg)) && !isGeneralRegister(dstReg) && + !isGeneralRegister(srcReg)); + break; + } + + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: + { + assert(isGeneralRegister(dstReg) || isGeneralRegister(srcReg)); break; + } default: { @@ -13845,26 +13872,15 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert((ins != INS_movd) || (isFloatReg(reg1) != isFloatReg(reg2))); - if (ins == INS_kmovb || ins == INS_kmovw || ins == INS_kmovd || ins == INS_kmovq) + if (ins == INS_kmovb_gpr || ins == INS_kmovw_gpr || ins == INS_kmovd_gpr || ins == INS_kmovq_gpr) { assert(!(isGeneralRegister(reg1) && isGeneralRegister(reg2))); code = insCodeRM(ins); if (isGeneralRegister(reg1)) { - // kmov r, k form, flip last byte of opcode from 0x90 to 0x92 - code |= 0x03; - } - else if (isGeneralRegister(reg2)) - { - // kmov r, k form, flip last byte of opcode from 0x90 to 0x92 - code |= 0x02; - } - - // kmovd RR form requires W bit - if (!(code & 0x02) && ins == INS_kmovd) - { - AddRexWPrefix(id, code); + // kmov r, k form, flip last byte of opcode from 0x92 to 0x93 + code |= 0x01; } } else if ((ins != INS_movd) || isFloatReg(reg1)) @@ -18179,10 +18195,14 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } - case INS_kmovb: - case INS_kmovw: - case INS_kmovd: - case INS_kmovq: + case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: + case INS_kmovb_gpr: + case INS_kmovw_gpr: + case INS_kmovd_gpr: + case INS_kmovq_gpr: { result.insLatency += PERFSCORE_LATENCY_3C; result.insThroughput = PERFSCORE_THROUGHPUT_1C; diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 3b176bc81f594..77a7ab22dc19c 100644 --- 
a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1687,24 +1687,24 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) case TYP_BYTE: case TYP_UBYTE: maskIns = INS_vpmovb2m; - kmovIns = INS_kmovq; + kmovIns = INS_kmovq_gpr; break; case TYP_SHORT: case TYP_USHORT: maskIns = INS_vpmovw2m; - kmovIns = INS_kmovd; + kmovIns = INS_kmovd_gpr; break; case TYP_INT: case TYP_UINT: case TYP_FLOAT: maskIns = INS_vpmovd2m; - kmovIns = INS_kmovw; + kmovIns = INS_kmovw_gpr; break; case TYP_DOUBLE: case TYP_LONG: case TYP_ULONG: maskIns = INS_vpmovq2m; - kmovIns = INS_kmovb; + kmovIns = INS_kmovb_gpr; break; default: unreached(); diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 4c7724854571d..bef8c11026610 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -631,10 +631,16 @@ INST3(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BA // id nm um mr mi rm tt flags // TODO-XARCH-AVX512 add the proper W bit switch -INST3(kmovb, "kmovb", IUM_WR, PACK3(0x66, 0x0F, 0x91), BAD_CODE, PACK3(0x66, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_Has_Wbit ) -INST3(kmovw, "kmovw", IUM_WR, PACK2(0x0F, 0x91), BAD_CODE, PACK2(0x0F, 0x90), INS_TT_NONE, INS_FLAGS_Has_Wbit ) -INST3(kmovd, "kmovd", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_Has_Wbit ) -INST3(kmovq, "kmovq", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST3(kmovb_msk, "kmovb", IUM_WR, PACK3(0x66, 0x0F, 0x91), BAD_CODE, PACK3(0x66, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovw_msk, "kmovw", IUM_WR, PACK2(0x0F, 0x91), BAD_CODE, PACK2(0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovd_msk, "kmovd", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovq_msk, "kmovq", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 
0x90), INS_TT_NONE, INS_FLAGS_None ) + + +INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PACK2(0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) +INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_None ) INST3(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) From 45f4b90b7943f37051c5f3ef66a45ae683d5ad19 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Fri, 10 Mar 2023 10:47:50 -0800 Subject: [PATCH 11/12] Fix thread. --- src/coreclr/vm/threadsuspend.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 1434e94bef4fe..89f2f9d33f7e2 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -3050,6 +3050,7 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT //////////////////////////////////////////////////// // Now redirect the thread to the helper function + SetIP(pCurrentThreadCtx, (PCODE)pTgt); #ifdef TARGET_ARM From 46c9e27b3426bae73a45b640d75bb393c02516e6 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Fri, 10 Mar 2023 10:57:13 -0800 Subject: [PATCH 12/12] Review edits. 
--- src/coreclr/jit/instrsxarch.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index bef8c11026610..4a45d025ce3fd 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -629,8 +629,6 @@ INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, INST3(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) -// id nm um mr mi rm tt flags -// TODO-XARCH-AVX512 add the proper W bit switch INST3(kmovb_msk, "kmovb", IUM_WR, PACK3(0x66, 0x0F, 0x91), BAD_CODE, PACK3(0x66, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) INST3(kmovw_msk, "kmovw", IUM_WR, PACK2(0x0F, 0x91), BAD_CODE, PACK2(0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None ) INST3(kmovd_msk, "kmovd", IUM_WR, PACK3(0xF2, 0x0F, 0x91), BAD_CODE, PACK3(0xF2, 0x0F, 0x90), INS_TT_NONE, INS_FLAGS_None )