From 028cdf5058ab44cbf7b4014272229984a2c6a8ea Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Wed, 24 Aug 2022 16:02:39 -0700 Subject: [PATCH] Adding EVEX encoding pathways for emitOutputRRR(). Adding flag to turn on EVEX encoding. --- src/coreclr/jit/compiler.cpp | 5 + src/coreclr/jit/compiler.h | 33 + src/coreclr/jit/emit.h | 1 + src/coreclr/jit/emitxarch.cpp | 997 +++++++++++++++++++++++++++--- src/coreclr/jit/emitxarch.h | 365 +++++++++++ src/coreclr/jit/instrsxarch.h | 2 + src/coreclr/jit/jitconfigvalues.h | 5 +- 7 files changed, 1324 insertions(+), 84 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index e92d45518e9a6..bc2d9bfa1c3f0 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2284,6 +2284,11 @@ void Compiler::compSetProcessor() #ifdef TARGET_XARCH if (!compIsForInlining()) { + if (canUseEvexEncoding()) + { + codeGen->GetEmitter()->SetUseEvexEncoding(true); + // TODO-XArch-AVX512: Revisit other flags to be set once avx512 instructions are added. + } if (canUseVexEncoding()) { codeGen->GetEmitter()->SetUseVEXEncoding(true); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 51b8764a0da35..d2031af55f7ef 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -8960,6 +8960,39 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #endif } + //------------------------------------------------------------------------ + // canUseEvexEncoding - Answer the question: Is Evex encoding supported on this target. + // + // Returns: + // TRUE if Evex encoding is supported, FALSE if not. + bool canUseEvexEncoding() const + { +#ifdef TARGET_XARCH + return compOpportunisticallyDependsOn(InstructionSet_AVX512F); +#else + return false; +#endif + } + + //------------------------------------------------------------------------ + // DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding. + // + // Returns: + // TRUE if user requests EVEX encoding and it's safe, FALSE if not. + bool DoJitStressEvexEncoding() const + { +#ifdef TARGET_XARCH + // Using JitStressEVEXEncoding flag will force instructions which would + // otherwise use VEX encoding but can be EVEX encoded to use EVEX encoding + // This requires AVX512VL support. + if (JitConfig.JitStressEVEXEncoding() && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + { + return true; + } +#endif + return false; + } + /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index c5d245ba83c66..34379a6a3d05a 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -445,6 +445,7 @@ class emitter #ifdef TARGET_XARCH SetUseVEXEncoding(false); + SetUseEvexEncoding(false); #endif // TARGET_XARCH emitDataSecCur = nullptr; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 1ef3510a75260..84d6df4e2251d 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -34,11 +34,38 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } +//------------------------------------------------------------------------ +// IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse instruction. +// +// Arguments: +// ins - The instruction to check. 
+//
+// Returns:
+// TRUE if it is an SSE, AVX or AVX512 instruction.
+bool emitter::IsAvx512OrPriorInstruction(instruction ins)
+{
+ // TODO-XArch-AVX512: Fix check once AVX512 instructions are added.
+ return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION);
+}
+
 bool emitter::IsAVXOnlyInstruction(instruction ins)
 {
 return (ins >= INS_FIRST_AVX_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION);
 }
 
+//------------------------------------------------------------------------
+// IsAvx512OnlyInstruction: Is this an AVX512 instruction?
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// TRUE if it is an AVX512F+ instruction.
+bool emitter::IsAvx512OnlyInstruction(instruction ins)
+{
+ return (ins >= INS_FIRST_AVX512_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION);
+}
+
 bool emitter::IsFMAInstruction(instruction ins)
 {
 return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION);
@@ -128,6 +155,146 @@ bool emitter::IsAVXInstruction(instruction ins) const
 return UseVEXEncoding() && IsSSEOrAVXInstruction(ins);
 }
 
+//------------------------------------------------------------------------
+// IsAvx512Instruction: Answer the question: Can this instruction be EVEX encoded?
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// TRUE if ins can be EVEX encoded.
+bool emitter::IsAvx512Instruction(instruction ins) const
+{
+ if (!UseEvexEncoding())
+ {
+ return false;
+ }
+
+ // TODO-XArch-AVX512: Explore adding this as a flag to instr table.
+ switch (ins)
+ {
+ // No EVEX encoding exists at all.
+ case INS_pmovmskb:
+ case INS_movmskpd:
+ case INS_movmskps:
+ case INS_dppd:
+ case INS_dpps:
+ case INS_maskmovdqu:
+ case INS_haddps:
+ case INS_haddpd:
+ case INS_hsubps:
+ case INS_hsubpd:
+ case INS_addsubps:
+ case INS_addsubpd:
+ case INS_rcpps:
+ case INS_rcpss:
+ case INS_rsqrtps:
+ case INS_rsqrtss:
+ case INS_psignb:
+ case INS_psignd:
+ case INS_psignw:
+ case INS_roundps:
+ case INS_roundss:
+ case INS_roundpd:
+ case INS_roundsd:
+ case INS_blendps:
+ case INS_blendpd:
+ case INS_blendvps:
+ case INS_pblendw:
+ case INS_pblendvb:
+ case INS_blendvpd:
+ case INS_ptest:
+ case INS_phaddw:
+ case INS_phsubw:
+ case INS_phaddd:
+ case INS_phsubd:
+ case INS_phaddsw:
+ case INS_phsubsw:
+ case INS_lddqu:
+ case INS_phminposuw:
+ case INS_mpsadbw:
+ case INS_pclmulqdq:
+ case INS_aesdec:
+ case INS_aesdeclast:
+ case INS_aesenc:
+ case INS_aesenclast:
+ case INS_aesimc:
+ case INS_aeskeygenassist:
+ case INS_vzeroupper:
+ case INS_vperm2i128:
+ case INS_vperm2f128:
+ case INS_vpblendd:
+ case INS_vblendvps:
+ case INS_vblendvpd:
+ case INS_vpblendvb:
+ case INS_vtestps:
+ case INS_vtestpd:
+ case INS_vmaskmovps:
+ case INS_vmaskmovpd:
+ case INS_vpmaskmovd:
+ case INS_vpmaskmovq:
+ case INS_andn:
+ case INS_blsi:
+ case INS_blsmsk:
+ case INS_blsr:
+ case INS_bextr:
+ case INS_rorx:
+ case INS_pdep:
+ case INS_pext:
+ case INS_bzhi:
+ case INS_mulx:
+#ifdef TARGET_AMD64
+ case INS_shlx:
+ case INS_sarx:
+ case INS_shrx:
+#endif
+ case INS_lfence:
+ case INS_mfence:
+ case INS_movnti:
+ case INS_prefetchnta:
+ case INS_prefetcht0:
+ case INS_prefetcht1:
+ case INS_prefetcht2:
+ case INS_sfence:
+ // Might need new INS_*suffix* instructions for these.
+ case INS_por: // INS_pord, INS_porq.
+ case INS_pxor: // INS_pxord, INS_pxorq.
+ case INS_movdqa: // INS_movdqa32, INS_movdqa64.
+ case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64.
+ case INS_pand: // INS_pandd, INS_pandq.
+ case INS_pandn: // INS_pandnd, INS_pandnq.
+ case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2.
+ case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2.
+ case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2.
+ case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2.
+ case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2.
+ case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2.
+ {
+ return false;
+ }
+ default:
+ {
+ break;
+ }
+ }
+
+ return IsAvx512OrPriorInstruction(ins);
+}
+
+//------------------------------------------------------------------------
+// IsSimdInstruction: Answer the question: Is this a SIMD instruction?
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// TRUE if ins is a SIMD instruction.
+//
+bool emitter::IsSimdInstruction(instruction ins) const
+{
+ return IsAvx512Instruction(ins) || IsAVXInstruction(ins);
+}
+
 // Returns true if the AVX instruction is a binary operator that requires 3 operands.
 // When we emit an instruction with only two operands, we will duplicate the destination
 // as a source.
@@ -136,7 +303,7 @@ bool emitter::IsAVXInstruction(instruction ins) const
 // to indicate whether a 3-operand instruction.
 bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
 {
- return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && IsAVXInstruction(ins);
+ return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && IsSimdInstruction(ins);
 }
 
 // Returns true if the AVX instruction requires 3 operands that duplicate the source
@@ -146,7 +313,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
 // to indicate whether a 3-operand instruction.
 bool emitter::IsDstSrcSrcAVXInstruction(instruction ins)
 {
- return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && IsAVXInstruction(ins);
+ return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && IsSimdInstruction(ins);
 }
 
 //------------------------------------------------------------------------
@@ -579,6 +746,97 @@ bool emitter::Is4ByteSSEInstruction(instruction ins)
 return !UseVEXEncoding() && EncodedBySSE38orSSE3A(ins);
 }
 
+//------------------------------------------------------------------------
+// TakesEvexPrefix: Checks if the instruction should be EVEX encoded.
+// TODO-XArch-AVX512: This check needs to be updated once AVX512 instructions are added.
+// Eventually, this should evolve to return 'TRUE' for the following cases:
+// - JitConfig.JitStressEVEXEncoding flag is set.
+// - Is a new AVX512 instruction.
+// - Uses ZMM vector registers.
+// - Uses upper 128-bit or 256-bit registers for an AVX512VL ins.
+// - Uses operand mask encoding: 64-bit opmask registers k0-k7 for conditional execution and merging of destination
+// operands.
+// - Needs to encode functionality specific to instruction classes (e.g., embedded broadcast, embedded rounding
+// control, etc.)
+//
+// Arguments:
+// instruction -- processor instruction to check
+//
+// Return Value:
+// true if this instruction requires an EVEX prefix.
+//
+bool emitter::TakesEvexPrefix(instruction ins) const
+{
+ if (!emitComp->DoJitStressEvexEncoding())
+ {
+ return false;
+ }
+
+ // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added.
+ return IsAvx512Instruction(ins) && !HasKMaskRegisterDest(ins);
+}
+
+// Add base EVEX prefix without setting W, R, X, or B bits.
+// L'L bits will be set based on emitter attr.
+//
+// 4-byte EVEX prefix = 62
+// - R, X, B, W - bits to express corresponding REX prefixes. Additionally, X combines with B to expand r/m to 32
+// SIMD registers
+// - R' - combines with R to expand reg to 32 SIMD registers
+// - mm - lower 2 bits of m-mmmmm (5-bit) in corresponding VEX prefix
+// - vvvv (4-bits) - register specifier in 1's complement form; must be 1111 if unused
+// - pp (2-bits) - opcode extension providing equivalent functionality of a SIMD size prefix;
+// these prefixes are treated as mandatory when used with escape opcode 0Fh for
+// some SIMD instructions
+// 00 - None (0F - packed float)
+// 01 - 66 (66 0F - packed double)
+// 10 - F3 (F3 0F - scalar float)
+// 11 - F2 (F2 0F - scalar double)
+// - z - bit to specify merging mode
+// - L - scalar or AVX-128 bit operations (L=0), 256-bit operations (L=1)
+// - L'- bit to support 512-bit operations or rounding control mode
+// - b - broadcast/rc/sae context
+// - V'- bit to extend vvvv
+// - aaa - specifies mask register
+// Rest - reserved for future use; using them will result in an Undefined instruction exception.
+#define DEFAULT_BYTE_EVEX_PREFIX 0x62F07C0800000000ULL
+
+#define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL
+#define LBIT_IN_BYTE_EVEX_PREFIX 0x0000002000000000ULL
+
+//------------------------------------------------------------------------
+// AddEvexPrefix: Add default EVEX prefix with only the L'L bits set.
+//
+// Arguments:
+// ins -- processor instruction to check.
+// code -- opcode bits.
+// attr -- operand size
+//
+// Return Value:
+// encoded code with EVEX prefix.
+//
+emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr attr)
+{
+ // Only AVX512 instructions require an EVEX prefix.
+ assert(IsAvx512Instruction(ins));
+
+ // Shouldn't have already added the EVEX prefix.
+ assert(!hasEvexPrefix(code));
+
+ assert((code & DEFAULT_BYTE_EVEX_PREFIX_MASK) == 0);
+
+ code |= DEFAULT_BYTE_EVEX_PREFIX;
+
+ // TODO-XArch-AVX512: Add EA_64BYTE once ZMM is supported.
+ if (attr == EA_32BYTE)
+ {
+ // Set L bit to 1 in case of instructions that operate on 256-bits.
+ code |= LBIT_IN_BYTE_EVEX_PREFIX;
+ }
+ return code;
+}
+
 // Returns true if this instruction requires a VEX prefix
 // All AVX instructions require a VEX prefix
 bool emitter::TakesVexPrefix(instruction ins) const
@@ -856,6 +1114,17 @@ unsigned RegEncoding(regNumber reg)
 // AVX: specific bits within VEX prefix need to be set in bit-inverted form.
 emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 {
+ if (UseEvexEncoding() && IsAvx512Instruction(ins))
+ {
+ if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
+ {
+ // W-bit is available in the 4-byte EVEX prefix that starts with byte 62.
+ assert(hasEvexPrefix(code));
+
+ // W-bit is the only bit that is added in non bit-inverted form.
+ return emitter::code_t(code | 0x0000800000000000ULL);
+ }
+ }
 if (UseVEXEncoding() && IsAVXInstruction(ins))
 {
 if (TakesVexPrefix(ins))
@@ -876,82 +1145,460 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 }
 
 #ifdef TARGET_AMD64
-
-emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
-{
- if (UseVEXEncoding() && IsAVXInstruction(ins))
+
+emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
+{
+ if (UseEvexEncoding() && IsAvx512Instruction(ins))
+ {
+ if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
+ { + // R-bit is available in 4-byte EVEX prefix that starts with byte 62. + assert(hasEvexPrefix(code)); + + // R-bit is added in bit-inverted form. + return code & 0xFF7FFFFFFFFFFFFFULL; + } + } + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // R-bit is supported by both 2-byte and 3-byte VEX prefix + assert(hasVexPrefix(code)); + + // R-bit is added in bit-inverted form. + return code & 0xFF7FFFFFFFFFFFULL; + } + } + + return code | 0x4400000000ULL; +} + +emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) +{ + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // X-bit is available only in 3-byte VEX prefix that starts with byte C4. + assert(hasVexPrefix(code)); + + // X-bit is added in bit-inverted form. + return code & 0xFFBFFFFFFFFFFFULL; + } + } + + return code | 0x4200000000ULL; +} + +emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) +{ + if (UseEvexEncoding() && IsAvx512Instruction(ins)) + { + if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + { + // B-bit is available in 4-byte EVEX prefix that starts with byte 62. + assert(hasEvexPrefix(code)); + + // B-bit is added in bit-inverted form. + return code & 0xFFDFFFFFFFFFFFFFULL; + } + } + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // B-bit is available only in 3-byte VEX prefix that starts with byte C4. + assert(hasVexPrefix(code)); + + // B-bit is added in bit-inverted form. + return code & 0xFFDFFFFFFFFFFFULL; + } + } + + return code | 0x4100000000ULL; +} + +// Adds REX prefix (0x40) without W, R, X or B bits set +emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) +{ + assert(!UseVEXEncoding() || !IsAVXInstruction(ins)); + assert(!UseEvexEncoding() || !IsAvx512Instruction(ins)); + return code | 0x4000000000ULL; +} + +#endif // TARGET_AMD64 + +bool isPrefix(BYTE b) +{ + assert(b != 0); // Caller should check this + assert(b != 0x67); // We don't use the address size prefix + assert(b != 0x65); // The GS segment override prefix is emitted separately + assert(b != 0x64); // The FS segment override prefix is emitted separately + assert(b != 0xF0); // The lock prefix is emitted separately + assert(b != 0x2E); // We don't use the CS segment override prefix + assert(b != 0x3E); // Or the DS segment override prefix + assert(b != 0x26); // Or the ES segment override prefix + assert(b != 0x36); // Or the SS segment override prefix + + // That just leaves the size prefixes used in SSE opcodes: + // Scalar Double Scalar Single Packed Double + return ((b == 0xF2) || (b == 0xF3) || (b == 0x66)); +} + +//------------------------------------------------------------------------ +// emitOutputSimdPrefixIfNeeded: Outputs EVEX prefix (in case of AVX512 instructions), +// VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise. +// +// Arguments: +// ins -- processor instruction to check. +// dst -- buffer to write prefix to. +// code -- opcode bits. +// attr -- operand size +// +// Return Value: +// Size of prefix. +// +unsigned emitter::emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code) +{ + // TODO-XArch-AVX512: Remove redundant code and collapse into single pathway for EVEX and VEX if possible. 
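+ // Worked example (illustrative only; values follow from AddEvexPrefix above): EVEX-encoding
+ // "paddd" (66 0F FE /r) arrives here with the default prefix bytes 62 F0 7C 08 in the upper
+ // half of 'code' and 66 0F FE in the lower half. The EVEX branch below folds the 0x66 size
+ // prefix into pp=01 and the 0x0F escape into mm=01; with no register bits set, the four
+ // prefix bytes emitted are 62 F1 7D 08, followed by the remaining FE opcode byte and ModRM.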
+ if (hasEvexPrefix(code)) + { + // Only AVX512 instructions should have an EVEX prefix + assert(IsAvx512Instruction(ins)); + + code_t evexPrefix = (code >> 32) & 0xFFFFFFFF; + code &= 0x00000000FFFFFFFFLL; + + WORD leadingBytes = 0; + BYTE check = (code >> 24) & 0xFF; + if (check != 0) + { + // check for a prefix in the 11 position + BYTE sizePrefix = (code >> 16) & 0xFF; + if ((sizePrefix != 0) && isPrefix(sizePrefix)) + { + // 'pp' bits in byte 1 of EVEX prefix allows us to encode SIMD size prefixes as two bits + // + // 00 - None (0F - packed float) + // 01 - 66 (66 0F - packed double) + // 10 - F3 (F3 0F - scalar float + // 11 - F2 (F2 0F - scalar double) + switch (sizePrefix) + { + case 0x66: + // None of the existing BMI instructions should be EVEX encoded. + assert(!IsBMIInstruction(ins)); + evexPrefix |= (0x01 << 8); + break; + case 0xF3: + evexPrefix |= (0x02 << 8); + break; + case 0xF2: + evexPrefix |= (0x03 << 8); + break; + default: + assert(!"unrecognized SIMD size prefix"); + unreached(); + } + + // Now the byte in the 22 position must be an escape byte 0F + leadingBytes = check; + assert(leadingBytes == 0x0F); + + // Get rid of both sizePrefix and escape byte + code &= 0x0000FFFFLL; + + // Check the byte in the 33 position to see if it is 3A or 38. + // In such a case escape bytes must be 0x0F3A or 0x0F38 + check = code & 0xFF; + if (check == 0x3A || check == 0x38) + { + leadingBytes = (leadingBytes << 8) | check; + code &= 0x0000FF00LL; + } + } + } + else + { + // 2-byte opcode with the bytes ordered as 0x0011RM22 + // the byte in position 11 must be an escape byte. + leadingBytes = (code >> 16) & 0xFF; + assert(leadingBytes == 0x0F || leadingBytes == 0x00); + code &= 0xFFFF; + } + + // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38 + // mm bits in byte 0 of EVEX prefix allows us to encode these + // implied leading bytes. 
They are identical to low two bits of VEX.mmmmm + + switch (leadingBytes) + { + case 0x00: + // there is no leading byte + break; + case 0x0F: + evexPrefix |= (0x01 << 16); + break; + case 0x0F38: + evexPrefix |= (0x02 << 16); + break; + case 0x0F3A: + evexPrefix |= (0x03 << 16); + break; + default: + assert(!"encountered unknown leading bytes"); + unreached(); + } + + // At this point + // EVEX.2211RM33 got transformed as EVEX.0000RM33 + // EVEX.0011RM22 got transformed as EVEX.0000RM22 + // + // Now output EVEX prefix leaving the 4-byte opcode + // EVEX prefix is always 4 bytes + + emitOutputByte(dst, ((evexPrefix >> 24) & 0xFF)); + emitOutputByte(dst + 1, ((evexPrefix >> 16) & 0xFF)); + emitOutputByte(dst + 2, (evexPrefix >> 8) & 0xFF); + emitOutputByte(dst + 3, evexPrefix & 0xFF); + return 4; + } + else if (hasVexPrefix(code)) + { + // Only AVX instructions should have a VEX prefix + assert(UseVEXEncoding() && IsAVXInstruction(ins)); + code_t vexPrefix = (code >> 32) & 0x00FFFFFF; + code &= 0x00000000FFFFFFFFLL; + + WORD leadingBytes = 0; + BYTE check = (code >> 24) & 0xFF; + if (check != 0) + { + // 3-byte opcode: with the bytes ordered as 0x2211RM33 or + // 4-byte opcode: with the bytes ordered as 0x22114433 + // check for a prefix in the 11 position + BYTE sizePrefix = (code >> 16) & 0xFF; + if ((sizePrefix != 0) && isPrefix(sizePrefix)) + { + // 'pp' bits in byte2 of VEX prefix allows us to encode SIMD size prefixes as two bits + // + // 00 - None (0F - packed float) + // 01 - 66 (66 0F - packed double) + // 10 - F3 (F3 0F - scalar float + // 11 - F2 (F2 0F - scalar double) + switch (sizePrefix) + { + case 0x66: + if (IsBMIInstruction(ins)) + { + switch (ins) + { + case INS_rorx: + case INS_pdep: + case INS_mulx: +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_shrx: +#endif + { + vexPrefix |= 0x03; + break; + } + + case INS_pext: +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_sarx: +#endif + { + vexPrefix |= 0x02; + break; + } +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_shlx: + { + vexPrefix |= 0x01; + break; + } +#endif + default: + { + vexPrefix |= 0x00; + break; + } + } + } + else + { + vexPrefix |= 0x01; + } + break; + case 0xF3: + vexPrefix |= 0x02; + break; + case 0xF2: + vexPrefix |= 0x03; + break; + default: + assert(!"unrecognized SIMD size prefix"); + unreached(); + } + + // Now the byte in the 22 position must be an escape byte 0F + leadingBytes = check; + assert(leadingBytes == 0x0F); + + // Get rid of both sizePrefix and escape byte + code &= 0x0000FFFFLL; + + // Check the byte in the 33 position to see if it is 3A or 38. + // In such a case escape bytes must be 0x0F3A or 0x0F38 + check = code & 0xFF; + if (check == 0x3A || check == 0x38) + { + leadingBytes = (leadingBytes << 8) | check; + code &= 0x0000FF00LL; + } + } + } + else + { + // 2-byte opcode with the bytes ordered as 0x0011RM22 + // the byte in position 11 must be an escape byte. + leadingBytes = (code >> 16) & 0xFF; + assert(leadingBytes == 0x0F || leadingBytes == 0x00); + code &= 0xFFFF; + } + + // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38 + // m-mmmmm bits in byte 1 of VEX prefix allows us to encode these + // implied leading bytes. 0x0F is supported by both the 2-byte and + // 3-byte encoding. While 0x0F3A and 0x0F38 are only supported by + // the 3-byte version. 
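+ // For example (illustrative): "vandps" (0F 54 /r) has leadingBytes == 0x0F and so uses
+ // m-mmmm == 00001 (the 2-byte VEX form remains possible), while "vpshufb" (66 0F 38 00 /r)
+ // has leadingBytes == 0x0F38 and must use m-mmmm == 00010 with the 3-byte VEX form.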
+ + switch (leadingBytes) + { + case 0x00: + // there is no leading byte + break; + case 0x0F: + vexPrefix |= 0x0100; + break; + case 0x0F38: + vexPrefix |= 0x0200; + break; + case 0x0F3A: + vexPrefix |= 0x0300; + break; + default: + assert(!"encountered unknown leading bytes"); + unreached(); + } + + // At this point + // VEX.2211RM33 got transformed as VEX.0000RM33 + // VEX.0011RM22 got transformed as VEX.0000RM22 + // + // Now output VEX prefix leaving the 4-byte opcode + + // The 2-byte VEX encoding, requires that the X and B-bits are set (these + // bits are inverted from the REX values so set means off), the W-bit is + // not set (this bit is not inverted), and that the m-mmmm bits are 0-0001 + // (the 2-byte VEX encoding only supports the 0x0F leading byte). When these + // conditions are met, we can change byte-0 from 0xC4 to 0xC5 and then + // byte-1 is the logical-or of bit 7 from byte-1 and bits 0-6 from byte 2 + // from the 3-byte VEX encoding. + // + // Given the above, the check can be reduced to a simple mask and comparison. + // * 0xFFFF7F80 is a mask that ignores any bits whose value we don't care about: + // * R can be set or unset (0x7F ignores bit 7) + // * vvvv can be any value (0x80 ignores bits 3-6) + // * L can be set or unset (0x80 ignores bit 2) + // * pp can be any value (0x80 ignores bits 0-1) + // * 0x00C46100 is a value that signifies the requirements listed above were met: + // * We must be a three-byte VEX opcode (0x00C4) + // * X and B must be set (0x61 validates bits 5-6) + // * m-mmmm must be 0-00001 (0x61 validates bits 0-4) + // * W must be unset (0x00 validates bit 7) + if ((vexPrefix & 0xFFFF7F80) == 0x00C46100) + { + // Encoding optimization calculation is not done while estimating the instruction + // size and thus over-predict instruction size by 1 byte. + // If there are IGs that will be aligned, do not optimize encoding so the + // estimated alignment sizes are accurate. + if (emitCurIG->igNum > emitLastAlignedIgNum) + { + emitOutputByte(dst, 0xC5); + emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F)); + return 2; + } + } + + emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF)); + emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0xFF)); + emitOutputByte(dst + 2, vexPrefix & 0xFF); + return 3; + } + +#ifdef TARGET_AMD64 + if (code > 0x00FFFFFFFFLL) { - if (TakesVexPrefix(ins)) - { - // R-bit is supported by both 2-byte and 3-byte VEX prefix - assert(hasVexPrefix(code)); - - // R-bit is added in bit-inverted form. - return code & 0xFF7FFFFFFFFFFFULL; - } - } + BYTE prefix = (code >> 32) & 0xFF; + noway_assert(prefix >= 0x40 && prefix <= 0x4F); + code &= 0x00000000FFFFFFFFLL; - return code | 0x4400000000ULL; -} + // TODO-AMD64-Cleanup: when we remove the prefixes (just the SSE opcodes right now) + // we can remove this code as well -emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) -{ - if (UseVEXEncoding() && IsAVXInstruction(ins)) - { - if (TakesVexPrefix(ins)) + // The REX prefix is required to come after all other prefixes. + // Some of our 'opcodes' actually include some prefixes, if that + // is the case, shift them over and place the REX prefix after + // the other prefixes, and emit any prefix that got moved out. + BYTE check = (code >> 24) & 0xFF; + if (check == 0) { - // X-bit is available only in 3-byte VEX prefix that starts with byte C4. - assert(hasVexPrefix(code)); - - // X-bit is added in bit-inverted form. 
- return code & 0xFFBFFFFFFFFFFFULL; + // 3-byte opcode: with the bytes ordered as 0x00113322 + // check for a prefix in the 11 position + check = (code >> 16) & 0xFF; + if (check != 0 && isPrefix(check)) + { + // Swap the rex prefix and whatever this prefix is + code = (((DWORD)prefix << 16) | (code & 0x0000FFFFLL)); + // and then emit the other prefix + return emitOutputByte(dst, check); + } } - } - - return code | 0x4200000000ULL; -} - -emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) -{ - if (UseVEXEncoding() && IsAVXInstruction(ins)) - { - if (TakesVexPrefix(ins)) + else { - // B-bit is available only in 3-byte VEX prefix that starts with byte C4. - assert(hasVexPrefix(code)); - - // B-bit is added in bit-inverted form. - return code & 0xFFDFFFFFFFFFFFULL; + // 4-byte opcode with the bytes ordered as 0x22114433 + // first check for a prefix in the 11 position + BYTE check2 = (code >> 16) & 0xFF; + if (isPrefix(check2)) + { + assert(!isPrefix(check)); // We currently don't use this, so it is untested + if (isPrefix(check)) + { + // 3 prefixes were rex = rr, check = c1, check2 = c2 encoded as 0xrrc1c2XXXX + // Change to c2rrc1XXXX, and emit check2 now + code = (((code_t)prefix << 24) | ((code_t)check << 16) | (code & 0x0000FFFFLL)); + } + else + { + // 2 prefixes were rex = rr, check2 = c2 encoded as 0xrrXXc2XXXX, (check is part of the opcode) + // Change to c2XXrrXXXX, and emit check2 now + code = (((code_t)check << 24) | ((code_t)prefix << 16) | (code & 0x0000FFFFLL)); + } + return emitOutputByte(dst, check2); + } } - } - - return code | 0x4100000000ULL; -} - -// Adds REX prefix (0x40) without W, R, X or B bits set -emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) -{ - assert(!UseVEXEncoding() || !IsAVXInstruction(ins)); - return code | 0x4000000000ULL; -} + return emitOutputByte(dst, prefix); + } #endif // TARGET_AMD64 -bool isPrefix(BYTE b) -{ - assert(b != 0); // Caller should check this - assert(b != 0x67); // We don't use the address size prefix - assert(b != 0x65); // The GS segment override prefix is emitted separately - assert(b != 0x64); // The FS segment override prefix is emitted separately - assert(b != 0xF0); // The lock prefix is emitted separately - assert(b != 0x2E); // We don't use the CS segment override prefix - assert(b != 0x3E); // Or the DS segment override prefix - assert(b != 0x26); // Or the ES segment override prefix - assert(b != 0x36); // Or the SS segment override prefix - - // That just leaves the size prefixes used in SSE opcodes: - // Scalar Double Scalar Single Packed Double - return ((b == 0xF2) || (b == 0xF3) || (b == 0x66)); + return 0; } // Outputs VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise. @@ -1236,6 +1883,27 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins) return 1; } +//------------------------------------------------------------------------ +// emitGetEvexPrefixSize: Gets Size of Evex prefix in bytes +// TODO-XArch-AVX512: Once evex encoding is supported fully, this will take the place of emitGetAdjustedSize() +// +// Arguments: +// ins -- The instruction +// +// Returns: +// Prefix size in bytes. +// +unsigned emitter::emitGetEvexPrefixSize(instruction ins) +{ + if (IsAvx512Instruction(ins)) + { + return 4; + } + + // If not AVX512, then we don't need to encode prefix. 
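+ // VEX-encodable instructions get their prefix size from emitGetVexPrefixSize() instead.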
+ return 0;
+}
+
 // Size of vex prefix in bytes
 unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr)
 {
@@ -1248,6 +1916,143 @@
+//------------------------------------------------------------------------
+// emitGetAdjustedSizeEvexAware: Determines any size adjustment needed for a given instruction based on the current
+// configuration.
+// TODO-XArch-AVX512: Once EVEX encoding is supported fully, this will take the place of emitGetAdjustedSize().
+//
+// Arguments:
+// ins -- The instruction being emitted
+// attr -- The emit attribute
+// code -- The current opcode and any known prefixes
+//
+// Returns:
+// Updated size.
+//
+unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, code_t code)
+{
+ unsigned adjustedSize = 0;
+
+ // TODO-XArch-AVX512: Remove redundant code and possibly collapse EVEX and VEX into a single pathway.
+ // IsAvx512Instruction(ins) is TRUE for AVX/SSE instructions as well, which need to be VEX encoded unless EVEX
+ // is explicitly requested.
+ if (IsAvx512Instruction(ins) && TakesEvexPrefix(ins))
+ {
+ // The EVEX prefix encodes some bytes of the opcode, so the overall size of the instruction shrinks.
+ // Therefore, estimating the size by adding the EVEX prefix size to the size of the instruction opcode
+ // bytes will always overestimate.
+ // Instead, this routine adjusts the size of the EVEX prefix based on the number of opcode bytes it
+ // encodes, so that the instruction size estimate is accurate.
+ // Basically this will decrease the evexPrefixSize, so that opcodeSize + evexPrefixAdjustedSize will be the
+ // right size.
+ //
+ // rightOpcodeSize + evexPrefixSize
+ // = (opcodeSize - ExtrabytesSize) + evexPrefixSize
+ // = opcodeSize + (evexPrefixSize - ExtrabytesSize)
+ // = opcodeSize + evexPrefixAdjustedSize
+
+ unsigned evexPrefixAdjustedSize = emitGetEvexPrefixSize(ins);
+ assert(evexPrefixAdjustedSize == 4);
+
+ // In this case, the opcode will contain at least one escape byte,
+ // so evexPrefixAdjustedSize should be reduced by one.
+ evexPrefixAdjustedSize -= 1;
+
+ // Get the fourth byte in the opcode.
+ // If this byte is non-zero, then we should check whether the opcode contains a SIMD prefix or not.
+ BYTE check = (code >> 24) & 0xFF;
+ if (check != 0)
+ {
+ // 3-byte opcode: with the bytes ordered as 0x2211RM33 or
+ // 4-byte opcode: with the bytes ordered as 0x22114433
+ // SIMD prefix is at the first byte.
+ BYTE sizePrefix = (code >> 16) & 0xFF;
+ if (sizePrefix != 0 && isPrefix(sizePrefix))
+ {
+ evexPrefixAdjustedSize -= 1;
+ }
+
+ // If the opcode size is 4 bytes, then the second escape prefix is at the fourth byte in the opcode.
+ // But in this case the opcode has not counted the R\M part.
+ // opcodeSize + evexPrefixAdjustedSize - ExtraEscapePrefixSize + ModR\MSize
+ // = opcodeSize + evexPrefixAdjustedSize - 1 + 1
+ // = opcodeSize + evexPrefixAdjustedSize
+ // So although we may have a second escape byte prefix, we won't decrease evexPrefixAdjustedSize.
+ }
+
+ adjustedSize = evexPrefixAdjustedSize;
+ }
+ else if (IsAVXInstruction(ins))
+ {
+ // The VEX prefix encodes some bytes of the opcode, so the overall size of the instruction shrinks.
+ // Therefore, estimating the size by adding the VEX prefix size to the size of the instruction opcode
+ // bytes will always overestimate.
+ // Instead, this routine adjusts the size of the VEX prefix based on the number of opcode bytes it
+ // encodes, so that the instruction size estimate is accurate.
+ // Basically this will decrease the vexPrefixSize, so that opcodeSize + vexPrefixAdjustedSize will be the
+ // right size.
+ //
+ // rightOpcodeSize + vexPrefixSize
+ // = (opcodeSize - ExtrabytesSize) + vexPrefixSize
+ // = opcodeSize + (vexPrefixSize - ExtrabytesSize)
+ // = opcodeSize + vexPrefixAdjustedSize
+
+ unsigned simdPrefixAdjustedSize = emitGetVexPrefixSize(ins, attr);
+ assert(simdPrefixAdjustedSize == 3);
+
+ // In this case, the opcode will contain at least one escape byte,
+ // so simdPrefixAdjustedSize should be reduced by one.
+ simdPrefixAdjustedSize -= 1;
+
+ // Get the fourth byte in the opcode.
+ // If this byte is non-zero, then we should check whether the opcode contains a SIMD prefix or not.
+ BYTE check = (code >> 24) & 0xFF;
+ if (check != 0)
+ {
+ // 3-byte opcode: with the bytes ordered as 0x2211RM33 or
+ // 4-byte opcode: with the bytes ordered as 0x22114433
+ // SIMD prefix is at the first byte.
+ BYTE sizePrefix = (code >> 16) & 0xFF;
+ if (sizePrefix != 0 && isPrefix(sizePrefix))
+ {
+ simdPrefixAdjustedSize -= 1;
+ }
+
+ // If the opcode size is 4 bytes, then the second escape prefix is at the fourth byte in the opcode.
+ // But in this case the opcode has not counted the R\M part.
+ // opcodeSize + simdPrefixAdjustedSize - ExtraEscapePrefixSize + ModR\MSize
+ // = opcodeSize + simdPrefixAdjustedSize - 1 + 1
+ // = opcodeSize + simdPrefixAdjustedSize
+ // So although we may have a second escape byte prefix, we won't decrease simdPrefixAdjustedSize.
+ }
+
+ adjustedSize = simdPrefixAdjustedSize;
+ }
+ else if (Is4ByteSSEInstruction(ins))
+ {
+ // The 4-byte SSE instructions require one additional byte to hold the ModRM byte.
+ adjustedSize++;
+ }
+ else
+ {
+ if (ins == INS_crc32)
+ {
+ // Adjust code size for CRC32 that has a 4-byte opcode but does not use SSE38 or SSE3A encoding.
+ adjustedSize++;
+ }
+
+ if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx))
+ {
+ // Most 16-bit operand instructions will need a 0x66 prefix.
+ adjustedSize++;
+ }
+ }
+
+ return adjustedSize;
+}
+
 //------------------------------------------------------------------------
 // emitGetAdjustedSize: Determines any size adjustment needed for a given instruction based on the current
 // configuration.
@@ -1340,6 +2145,11 @@ unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t cod
 //
 unsigned emitter::emitGetPrefixSize(code_t code, bool includeRexPrefixSize)
 {
+ if (hasEvexPrefix(code))
+ {
+ return 4;
+ }
+
 if (hasVexPrefix(code))
 {
 return 3;
@@ -1839,14 +2649,14 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt
 
 /***********************************************************************************
  *
- * Returns modified AVX opcode with the specified register encoded in bits 3-6 of
- * byte 2 of VEX prefix.
+ * Returns modified SIMD opcode with the specified register encoded in bits 3-6 of
+ * byte 2 of the VEX or EVEX prefix.
*/ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code) { assert(reg < REG_STK); - assert(IsAVXInstruction(ins)); - assert(hasVexPrefix(code)); + assert(IsSimdInstruction(ins)); + assert(hasVexOrEvexPrefix(code)); // Get 4-bit register encoding // RegEncoding() gives lower 3 bits @@ -1857,10 +2667,30 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, regBits |= 0x08; } - // VEX prefix encodes register operand in 1's complement form - // Shift count = 4-bytes of opcode + 0-2 bits + // Both prefix encodes register operand in 1's complement form assert(regBits <= 0xF); - regBits <<= 35; + if (UseEvexEncoding() && IsAvx512Instruction(ins)) + { + if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) + { + assert(hasEvexPrefix(code) && TakesEvexPrefix(ins)); + + // Shift count = 5-bytes of opcode + 0-2 bits for EVEX + regBits <<= 43; + return code ^ regBits; + } + } + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + assert(hasVexPrefix(code)); + + // Shift count = 4-bytes of opcode + 0-2 bits for VEX + regBits <<= 35; + return code ^ regBits; + } + } return code ^ regBits; } @@ -2168,7 +2998,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code) instruction ins = id->idIns(); emitAttr attr = id->idOpSize(); - UNATIVE_OFFSET sz = emitGetAdjustedSize(ins, attr, code); + UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(ins, attr, code); bool includeRexPrefixSize = true; // REX prefix @@ -12847,7 +13677,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code_t code; instruction ins = id->idIns(); - assert(IsAVXInstruction(ins)); + assert(IsSimdInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins)); regNumber targetReg = id->idReg1(); regNumber src1 = id->idReg2(); @@ -12855,10 +13685,13 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) emitAttr size = id->idOpSize(); code = insCodeRM(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); + code = insEncodeRMreg(ins, code); - if (TakesRexWPrefix(ins, size)) + // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. + // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) { code = AddRexWPrefix(ins, code); } @@ -12868,8 +13701,8 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) // encode source operand reg in 'vvvv' bits in 1's complement form code = insEncodeReg3456(ins, src1, size, code); - // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + // Output the REX/VEX/EVEX prefix + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); // Is this a 'big' opcode? 
if (code & 0xFF000000) @@ -12892,7 +13725,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) else if ((code & 0xFF) == 0x00) { // This case happens for AVX instructions only - assert(IsAVXInstruction(ins)); + assert(IsSimdInstruction(ins)); dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regCode)); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 7f8b1bd85bbb0..644fed77da824 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -65,11 +65,14 @@ BYTE* emitOutputRRR(BYTE* dst, instrDesc* id); BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id); +unsigned emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code); unsigned emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code); unsigned emitGetRexPrefixSize(instruction ins); unsigned emitGetVexPrefixSize(instruction ins, emitAttr attr); +unsigned emitGetEvexPrefixSize(instruction ins); unsigned emitGetPrefixSize(code_t code, bool includeRexPrefixSize); unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code); +unsigned emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, code_t code); unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code); unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code); @@ -86,7 +89,9 @@ unsigned insSSval(unsigned scale); static bool IsSSEInstruction(instruction ins); static bool IsSSEOrAVXInstruction(instruction ins); +static bool IsAvx512OrPriorInstruction(instruction ins); static bool IsAVXOnlyInstruction(instruction ins); +static bool IsAvx512OnlyInstruction(instruction ins); static bool IsFMAInstruction(instruction ins); static bool IsAVXVNNIInstruction(instruction ins); static bool IsBMIInstruction(instruction ins); @@ -94,6 +99,9 @@ static bool IsBMIInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); bool IsAVXInstruction(instruction ins) const; +bool IsAvx512Instruction(instruction ins) const; +bool IsSimdInstruction(instruction ins) const; + code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); code_t AddRexWPrefix(instruction ins, code_t code); @@ -160,6 +168,278 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr return code; } +//------------------------------------------------------------------------ +// IsWEvexOpcodeExtension: Some instructions use W bit as an opcode extension bit. +// Identify instructions which requires W bit to be set to 1 +// for Evex encoding. +// TODO-XArch-AVX512: Explore adding this as a flag to instr table. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// TRUE if W bit needs to be set to 1. 
+// +bool IsWEvexOpcodeExtension(instruction ins) +{ + if (!TakesEvexPrefix(ins)) + { + return false; + } + + switch (ins) + { + case INS_movq: + case INS_addpd: + case INS_addsd: + case INS_movsd: + case INS_movsdsse2: + case INS_mulsd: + case INS_mulpd: + case INS_movntpd: + case INS_movlpd: + case INS_movhpd: + case INS_movapd: + case INS_movupd: + case INS_shufpd: + case INS_subsd: + case INS_subpd: + case INS_minsd: + case INS_minpd: + case INS_divsd: + case INS_divpd: + case INS_maxsd: + case INS_maxpd: + case INS_xorpd: + case INS_andpd: + case INS_sqrtsd: + case INS_sqrtpd: + case INS_andnpd: + case INS_orpd: + case INS_cvtpd2ps: + case INS_cvtsd2ss: + case INS_cvtpd2dq: + case INS_cvttpd2dq: + case INS_comisd: + case INS_ucomisd: + case INS_paddq: + case INS_psubq: + case INS_pmuludq: + case INS_psllq: + case INS_psrlq: + case INS_punpckhqdq: + case INS_punpcklqdq: + case INS_unpckhpd: + case INS_pmuldq: + case INS_movddup: + case INS_pinsrq: + case INS_pextrq: + case INS_vpbroadcastq: + case INS_vpermq: + case INS_vpsrlvq: + case INS_vpsllvq: + case INS_vpermilpd: + case INS_vpermpd: + case INS_vpgatherdq: + case INS_vpgatherqq: + case INS_vgatherdpd: + case INS_vgatherqpd: + case INS_vfmadd132pd: + case INS_vfmadd213pd: + case INS_vfmadd231pd: + case INS_vfmadd132sd: + case INS_vfmadd213sd: + case INS_vfmadd231sd: + case INS_vfmaddsub132pd: + case INS_vfmaddsub213pd: + case INS_vfmaddsub231pd: + case INS_vfmsubadd132pd: + case INS_vfmsubadd213pd: + case INS_vfmsubadd231pd: + case INS_vfmsub132pd: + case INS_vfmsub213pd: + case INS_vfmsub231pd: + case INS_vfmsub132sd: + case INS_vfmsub213sd: + case INS_vfmsub231sd: + case INS_vfnmadd132pd: + case INS_vfnmadd213pd: + case INS_vfnmadd231pd: + case INS_vfnmadd132sd: + case INS_vfnmadd213sd: + case INS_vfnmadd231sd: + case INS_vfnmsub132pd: + case INS_vfnmsub213pd: + case INS_vfnmsub231pd: + case INS_vfnmsub132sd: + case INS_vfnmsub213sd: + case INS_vfnmsub231sd: + case INS_unpcklpd: + case INS_vpermilpdvar: + { + return true; // W1 + } + case INS_movd: + case INS_punpckldq: + case INS_movntdq: + case INS_movntps: + case INS_movlps: + case INS_movhps: + case INS_movss: + case INS_movaps: + case INS_movups: + case INS_movhlps: + case INS_movlhps: + case INS_unpckhps: + case INS_unpcklps: + case INS_shufps: + case INS_punpckhdq: + case INS_addps: + case INS_addss: + case INS_mulss: + case INS_mulps: + case INS_subss: + case INS_subps: + case INS_minss: + case INS_minps: + case INS_divss: + case INS_divps: + case INS_maxss: + case INS_maxps: + case INS_xorps: + case INS_andps: + case INS_sqrtss: + case INS_sqrtps: + case INS_andnps: + case INS_orps: + case INS_cvtss2sd: + case INS_cvtdq2ps: + case INS_cvtps2dq: + case INS_cvttps2dq: + case INS_cvtdq2pd: + case INS_comiss: + case INS_ucomiss: + case INS_paddd: + case INS_psubd: + case INS_pslld: + case INS_psrld: + case INS_psrad: + case INS_pshufd: + case INS_packssdw: + case INS_insertps: + case INS_pmulld: + case INS_pabsd: + case INS_pminsd: + case INS_pminud: + case INS_pmaxud: + case INS_pmovsxdq: + case INS_pmovzxdq: + case INS_packusdw: + case INS_movntdqa: + case INS_movsldup: + case INS_movshdup: + case INS_pinsrd: + case INS_pextrd: + case INS_vbroadcastss: + case INS_vbroadcastsd: + case INS_vpbroadcastb: + case INS_vpbroadcastw: + case INS_vpbroadcastd: + case INS_vpsravd: + case INS_vpsllvd: + case INS_vpermilps: + case INS_vpermd: + case INS_vpermps: + case INS_vpgatherdd: + case INS_vpgatherqd: + case INS_vgatherdps: + case INS_vgatherqps: + case INS_vfmadd132ps: + case 
INS_vfmadd213ps: + case INS_vfmadd231ps: + case INS_vfmadd132ss: + case INS_vfmadd213ss: + case INS_vfmadd231ss: + case INS_vfmaddsub132ps: + case INS_vfmaddsub213ps: + case INS_vfmaddsub231ps: + case INS_vfmsubadd132ps: + case INS_vfmsubadd213ps: + case INS_vfmsubadd231ps: + case INS_vfmsub132ps: + case INS_vfmsub213ps: + case INS_vfmsub231ps: + case INS_vfmsub132ss: + case INS_vfmsub213ss: + case INS_vfmsub231ss: + case INS_vfnmadd132ps: + case INS_vfnmadd213ps: + case INS_vfnmadd231ps: + case INS_vfnmadd132ss: + case INS_vfnmadd213ss: + case INS_vfnmadd231ss: + case INS_vfnmsub132ps: + case INS_vfnmsub213ps: + case INS_vfnmsub231ps: + case INS_vfnmsub132ss: + case INS_vfnmsub213ss: + case INS_vfnmsub231ss: + case INS_vpdpbusd: + case INS_vpdpwssd: + case INS_vpdpbusds: + case INS_vpdpwssds: + case INS_vpermilpsvar: + { + return false; // W0 + } + default: + { + return false; // WIG + } + } +} + +//------------------------------------------------------------------------ +// HasKMaskRegisterDest: Temporary check to identify instructions that can +// be Evex encoded but require Opmask(KMask) register support. +// These are cases where for comparison instructions, result is written +// to KMask when Evex encoded. +// TODO-XArch-AVX512: Refactor once KMask is added. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// TRUE if Evex encoding requires KMAsk support. +// +bool HasKMaskRegisterDest(instruction ins) const +{ + assert(UseEvexEncoding() == true); + switch (ins) + { + // Requires KMask. + case INS_pcmpgtb: + case INS_pcmpgtd: + case INS_pcmpgtw: + case INS_pcmpgtq: + case INS_pcmpeqb: + case INS_pcmpeqd: + case INS_pcmpeqq: + case INS_pcmpeqw: + case INS_cmpps: + case INS_cmpss: + case INS_cmppd: + case INS_cmpsd: + { + return true; + } + default: + { + return false; + } + } +} + bool useVEXEncodings; bool UseVEXEncoding() const { @@ -170,6 +450,90 @@ void SetUseVEXEncoding(bool value) useVEXEncodings = value; } +// Is Evex encoding supported. +bool useEvexEncodings; +bool UseEvexEncoding() const +{ + return useEvexEncodings; +} +void SetUseEvexEncoding(bool value) +{ + useEvexEncodings = value; +} + +// 4-byte EVEX prefix starts with byte 0x62 +#define EVEX_PREFIX_MASK 0xFF00000000000000ULL +#define EVEX_PREFIX_CODE 0x6200000000000000ULL + +bool TakesEvexPrefix(instruction ins) const; + +//------------------------------------------------------------------------ +// hasEvexPrefix: Returns true if the instruction encoding already +// contains Evex prefix. +// +// Arguments: +// code - opcode + prefixes bits at some stage of encoding. +// +// Returns: +// TRUE if code has an Evex prefix. +bool hasEvexPrefix(code_t code) +{ + return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE; +} +code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); + +//------------------------------------------------------------------------ +// AddSimdPrefixIfNeeded: Add the correct SIMD prefix if required. +// +// Arguments: +// ins - the instruction being encoded. +// code - opcode + prefixes bits at some stage of encoding. +// size - operand size +// +// Returns: +// TRUE if code has an Evex prefix. 
+code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) +{ + if (TakesEvexPrefix(ins)) + { + code = AddEvexPrefix(ins, code, size); + } + else if (TakesVexPrefix(ins)) + { + code = AddVexPrefix(ins, code, size); + } + return code; +} + +//------------------------------------------------------------------------ +// hasVexOrEvexPrefix: Returns true if the instruction encoding already +// contains a Vex or Evex prefix. +// +// Arguments: +// code - opcode + prefixes bits at some stage of encoding. +// +// Returns: +// TRUE if code has a SIMD prefix. +bool hasVexOrEvexPrefix(code_t code) +{ + return (hasVexPrefix(code) || hasEvexPrefix(code)); +} + +//------------------------------------------------------------------------ +// codeEvexMigrationCheck: Temporary check to use when adding EVEX codepaths +// TODO-XArch-AVX512: Remove implementation and uses once all Evex paths are +// completed. +// +// Arguments: +// code - opcode + prefixes bits at some stage of encoding. +// +// Returns: +// TRUE if code has an Evex prefix. +bool codeEvexMigrationCheck(code_t code) +{ + return hasEvexPrefix(code); +} + bool containsAVXInstruction = false; bool ContainsAVX() { @@ -203,6 +567,7 @@ bool IsThreeOperandAVXInstruction(instruction ins) { return (IsDstDstSrcAVXInstruction(ins) || IsDstSrcSrcAVXInstruction(ins)); } + bool isAvxBlendv(instruction ins) { return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb; diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index e766df67304d8..87de5ef1eb0af 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -614,6 +614,8 @@ INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) // Scalar instructions in SSE4.2 INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_FLAGS_None) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 23fb5ee3c9ff5..ad54ce7153f58 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -297,8 +297,9 @@ CONFIG_INTEGER(AltJitAssertOnNYI, W("AltJitAssertOnNYI"), 1) // Controls the Alt CONFIG_INTEGER(EnableEHWriteThru, W("EnableEHWriteThru"), 1) // Enable the register allocator to support EH-write thru: // partial enregistration of vars exposed on EH boundaries -CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1) // Enable the enregistration of locals that are - // defined or used in a multireg context. +CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1) // Enable the enregistration of locals that are + // defined or used in a multireg context. +CONFIG_INTEGER(JitStressEVEXEncoding, W("JitStressEVEXEncoding"), 0) // Enable EVEX encoding for SIMD instructions. // clang-format off
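
For reviewers: a minimal sketch of how the pieces added here are intended to compose on the
emitOutputRRR() path once DOTNET_JitStressEVEXEncoding=1 (or the COMPlus_ prefix on older
runtimes) forces the stress mode. Function names are from this patch; the surrounding emitter
plumbing, register encoding of the other operands, and error handling are elided.

    // Sketch only; mirrors the emitOutputRRR() changes above.
    code_t code = insCodeRM(ins);                          // opcode bits from the instruction table
    code        = AddSimdPrefixIfNeeded(ins, code, size);  // EVEX if TakesEvexPrefix(), else VEX
    code        = insEncodeRMreg(ins, code);
    if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))
    {
        code = AddRexWPrefix(ins, code);                   // EVEX.W is set in non-inverted form
    }
    code = insEncodeReg3456(ins, src1, size, code);        // vvvv, stored in 1's complement form
    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);   // writes the 4-byte 62 xx xx xx prefix for EVEX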