From 028cdf5058ab44cbf7b4014272229984a2c6a8ea Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Wed, 24 Aug 2022 16:02:39 -0700 Subject: [PATCH] Adding EVEX encoding pathways for emitOutputRRR(). Adding flag to turn on EVEX encoding. --- src/coreclr/jit/compiler.cpp | 5 + src/coreclr/jit/compiler.h | 33 + src/coreclr/jit/emit.h | 1 + src/coreclr/jit/emitxarch.cpp | 997 +++++++++++++++++++++++++++--- src/coreclr/jit/emitxarch.h | 365 +++++++++++ src/coreclr/jit/instrsxarch.h | 2 + src/coreclr/jit/jitconfigvalues.h | 5 +- 7 files changed, 1324 insertions(+), 84 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index e92d45518e9a6..bc2d9bfa1c3f0 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2284,6 +2284,11 @@ void Compiler::compSetProcessor() #ifdef TARGET_XARCH if (!compIsForInlining()) { + if (canUseEvexEncoding()) + { + codeGen->GetEmitter()->SetUseEvexEncoding(true); + // TODO-XArch-AVX512: Revisit other flags to be set once avx512 instructions are added. + } if (canUseVexEncoding()) { codeGen->GetEmitter()->SetUseVEXEncoding(true); diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 51b8764a0da35..d2031af55f7ef 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -8960,6 +8960,39 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #endif } + //------------------------------------------------------------------------ + // canUseEvexEncoding - Answer the question: Is Evex encoding supported on this target. + // + // Returns: + // TRUE if Evex encoding is supported, FALSE if not. + bool canUseEvexEncoding() const + { +#ifdef TARGET_XARCH + return compOpportunisticallyDependsOn(InstructionSet_AVX512F); +#else + return false; +#endif + } + + //------------------------------------------------------------------------ + // DoJitStressEvexEncoding- Answer the question: Do we force EVEX encoding. + // + // Returns: + // TRUE if user requests EVEX encoding and it's safe, FALSE if not. + bool DoJitStressEvexEncoding() const + { +#ifdef TARGET_XARCH + // Using JitStressEVEXEncoding flag will force instructions which would + // otherwise use VEX encoding but can be EVEX encoded to use EVEX encoding + // This requires AVX512VL support. + if (JitConfig.JitStressEVEXEncoding() && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL)) + { + return true; + } +#endif + return false; + } + /* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index c5d245ba83c66..34379a6a3d05a 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -445,6 +445,7 @@ class emitter #ifdef TARGET_XARCH SetUseVEXEncoding(false); + SetUseEvexEncoding(false); #endif // TARGET_XARCH emitDataSecCur = nullptr; diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 1ef3510a75260..84d6df4e2251d 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -34,11 +34,38 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } +//------------------------------------------------------------------------ +// IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse instruction. +// +// Arguments: +// ins - The instruction to check. 
+//
+// Returns:
+// TRUE if it is an SSE, AVX or AVX512 instruction.
+bool emitter::IsAvx512OrPriorInstruction(instruction ins)
+{
+ // TODO-XArch-AVX512: Fix check once AVX512 instructions are added.
+ return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION);
+}
+
 bool emitter::IsAVXOnlyInstruction(instruction ins)
 {
 return (ins >= INS_FIRST_AVX_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION);
 }
 
+//------------------------------------------------------------------------
+// IsAvx512OnlyInstruction: Is this an AVX512 instruction?
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// TRUE if it is an AVX512F+ instruction.
+bool emitter::IsAvx512OnlyInstruction(instruction ins)
+{
+ return (ins >= INS_FIRST_AVX512_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION);
+}
+
 bool emitter::IsFMAInstruction(instruction ins)
 {
 return (ins >= INS_FIRST_FMA_INSTRUCTION) && (ins <= INS_LAST_FMA_INSTRUCTION);
@@ -128,6 +155,146 @@ bool emitter::IsAVXInstruction(instruction ins) const
 return UseVEXEncoding() && IsSSEOrAVXInstruction(ins);
 }
 
+//------------------------------------------------------------------------
+// IsAvx512Instruction: Answer the question: Can this instruction be EVEX encoded?
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// TRUE if ins can be EVEX encoded.
+bool emitter::IsAvx512Instruction(instruction ins) const
+{
+ if (!UseEvexEncoding())
+ {
+ return false;
+ }
+
+ // TODO-XArch-AVX512: Explore adding this as a flag to instr table.
+ switch (ins)
+ {
+ // No EVEX encoding exists at all.
+ case INS_pmovmskb:
+ case INS_movmskpd:
+ case INS_movmskps:
+ case INS_dppd:
+ case INS_dpps:
+ case INS_maskmovdqu:
+ case INS_haddps:
+ case INS_haddpd:
+ case INS_hsubps:
+ case INS_hsubpd:
+ case INS_addsubps:
+ case INS_addsubpd:
+ case INS_rcpps:
+ case INS_rcpss:
+ case INS_rsqrtps:
+ case INS_rsqrtss:
+ case INS_psignb:
+ case INS_psignd:
+ case INS_psignw:
+ case INS_roundps:
+ case INS_roundss:
+ case INS_roundpd:
+ case INS_roundsd:
+ case INS_blendps:
+ case INS_blendpd:
+ case INS_blendvps:
+ case INS_pblendw:
+ case INS_pblendvb:
+ case INS_blendvpd:
+ case INS_ptest:
+ case INS_phaddw:
+ case INS_phsubw:
+ case INS_phaddd:
+ case INS_phsubd:
+ case INS_phaddsw:
+ case INS_phsubsw:
+ case INS_lddqu:
+ case INS_phminposuw:
+ case INS_mpsadbw:
+ case INS_pclmulqdq:
+ case INS_aesdec:
+ case INS_aesdeclast:
+ case INS_aesenc:
+ case INS_aesenclast:
+ case INS_aesimc:
+ case INS_aeskeygenassist:
+ case INS_vzeroupper:
+ case INS_vperm2i128:
+ case INS_vperm2f128:
+ case INS_vpblendd:
+ case INS_vblendvps:
+ case INS_vblendvpd:
+ case INS_vpblendvb:
+ case INS_vtestps:
+ case INS_vtestpd:
+ case INS_vmaskmovps:
+ case INS_vmaskmovpd:
+ case INS_vpmaskmovd:
+ case INS_vpmaskmovq:
+ case INS_andn:
+ case INS_blsi:
+ case INS_blsmsk:
+ case INS_blsr:
+ case INS_bextr:
+ case INS_rorx:
+ case INS_pdep:
+ case INS_pext:
+ case INS_bzhi:
+ case INS_mulx:
+#ifdef TARGET_AMD64
+ case INS_shlx:
+ case INS_sarx:
+ case INS_shrx:
+#endif
+ case INS_lfence:
+ case INS_mfence:
+ case INS_movnti:
+ case INS_prefetchnta:
+ case INS_prefetcht0:
+ case INS_prefetcht1:
+ case INS_prefetcht2:
+ case INS_sfence:
+ // Might need new INS_*suffix* instructions for these.
+ case INS_por: // INS_pord, INS_porq.
+ case INS_pxor: // INS_pxord, INS_pxorq.
+ case INS_movdqa: // INS_movdqa32, INS_movdqa64.
+ case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64.
+ case INS_pand: // INS_pandd, INS_pandq.
+ case INS_pandn: // INS_pandnd, INS_pandnq.
+ case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2.
+ case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2.
+ case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2.
+ case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2.
+ case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2.
+ case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2.
+ {
+ return false;
+ }
+ default:
+ {
+ break;
+ }
+ }
+
+ return IsAvx512OrPriorInstruction(ins);
+}
+
+//------------------------------------------------------------------------
+// IsSimdInstruction: Answer the question: Is this a SIMD instruction?
+//
+// Arguments:
+// ins - The instruction to check.
+//
+// Returns:
+// TRUE if ins is a SIMD instruction.
+//
+bool emitter::IsSimdInstruction(instruction ins) const
+{
+ return IsAvx512Instruction(ins) || IsAVXInstruction(ins);
+}
+
 // Returns true if the AVX instruction is a binary operator that requires 3 operands.
 // When we emit an instruction with only two operands, we will duplicate the destination
 // as a source.
@@ -136,7 +303,7 @@ bool emitter::IsAVXInstruction(instruction ins) const
 // to indicate whether a 3-operand instruction.
 bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
 {
- return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && IsAVXInstruction(ins);
+ return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstDstSrcAVXInstruction) != 0) && IsSimdInstruction(ins);
 }
 
 // Returns true if the AVX instruction requires 3 operands that duplicate the source
@@ -146,7 +313,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins)
 // to indicate whether a 3-operand instruction.
 bool emitter::IsDstSrcSrcAVXInstruction(instruction ins)
 {
- return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && IsAVXInstruction(ins);
+ return ((CodeGenInterface::instInfo[ins] & INS_Flags_IsDstSrcSrcAVXInstruction) != 0) && IsSimdInstruction(ins);
 }
 
 //------------------------------------------------------------------------
@@ -579,6 +746,97 @@ bool emitter::Is4ByteSSEInstruction(instruction ins)
 return !UseVEXEncoding() && EncodedBySSE38orSSE3A(ins);
 }
 
+//------------------------------------------------------------------------
+// TakesEvexPrefix: Checks if the instruction should be EVEX encoded.
+// TODO-XArch-AVX512: This check needs to be updated once AVX512 instructions are added.
+// Eventually, this should evolve to return 'TRUE' for the following cases:
+// - JitConfig.JitStressEVEXEncoding flag is set.
+// - Is a new AVX512 instruction.
+// - Uses ZMM vector registers.
+// - Uses upper 128-bit or 256-bit registers for an AVX512VL ins.
+// - Uses operand mask encoding: 64-bit opmask registers k0-k7 for conditional execution and merging of destination
+// operands.
+// - Needs to encode functionality specific to instruction classes (e.g., embedded broadcast, embedded rounding
+// control, etc.)
+//
+// Arguments:
+// instruction -- processor instruction to check
+//
+// Return Value:
+// true if this instruction requires an EVEX prefix.
+//
+bool emitter::TakesEvexPrefix(instruction ins) const
+{
+ if (!emitComp->DoJitStressEvexEncoding())
+ {
+ return false;
+ }
+
+ // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added.
+ return IsAvx512Instruction(ins) && !HasKMaskRegisterDest(ins);
+}
+
+// Add base EVEX prefix without setting W, R, X, or B bits.
+// L'L bits will be set based on emitter attr.
+//
+// 4-byte EVEX prefix = 62
+// - R, X, B, W - bits to express corresponding REX prefixes. Additionally, X combines with B to expand r/m to 32
+// SIMD registers
+// - R' - combines with R to expand reg to 32 SIMD registers
+// - mm - lower 2 bits of m-mmmmm (5-bit) in corresponding VEX prefix
+// - vvvv (4-bits) - register specifier in 1's complement form; must be 1111 if unused
+// - pp (2-bits) - opcode extension providing equivalent functionality of a SIMD size prefix;
+// these prefixes are treated as mandatory when used with escape opcode 0Fh for
+// some SIMD instructions
+// 00 - None (0F - packed float)
+// 01 - 66 (66 0F - packed double)
+// 10 - F3 (F3 0F - scalar float)
+// 11 - F2 (F2 0F - scalar double)
+// - z - bit to specify merging mode
+// - L - scalar or AVX-128 bit operations (L=0), 256-bit operations (L=1)
+// - L'- bit to support 512-bit operations or rounding control mode
+// - b - broadcast/rc/sae context
+// - V'- bit to extend vvvv
+// - aaa - specifies mask register
+// Rest - reserved for future use; using them will result in an Undefined instruction exception.
+#define DEFAULT_BYTE_EVEX_PREFIX 0x62F07C0800000000ULL
+
+#define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL
+#define LBIT_IN_BYTE_EVEX_PREFIX 0x0000002000000000ULL
+
+//------------------------------------------------------------------------
+// AddEvexPrefix: Add default EVEX prefix with only the L'L bits set.
+//
+// Arguments:
+// ins -- processor instruction to check.
+// code -- opcode bits.
+// attr -- operand size
+//
+// Return Value:
+// encoded code with EVEX prefix.
+//
+emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr attr)
+{
+ // Only AVX512 instructions require an EVEX prefix.
+ assert(IsAvx512Instruction(ins));
+
+ // Shouldn't have already added the EVEX prefix.
+ assert(!hasEvexPrefix(code));
+
+ assert((code & DEFAULT_BYTE_EVEX_PREFIX_MASK) == 0);
+
+ code |= DEFAULT_BYTE_EVEX_PREFIX;
+
+ // TODO-XArch-AVX512: Add EA_64BYTE once ZMM is supported.
+ if (attr == EA_32BYTE)
+ {
+ // Set L bit to 1 in case of instructions that operate on 256-bits.
+ code |= LBIT_IN_BYTE_EVEX_PREFIX;
+ }
+ return code;
+}
+
 // Returns true if this instruction requires a VEX prefix
 // All AVX instructions require a VEX prefix
 bool emitter::TakesVexPrefix(instruction ins) const
@@ -856,6 +1114,17 @@ unsigned RegEncoding(regNumber reg)
 // AVX: specific bits within VEX prefix need to be set in bit-inverted form.
 emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 {
+ if (UseEvexEncoding() && IsAvx512Instruction(ins))
+ {
+ if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
+ {
+ // W-bit is available in the 4-byte EVEX prefix that starts with byte 62.
+ assert(hasEvexPrefix(code));
+
+ // W-bit is the only bit that is added in non bit-inverted form.
+ return emitter::code_t(code | 0x0000800000000000ULL);
+ }
+ }
 if (UseVEXEncoding() && IsAVXInstruction(ins))
 {
 if (TakesVexPrefix(ins))
@@ -876,82 +1145,460 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
 }
 
 #ifdef TARGET_AMD64
-
-emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
-{
- if (UseVEXEncoding() && IsAVXInstruction(ins))
+
+emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
+{
+ if (UseEvexEncoding() && IsAvx512Instruction(ins))
+ {
+ if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
+ { + // R-bit is available in 4-byte EVEX prefix that starts with byte 62. + assert(hasEvexPrefix(code)); + + // R-bit is added in bit-inverted form. + return code & 0xFF7FFFFFFFFFFFFFULL; + } + } + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // R-bit is supported by both 2-byte and 3-byte VEX prefix + assert(hasVexPrefix(code)); + + // R-bit is added in bit-inverted form. + return code & 0xFF7FFFFFFFFFFFULL; + } + } + + return code | 0x4400000000ULL; +} + +emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) +{ + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // X-bit is available only in 3-byte VEX prefix that starts with byte C4. + assert(hasVexPrefix(code)); + + // X-bit is added in bit-inverted form. + return code & 0xFFBFFFFFFFFFFFULL; + } + } + + return code | 0x4200000000ULL; +} + +emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) +{ + if (UseEvexEncoding() && IsAvx512Instruction(ins)) + { + if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + { + // B-bit is available in 4-byte EVEX prefix that starts with byte 62. + assert(hasEvexPrefix(code)); + + // B-bit is added in bit-inverted form. + return code & 0xFFDFFFFFFFFFFFFFULL; + } + } + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + // B-bit is available only in 3-byte VEX prefix that starts with byte C4. + assert(hasVexPrefix(code)); + + // B-bit is added in bit-inverted form. + return code & 0xFFDFFFFFFFFFFFULL; + } + } + + return code | 0x4100000000ULL; +} + +// Adds REX prefix (0x40) without W, R, X or B bits set +emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) +{ + assert(!UseVEXEncoding() || !IsAVXInstruction(ins)); + assert(!UseEvexEncoding() || !IsAvx512Instruction(ins)); + return code | 0x4000000000ULL; +} + +#endif // TARGET_AMD64 + +bool isPrefix(BYTE b) +{ + assert(b != 0); // Caller should check this + assert(b != 0x67); // We don't use the address size prefix + assert(b != 0x65); // The GS segment override prefix is emitted separately + assert(b != 0x64); // The FS segment override prefix is emitted separately + assert(b != 0xF0); // The lock prefix is emitted separately + assert(b != 0x2E); // We don't use the CS segment override prefix + assert(b != 0x3E); // Or the DS segment override prefix + assert(b != 0x26); // Or the ES segment override prefix + assert(b != 0x36); // Or the SS segment override prefix + + // That just leaves the size prefixes used in SSE opcodes: + // Scalar Double Scalar Single Packed Double + return ((b == 0xF2) || (b == 0xF3) || (b == 0x66)); +} + +//------------------------------------------------------------------------ +// emitOutputSimdPrefixIfNeeded: Outputs EVEX prefix (in case of AVX512 instructions), +// VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise. +// +// Arguments: +// ins -- processor instruction to check. +// dst -- buffer to write prefix to. +// code -- opcode bits. +// attr -- operand size +// +// Return Value: +// Size of prefix. +// +unsigned emitter::emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code) +{ + // TODO-XArch-AVX512: Remove redundant code and collapse into single pathway for EVEX and VEX if possible. 
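+ // Worked example (illustrative only; values follow from AddEvexPrefix above): EVEX-encoding
+ // "paddd" (66 0F FE /r) arrives here with the default prefix bytes 62 F0 7C 08 in the upper
+ // half of 'code' and 66 0F FE in the lower half. The EVEX branch below folds the 0x66 size
+ // prefix into pp=01 and the 0x0F escape into mm=01; with no register bits set, the four
+ // prefix bytes emitted are 62 F1 7D 08, followed by the remaining FE opcode byte and ModRM.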
+ if (hasEvexPrefix(code)) + { + // Only AVX512 instructions should have an EVEX prefix + assert(IsAvx512Instruction(ins)); + + code_t evexPrefix = (code >> 32) & 0xFFFFFFFF; + code &= 0x00000000FFFFFFFFLL; + + WORD leadingBytes = 0; + BYTE check = (code >> 24) & 0xFF; + if (check != 0) + { + // check for a prefix in the 11 position + BYTE sizePrefix = (code >> 16) & 0xFF; + if ((sizePrefix != 0) && isPrefix(sizePrefix)) + { + // 'pp' bits in byte 1 of EVEX prefix allows us to encode SIMD size prefixes as two bits + // + // 00 - None (0F - packed float) + // 01 - 66 (66 0F - packed double) + // 10 - F3 (F3 0F - scalar float + // 11 - F2 (F2 0F - scalar double) + switch (sizePrefix) + { + case 0x66: + // None of the existing BMI instructions should be EVEX encoded. + assert(!IsBMIInstruction(ins)); + evexPrefix |= (0x01 << 8); + break; + case 0xF3: + evexPrefix |= (0x02 << 8); + break; + case 0xF2: + evexPrefix |= (0x03 << 8); + break; + default: + assert(!"unrecognized SIMD size prefix"); + unreached(); + } + + // Now the byte in the 22 position must be an escape byte 0F + leadingBytes = check; + assert(leadingBytes == 0x0F); + + // Get rid of both sizePrefix and escape byte + code &= 0x0000FFFFLL; + + // Check the byte in the 33 position to see if it is 3A or 38. + // In such a case escape bytes must be 0x0F3A or 0x0F38 + check = code & 0xFF; + if (check == 0x3A || check == 0x38) + { + leadingBytes = (leadingBytes << 8) | check; + code &= 0x0000FF00LL; + } + } + } + else + { + // 2-byte opcode with the bytes ordered as 0x0011RM22 + // the byte in position 11 must be an escape byte. + leadingBytes = (code >> 16) & 0xFF; + assert(leadingBytes == 0x0F || leadingBytes == 0x00); + code &= 0xFFFF; + } + + // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38 + // mm bits in byte 0 of EVEX prefix allows us to encode these + // implied leading bytes. 
They are identical to low two bits of VEX.mmmmm + + switch (leadingBytes) + { + case 0x00: + // there is no leading byte + break; + case 0x0F: + evexPrefix |= (0x01 << 16); + break; + case 0x0F38: + evexPrefix |= (0x02 << 16); + break; + case 0x0F3A: + evexPrefix |= (0x03 << 16); + break; + default: + assert(!"encountered unknown leading bytes"); + unreached(); + } + + // At this point + // EVEX.2211RM33 got transformed as EVEX.0000RM33 + // EVEX.0011RM22 got transformed as EVEX.0000RM22 + // + // Now output EVEX prefix leaving the 4-byte opcode + // EVEX prefix is always 4 bytes + + emitOutputByte(dst, ((evexPrefix >> 24) & 0xFF)); + emitOutputByte(dst + 1, ((evexPrefix >> 16) & 0xFF)); + emitOutputByte(dst + 2, (evexPrefix >> 8) & 0xFF); + emitOutputByte(dst + 3, evexPrefix & 0xFF); + return 4; + } + else if (hasVexPrefix(code)) + { + // Only AVX instructions should have a VEX prefix + assert(UseVEXEncoding() && IsAVXInstruction(ins)); + code_t vexPrefix = (code >> 32) & 0x00FFFFFF; + code &= 0x00000000FFFFFFFFLL; + + WORD leadingBytes = 0; + BYTE check = (code >> 24) & 0xFF; + if (check != 0) + { + // 3-byte opcode: with the bytes ordered as 0x2211RM33 or + // 4-byte opcode: with the bytes ordered as 0x22114433 + // check for a prefix in the 11 position + BYTE sizePrefix = (code >> 16) & 0xFF; + if ((sizePrefix != 0) && isPrefix(sizePrefix)) + { + // 'pp' bits in byte2 of VEX prefix allows us to encode SIMD size prefixes as two bits + // + // 00 - None (0F - packed float) + // 01 - 66 (66 0F - packed double) + // 10 - F3 (F3 0F - scalar float + // 11 - F2 (F2 0F - scalar double) + switch (sizePrefix) + { + case 0x66: + if (IsBMIInstruction(ins)) + { + switch (ins) + { + case INS_rorx: + case INS_pdep: + case INS_mulx: +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_shrx: +#endif + { + vexPrefix |= 0x03; + break; + } + + case INS_pext: +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_sarx: +#endif + { + vexPrefix |= 0x02; + break; + } +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_shlx: + { + vexPrefix |= 0x01; + break; + } +#endif + default: + { + vexPrefix |= 0x00; + break; + } + } + } + else + { + vexPrefix |= 0x01; + } + break; + case 0xF3: + vexPrefix |= 0x02; + break; + case 0xF2: + vexPrefix |= 0x03; + break; + default: + assert(!"unrecognized SIMD size prefix"); + unreached(); + } + + // Now the byte in the 22 position must be an escape byte 0F + leadingBytes = check; + assert(leadingBytes == 0x0F); + + // Get rid of both sizePrefix and escape byte + code &= 0x0000FFFFLL; + + // Check the byte in the 33 position to see if it is 3A or 38. + // In such a case escape bytes must be 0x0F3A or 0x0F38 + check = code & 0xFF; + if (check == 0x3A || check == 0x38) + { + leadingBytes = (leadingBytes << 8) | check; + code &= 0x0000FF00LL; + } + } + } + else + { + // 2-byte opcode with the bytes ordered as 0x0011RM22 + // the byte in position 11 must be an escape byte. + leadingBytes = (code >> 16) & 0xFF; + assert(leadingBytes == 0x0F || leadingBytes == 0x00); + code &= 0xFFFF; + } + + // If there is an escape byte it must be 0x0F or 0x0F3A or 0x0F38 + // m-mmmmm bits in byte 1 of VEX prefix allows us to encode these + // implied leading bytes. 0x0F is supported by both the 2-byte and + // 3-byte encoding. While 0x0F3A and 0x0F38 are only supported by + // the 3-byte version. 
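+ // For example (illustrative): "vandps" (0F 54 /r) has leadingBytes == 0x0F and so uses
+ // m-mmmm == 00001 (the 2-byte VEX form remains possible), while "vpshufb" (66 0F 38 00 /r)
+ // has leadingBytes == 0x0F38 and must use m-mmmm == 00010 with the 3-byte VEX form.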
+ + switch (leadingBytes) + { + case 0x00: + // there is no leading byte + break; + case 0x0F: + vexPrefix |= 0x0100; + break; + case 0x0F38: + vexPrefix |= 0x0200; + break; + case 0x0F3A: + vexPrefix |= 0x0300; + break; + default: + assert(!"encountered unknown leading bytes"); + unreached(); + } + + // At this point + // VEX.2211RM33 got transformed as VEX.0000RM33 + // VEX.0011RM22 got transformed as VEX.0000RM22 + // + // Now output VEX prefix leaving the 4-byte opcode + + // The 2-byte VEX encoding, requires that the X and B-bits are set (these + // bits are inverted from the REX values so set means off), the W-bit is + // not set (this bit is not inverted), and that the m-mmmm bits are 0-0001 + // (the 2-byte VEX encoding only supports the 0x0F leading byte). When these + // conditions are met, we can change byte-0 from 0xC4 to 0xC5 and then + // byte-1 is the logical-or of bit 7 from byte-1 and bits 0-6 from byte 2 + // from the 3-byte VEX encoding. + // + // Given the above, the check can be reduced to a simple mask and comparison. + // * 0xFFFF7F80 is a mask that ignores any bits whose value we don't care about: + // * R can be set or unset (0x7F ignores bit 7) + // * vvvv can be any value (0x80 ignores bits 3-6) + // * L can be set or unset (0x80 ignores bit 2) + // * pp can be any value (0x80 ignores bits 0-1) + // * 0x00C46100 is a value that signifies the requirements listed above were met: + // * We must be a three-byte VEX opcode (0x00C4) + // * X and B must be set (0x61 validates bits 5-6) + // * m-mmmm must be 0-00001 (0x61 validates bits 0-4) + // * W must be unset (0x00 validates bit 7) + if ((vexPrefix & 0xFFFF7F80) == 0x00C46100) + { + // Encoding optimization calculation is not done while estimating the instruction + // size and thus over-predict instruction size by 1 byte. + // If there are IGs that will be aligned, do not optimize encoding so the + // estimated alignment sizes are accurate. + if (emitCurIG->igNum > emitLastAlignedIgNum) + { + emitOutputByte(dst, 0xC5); + emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F)); + return 2; + } + } + + emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF)); + emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0xFF)); + emitOutputByte(dst + 2, vexPrefix & 0xFF); + return 3; + } + +#ifdef TARGET_AMD64 + if (code > 0x00FFFFFFFFLL) { - if (TakesVexPrefix(ins)) - { - // R-bit is supported by both 2-byte and 3-byte VEX prefix - assert(hasVexPrefix(code)); - - // R-bit is added in bit-inverted form. - return code & 0xFF7FFFFFFFFFFFULL; - } - } + BYTE prefix = (code >> 32) & 0xFF; + noway_assert(prefix >= 0x40 && prefix <= 0x4F); + code &= 0x00000000FFFFFFFFLL; - return code | 0x4400000000ULL; -} + // TODO-AMD64-Cleanup: when we remove the prefixes (just the SSE opcodes right now) + // we can remove this code as well -emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) -{ - if (UseVEXEncoding() && IsAVXInstruction(ins)) - { - if (TakesVexPrefix(ins)) + // The REX prefix is required to come after all other prefixes. + // Some of our 'opcodes' actually include some prefixes, if that + // is the case, shift them over and place the REX prefix after + // the other prefixes, and emit any prefix that got moved out. + BYTE check = (code >> 24) & 0xFF; + if (check == 0) { - // X-bit is available only in 3-byte VEX prefix that starts with byte C4. - assert(hasVexPrefix(code)); - - // X-bit is added in bit-inverted form. 
- return code & 0xFFBFFFFFFFFFFFULL; + // 3-byte opcode: with the bytes ordered as 0x00113322 + // check for a prefix in the 11 position + check = (code >> 16) & 0xFF; + if (check != 0 && isPrefix(check)) + { + // Swap the rex prefix and whatever this prefix is + code = (((DWORD)prefix << 16) | (code & 0x0000FFFFLL)); + // and then emit the other prefix + return emitOutputByte(dst, check); + } } - } - - return code | 0x4200000000ULL; -} - -emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) -{ - if (UseVEXEncoding() && IsAVXInstruction(ins)) - { - if (TakesVexPrefix(ins)) + else { - // B-bit is available only in 3-byte VEX prefix that starts with byte C4. - assert(hasVexPrefix(code)); - - // B-bit is added in bit-inverted form. - return code & 0xFFDFFFFFFFFFFFULL; + // 4-byte opcode with the bytes ordered as 0x22114433 + // first check for a prefix in the 11 position + BYTE check2 = (code >> 16) & 0xFF; + if (isPrefix(check2)) + { + assert(!isPrefix(check)); // We currently don't use this, so it is untested + if (isPrefix(check)) + { + // 3 prefixes were rex = rr, check = c1, check2 = c2 encoded as 0xrrc1c2XXXX + // Change to c2rrc1XXXX, and emit check2 now + code = (((code_t)prefix << 24) | ((code_t)check << 16) | (code & 0x0000FFFFLL)); + } + else + { + // 2 prefixes were rex = rr, check2 = c2 encoded as 0xrrXXc2XXXX, (check is part of the opcode) + // Change to c2XXrrXXXX, and emit check2 now + code = (((code_t)check << 24) | ((code_t)prefix << 16) | (code & 0x0000FFFFLL)); + } + return emitOutputByte(dst, check2); + } } - } - - return code | 0x4100000000ULL; -} - -// Adds REX prefix (0x40) without W, R, X or B bits set -emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) -{ - assert(!UseVEXEncoding() || !IsAVXInstruction(ins)); - return code | 0x4000000000ULL; -} + return emitOutputByte(dst, prefix); + } #endif // TARGET_AMD64 -bool isPrefix(BYTE b) -{ - assert(b != 0); // Caller should check this - assert(b != 0x67); // We don't use the address size prefix - assert(b != 0x65); // The GS segment override prefix is emitted separately - assert(b != 0x64); // The FS segment override prefix is emitted separately - assert(b != 0xF0); // The lock prefix is emitted separately - assert(b != 0x2E); // We don't use the CS segment override prefix - assert(b != 0x3E); // Or the DS segment override prefix - assert(b != 0x26); // Or the ES segment override prefix - assert(b != 0x36); // Or the SS segment override prefix - - // That just leaves the size prefixes used in SSE opcodes: - // Scalar Double Scalar Single Packed Double - return ((b == 0xF2) || (b == 0xF3) || (b == 0x66)); + return 0; } // Outputs VEX prefix (in case of AVX instructions) and REX.R/X/W/B otherwise. @@ -1236,6 +1883,27 @@ unsigned emitter::emitGetRexPrefixSize(instruction ins) return 1; } +//------------------------------------------------------------------------ +// emitGetEvexPrefixSize: Gets Size of Evex prefix in bytes +// TODO-XArch-AVX512: Once evex encoding is supported fully, this will take the place of emitGetAdjustedSize() +// +// Arguments: +// ins -- The instruction +// +// Returns: +// Prefix size in bytes. +// +unsigned emitter::emitGetEvexPrefixSize(instruction ins) +{ + if (IsAvx512Instruction(ins)) + { + return 4; + } + + // If not AVX512, then we don't need to encode prefix. 
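+ // VEX-encodable instructions get their prefix size from emitGetVexPrefixSize() instead.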
+ return 0;
+}
+
 // Size of vex prefix in bytes
 unsigned emitter::emitGetVexPrefixSize(instruction ins, emitAttr attr)
 {
@@ -1248,6 +1916,143 @@
+//------------------------------------------------------------------------
+// emitGetAdjustedSizeEvexAware: Determines any size adjustment needed for a given instruction based on the current
+// configuration.
+// TODO-XArch-AVX512: Once EVEX encoding is supported fully, this will take the place of emitGetAdjustedSize().
+//
+// Arguments:
+// ins -- The instruction being emitted
+// attr -- The emit attribute
+// code -- The current opcode and any known prefixes
+//
+// Returns:
+// Updated size.
+//
+unsigned emitter::emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, code_t code)
+{
+ unsigned adjustedSize = 0;
+
+ // TODO-XArch-AVX512: Remove redundant code and possibly collapse EVEX and VEX into a single pathway.
+ // IsAvx512Instruction(ins) is TRUE for AVX/SSE instructions as well, which need to be VEX encoded unless EVEX
+ // is explicitly requested.
+ if (IsAvx512Instruction(ins) && TakesEvexPrefix(ins))
+ {
+ // The EVEX prefix encodes some bytes of the opcode, so the overall size of the instruction shrinks.
+ // Therefore, estimating the size by adding the EVEX prefix size to the size of the instruction opcode
+ // bytes will always overestimate.
+ // Instead, this routine adjusts the size of the EVEX prefix based on the number of opcode bytes it
+ // encodes, so that the instruction size estimate is accurate.
+ // Basically this will decrease the evexPrefixSize, so that opcodeSize + evexPrefixAdjustedSize will be the
+ // right size.
+ //
+ // rightOpcodeSize + evexPrefixSize
+ // = (opcodeSize - ExtrabytesSize) + evexPrefixSize
+ // = opcodeSize + (evexPrefixSize - ExtrabytesSize)
+ // = opcodeSize + evexPrefixAdjustedSize
+
+ unsigned evexPrefixAdjustedSize = emitGetEvexPrefixSize(ins);
+ assert(evexPrefixAdjustedSize == 4);
+
+ // In this case, the opcode will contain at least one escape byte,
+ // so evexPrefixAdjustedSize should be reduced by one.
+ evexPrefixAdjustedSize -= 1;
+
+ // Get the fourth byte in the opcode.
+ // If this byte is non-zero, then we should check whether the opcode contains a SIMD prefix or not.
+ BYTE check = (code >> 24) & 0xFF;
+ if (check != 0)
+ {
+ // 3-byte opcode: with the bytes ordered as 0x2211RM33 or
+ // 4-byte opcode: with the bytes ordered as 0x22114433
+ // SIMD prefix is at the first byte.
+ BYTE sizePrefix = (code >> 16) & 0xFF;
+ if (sizePrefix != 0 && isPrefix(sizePrefix))
+ {
+ evexPrefixAdjustedSize -= 1;
+ }
+
+ // If the opcode size is 4 bytes, then the second escape prefix is at the fourth byte in the opcode.
+ // But in this case the opcode has not counted the R\M part.
+ // opcodeSize + evexPrefixAdjustedSize - ExtraEscapePrefixSize + ModR\MSize
+ // = opcodeSize + evexPrefixAdjustedSize - 1 + 1
+ // = opcodeSize + evexPrefixAdjustedSize
+ // So although we may have a second escape byte prefix, we won't decrease evexPrefixAdjustedSize.
+ }
+
+ adjustedSize = evexPrefixAdjustedSize;
+ }
+ else if (IsAVXInstruction(ins))
+ {
+ // The VEX prefix encodes some bytes of the opcode, so the overall size of the instruction shrinks.
+ // Therefore, estimating the size by adding the VEX prefix size to the size of the instruction opcode
+ // bytes will always overestimate.
+ // Instead, this routine adjusts the size of the VEX prefix based on the number of opcode bytes it
+ // encodes, so that the instruction size estimate is accurate.
+ // Basically this will decrease the vexPrefixSize, so that opcodeSize + vexPrefixAdjustedSize will be the
+ // right size.
+ //
+ // rightOpcodeSize + vexPrefixSize
+ // = (opcodeSize - ExtrabytesSize) + vexPrefixSize
+ // = opcodeSize + (vexPrefixSize - ExtrabytesSize)
+ // = opcodeSize + vexPrefixAdjustedSize
+
+ unsigned simdPrefixAdjustedSize = emitGetVexPrefixSize(ins, attr);
+ assert(simdPrefixAdjustedSize == 3);
+
+ // In this case, the opcode will contain at least one escape byte,
+ // so simdPrefixAdjustedSize should be reduced by one.
+ simdPrefixAdjustedSize -= 1;
+
+ // Get the fourth byte in the opcode.
+ // If this byte is non-zero, then we should check whether the opcode contains a SIMD prefix or not.
+ BYTE check = (code >> 24) & 0xFF;
+ if (check != 0)
+ {
+ // 3-byte opcode: with the bytes ordered as 0x2211RM33 or
+ // 4-byte opcode: with the bytes ordered as 0x22114433
+ // SIMD prefix is at the first byte.
+ BYTE sizePrefix = (code >> 16) & 0xFF;
+ if (sizePrefix != 0 && isPrefix(sizePrefix))
+ {
+ simdPrefixAdjustedSize -= 1;
+ }
+
+ // If the opcode size is 4 bytes, then the second escape prefix is at the fourth byte in the opcode.
+ // But in this case the opcode has not counted the R\M part.
+ // opcodeSize + simdPrefixAdjustedSize - ExtraEscapePrefixSize + ModR\MSize
+ // = opcodeSize + simdPrefixAdjustedSize - 1 + 1
+ // = opcodeSize + simdPrefixAdjustedSize
+ // So although we may have a second escape byte prefix, we won't decrease simdPrefixAdjustedSize.
+ }
+
+ adjustedSize = simdPrefixAdjustedSize;
+ }
+ else if (Is4ByteSSEInstruction(ins))
+ {
+ // The 4-byte SSE instructions require one additional byte to hold the ModRM byte.
+ adjustedSize++;
+ }
+ else
+ {
+ if (ins == INS_crc32)
+ {
+ // Adjust code size for CRC32 that has a 4-byte opcode but does not use SSE38 or SSE3A encoding.
+ adjustedSize++;
+ }
+
+ if ((attr == EA_2BYTE) && (ins != INS_movzx) && (ins != INS_movsx))
+ {
+ // Most 16-bit operand instructions will need a 0x66 prefix.
+ adjustedSize++;
+ }
+ }
+
+ return adjustedSize;
+}
+
 //------------------------------------------------------------------------
 // emitGetAdjustedSize: Determines any size adjustment needed for a given instruction based on the current
 // configuration.
@@ -1340,6 +2145,11 @@ unsigned emitter::emitGetAdjustedSize(instruction ins, emitAttr attr, code_t cod
 //
 unsigned emitter::emitGetPrefixSize(code_t code, bool includeRexPrefixSize)
 {
+ if (hasEvexPrefix(code))
+ {
+ return 4;
+ }
+
 if (hasVexPrefix(code))
 {
 return 3;
@@ -1839,14 +2649,14 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt
 
 /***********************************************************************************
  *
- * Returns modified AVX opcode with the specified register encoded in bits 3-6 of
- * byte 2 of VEX prefix.
+ * Returns modified SIMD opcode with the specified register encoded in bits 3-6 of
+ * byte 2 of the VEX or EVEX prefix.
*/ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code) { assert(reg < REG_STK); - assert(IsAVXInstruction(ins)); - assert(hasVexPrefix(code)); + assert(IsSimdInstruction(ins)); + assert(hasVexOrEvexPrefix(code)); // Get 4-bit register encoding // RegEncoding() gives lower 3 bits @@ -1857,10 +2667,30 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, regBits |= 0x08; } - // VEX prefix encodes register operand in 1's complement form - // Shift count = 4-bytes of opcode + 0-2 bits + // Both prefix encodes register operand in 1's complement form assert(regBits <= 0xF); - regBits <<= 35; + if (UseEvexEncoding() && IsAvx512Instruction(ins)) + { + if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) + { + assert(hasEvexPrefix(code) && TakesEvexPrefix(ins)); + + // Shift count = 5-bytes of opcode + 0-2 bits for EVEX + regBits <<= 43; + return code ^ regBits; + } + } + if (UseVEXEncoding() && IsAVXInstruction(ins)) + { + if (TakesVexPrefix(ins)) + { + assert(hasVexPrefix(code)); + + // Shift count = 4-bytes of opcode + 0-2 bits for VEX + regBits <<= 35; + return code ^ regBits; + } + } return code ^ regBits; } @@ -2168,7 +2998,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id, code_t code) instruction ins = id->idIns(); emitAttr attr = id->idOpSize(); - UNATIVE_OFFSET sz = emitGetAdjustedSize(ins, attr, code); + UNATIVE_OFFSET sz = emitGetAdjustedSizeEvexAware(ins, attr, code); bool includeRexPrefixSize = true; // REX prefix @@ -12847,7 +13677,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) code_t code; instruction ins = id->idIns(); - assert(IsAVXInstruction(ins)); + assert(IsSimdInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins)); regNumber targetReg = id->idReg1(); regNumber src1 = id->idReg2(); @@ -12855,10 +13685,13 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) emitAttr size = id->idOpSize(); code = insCodeRM(ins); - code = AddVexPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(ins, code, size); + code = insEncodeRMreg(ins, code); - if (TakesRexWPrefix(ins, size)) + // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. + // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) { code = AddRexWPrefix(ins, code); } @@ -12868,8 +13701,8 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) // encode source operand reg in 'vvvv' bits in 1's complement form code = insEncodeReg3456(ins, src1, size, code); - // Output the REX prefix - dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); + // Output the REX/VEX/EVEX prefix + dst += emitOutputSimdPrefixIfNeeded(ins, dst, code); // Is this a 'big' opcode? 
if (code & 0xFF000000) @@ -12892,7 +13725,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) else if ((code & 0xFF) == 0x00) { // This case happens for AVX instructions only - assert(IsAVXInstruction(ins)); + assert(IsSimdInstruction(ins)); dst += emitOutputByte(dst, (code >> 8) & 0xFF); dst += emitOutputByte(dst, (0xC0 | regCode)); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 7f8b1bd85bbb0..644fed77da824 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -65,11 +65,14 @@ BYTE* emitOutputRRR(BYTE* dst, instrDesc* id); BYTE* emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* id); +unsigned emitOutputSimdPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code); unsigned emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, code_t& code); unsigned emitGetRexPrefixSize(instruction ins); unsigned emitGetVexPrefixSize(instruction ins, emitAttr attr); +unsigned emitGetEvexPrefixSize(instruction ins); unsigned emitGetPrefixSize(code_t code, bool includeRexPrefixSize); unsigned emitGetAdjustedSize(instruction ins, emitAttr attr, code_t code); +unsigned emitGetAdjustedSizeEvexAware(instruction ins, emitAttr attr, code_t code); unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code); unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code); @@ -86,7 +89,9 @@ unsigned insSSval(unsigned scale); static bool IsSSEInstruction(instruction ins); static bool IsSSEOrAVXInstruction(instruction ins); +static bool IsAvx512OrPriorInstruction(instruction ins); static bool IsAVXOnlyInstruction(instruction ins); +static bool IsAvx512OnlyInstruction(instruction ins); static bool IsFMAInstruction(instruction ins); static bool IsAVXVNNIInstruction(instruction ins); static bool IsBMIInstruction(instruction ins); @@ -94,6 +99,9 @@ static bool IsBMIInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); bool IsAVXInstruction(instruction ins) const; +bool IsAvx512Instruction(instruction ins) const; +bool IsSimdInstruction(instruction ins) const; + code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); code_t AddRexWPrefix(instruction ins, code_t code); @@ -160,6 +168,278 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr return code; } +//------------------------------------------------------------------------ +// IsWEvexOpcodeExtension: Some instructions use W bit as an opcode extension bit. +// Identify instructions which requires W bit to be set to 1 +// for Evex encoding. +// TODO-XArch-AVX512: Explore adding this as a flag to instr table. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// TRUE if W bit needs to be set to 1. 
+// +bool IsWEvexOpcodeExtension(instruction ins) +{ + if (!TakesEvexPrefix(ins)) + { + return false; + } + + switch (ins) + { + case INS_movq: + case INS_addpd: + case INS_addsd: + case INS_movsd: + case INS_movsdsse2: + case INS_mulsd: + case INS_mulpd: + case INS_movntpd: + case INS_movlpd: + case INS_movhpd: + case INS_movapd: + case INS_movupd: + case INS_shufpd: + case INS_subsd: + case INS_subpd: + case INS_minsd: + case INS_minpd: + case INS_divsd: + case INS_divpd: + case INS_maxsd: + case INS_maxpd: + case INS_xorpd: + case INS_andpd: + case INS_sqrtsd: + case INS_sqrtpd: + case INS_andnpd: + case INS_orpd: + case INS_cvtpd2ps: + case INS_cvtsd2ss: + case INS_cvtpd2dq: + case INS_cvttpd2dq: + case INS_comisd: + case INS_ucomisd: + case INS_paddq: + case INS_psubq: + case INS_pmuludq: + case INS_psllq: + case INS_psrlq: + case INS_punpckhqdq: + case INS_punpcklqdq: + case INS_unpckhpd: + case INS_pmuldq: + case INS_movddup: + case INS_pinsrq: + case INS_pextrq: + case INS_vpbroadcastq: + case INS_vpermq: + case INS_vpsrlvq: + case INS_vpsllvq: + case INS_vpermilpd: + case INS_vpermpd: + case INS_vpgatherdq: + case INS_vpgatherqq: + case INS_vgatherdpd: + case INS_vgatherqpd: + case INS_vfmadd132pd: + case INS_vfmadd213pd: + case INS_vfmadd231pd: + case INS_vfmadd132sd: + case INS_vfmadd213sd: + case INS_vfmadd231sd: + case INS_vfmaddsub132pd: + case INS_vfmaddsub213pd: + case INS_vfmaddsub231pd: + case INS_vfmsubadd132pd: + case INS_vfmsubadd213pd: + case INS_vfmsubadd231pd: + case INS_vfmsub132pd: + case INS_vfmsub213pd: + case INS_vfmsub231pd: + case INS_vfmsub132sd: + case INS_vfmsub213sd: + case INS_vfmsub231sd: + case INS_vfnmadd132pd: + case INS_vfnmadd213pd: + case INS_vfnmadd231pd: + case INS_vfnmadd132sd: + case INS_vfnmadd213sd: + case INS_vfnmadd231sd: + case INS_vfnmsub132pd: + case INS_vfnmsub213pd: + case INS_vfnmsub231pd: + case INS_vfnmsub132sd: + case INS_vfnmsub213sd: + case INS_vfnmsub231sd: + case INS_unpcklpd: + case INS_vpermilpdvar: + { + return true; // W1 + } + case INS_movd: + case INS_punpckldq: + case INS_movntdq: + case INS_movntps: + case INS_movlps: + case INS_movhps: + case INS_movss: + case INS_movaps: + case INS_movups: + case INS_movhlps: + case INS_movlhps: + case INS_unpckhps: + case INS_unpcklps: + case INS_shufps: + case INS_punpckhdq: + case INS_addps: + case INS_addss: + case INS_mulss: + case INS_mulps: + case INS_subss: + case INS_subps: + case INS_minss: + case INS_minps: + case INS_divss: + case INS_divps: + case INS_maxss: + case INS_maxps: + case INS_xorps: + case INS_andps: + case INS_sqrtss: + case INS_sqrtps: + case INS_andnps: + case INS_orps: + case INS_cvtss2sd: + case INS_cvtdq2ps: + case INS_cvtps2dq: + case INS_cvttps2dq: + case INS_cvtdq2pd: + case INS_comiss: + case INS_ucomiss: + case INS_paddd: + case INS_psubd: + case INS_pslld: + case INS_psrld: + case INS_psrad: + case INS_pshufd: + case INS_packssdw: + case INS_insertps: + case INS_pmulld: + case INS_pabsd: + case INS_pminsd: + case INS_pminud: + case INS_pmaxud: + case INS_pmovsxdq: + case INS_pmovzxdq: + case INS_packusdw: + case INS_movntdqa: + case INS_movsldup: + case INS_movshdup: + case INS_pinsrd: + case INS_pextrd: + case INS_vbroadcastss: + case INS_vbroadcastsd: + case INS_vpbroadcastb: + case INS_vpbroadcastw: + case INS_vpbroadcastd: + case INS_vpsravd: + case INS_vpsllvd: + case INS_vpermilps: + case INS_vpermd: + case INS_vpermps: + case INS_vpgatherdd: + case INS_vpgatherqd: + case INS_vgatherdps: + case INS_vgatherqps: + case INS_vfmadd132ps: + case 
INS_vfmadd213ps: + case INS_vfmadd231ps: + case INS_vfmadd132ss: + case INS_vfmadd213ss: + case INS_vfmadd231ss: + case INS_vfmaddsub132ps: + case INS_vfmaddsub213ps: + case INS_vfmaddsub231ps: + case INS_vfmsubadd132ps: + case INS_vfmsubadd213ps: + case INS_vfmsubadd231ps: + case INS_vfmsub132ps: + case INS_vfmsub213ps: + case INS_vfmsub231ps: + case INS_vfmsub132ss: + case INS_vfmsub213ss: + case INS_vfmsub231ss: + case INS_vfnmadd132ps: + case INS_vfnmadd213ps: + case INS_vfnmadd231ps: + case INS_vfnmadd132ss: + case INS_vfnmadd213ss: + case INS_vfnmadd231ss: + case INS_vfnmsub132ps: + case INS_vfnmsub213ps: + case INS_vfnmsub231ps: + case INS_vfnmsub132ss: + case INS_vfnmsub213ss: + case INS_vfnmsub231ss: + case INS_vpdpbusd: + case INS_vpdpwssd: + case INS_vpdpbusds: + case INS_vpdpwssds: + case INS_vpermilpsvar: + { + return false; // W0 + } + default: + { + return false; // WIG + } + } +} + +//------------------------------------------------------------------------ +// HasKMaskRegisterDest: Temporary check to identify instructions that can +// be Evex encoded but require Opmask(KMask) register support. +// These are cases where for comparison instructions, result is written +// to KMask when Evex encoded. +// TODO-XArch-AVX512: Refactor once KMask is added. +// +// Arguments: +// ins - The instruction to check. +// +// Returns: +// TRUE if Evex encoding requires KMAsk support. +// +bool HasKMaskRegisterDest(instruction ins) const +{ + assert(UseEvexEncoding() == true); + switch (ins) + { + // Requires KMask. + case INS_pcmpgtb: + case INS_pcmpgtd: + case INS_pcmpgtw: + case INS_pcmpgtq: + case INS_pcmpeqb: + case INS_pcmpeqd: + case INS_pcmpeqq: + case INS_pcmpeqw: + case INS_cmpps: + case INS_cmpss: + case INS_cmppd: + case INS_cmpsd: + { + return true; + } + default: + { + return false; + } + } +} + bool useVEXEncodings; bool UseVEXEncoding() const { @@ -170,6 +450,90 @@ void SetUseVEXEncoding(bool value) useVEXEncodings = value; } +// Is Evex encoding supported. +bool useEvexEncodings; +bool UseEvexEncoding() const +{ + return useEvexEncodings; +} +void SetUseEvexEncoding(bool value) +{ + useEvexEncodings = value; +} + +// 4-byte EVEX prefix starts with byte 0x62 +#define EVEX_PREFIX_MASK 0xFF00000000000000ULL +#define EVEX_PREFIX_CODE 0x6200000000000000ULL + +bool TakesEvexPrefix(instruction ins) const; + +//------------------------------------------------------------------------ +// hasEvexPrefix: Returns true if the instruction encoding already +// contains Evex prefix. +// +// Arguments: +// code - opcode + prefixes bits at some stage of encoding. +// +// Returns: +// TRUE if code has an Evex prefix. +bool hasEvexPrefix(code_t code) +{ + return (code & EVEX_PREFIX_MASK) == EVEX_PREFIX_CODE; +} +code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); + +//------------------------------------------------------------------------ +// AddSimdPrefixIfNeeded: Add the correct SIMD prefix if required. +// +// Arguments: +// ins - the instruction being encoded. +// code - opcode + prefixes bits at some stage of encoding. +// size - operand size +// +// Returns: +// TRUE if code has an Evex prefix. 
+code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) +{ + if (TakesEvexPrefix(ins)) + { + code = AddEvexPrefix(ins, code, size); + } + else if (TakesVexPrefix(ins)) + { + code = AddVexPrefix(ins, code, size); + } + return code; +} + +//------------------------------------------------------------------------ +// hasVexOrEvexPrefix: Returns true if the instruction encoding already +// contains a Vex or Evex prefix. +// +// Arguments: +// code - opcode + prefixes bits at some stage of encoding. +// +// Returns: +// TRUE if code has a SIMD prefix. +bool hasVexOrEvexPrefix(code_t code) +{ + return (hasVexPrefix(code) || hasEvexPrefix(code)); +} + +//------------------------------------------------------------------------ +// codeEvexMigrationCheck: Temporary check to use when adding EVEX codepaths +// TODO-XArch-AVX512: Remove implementation and uses once all Evex paths are +// completed. +// +// Arguments: +// code - opcode + prefixes bits at some stage of encoding. +// +// Returns: +// TRUE if code has an Evex prefix. +bool codeEvexMigrationCheck(code_t code) +{ + return hasEvexPrefix(code); +} + bool containsAVXInstruction = false; bool ContainsAVX() { @@ -203,6 +567,7 @@ bool IsThreeOperandAVXInstruction(instruction ins) { return (IsDstDstSrcAVXInstruction(ins) || IsDstSrcSrcAVXInstruction(ins)); } + bool isAvxBlendv(instruction ins) { return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb; diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index e766df67304d8..87de5ef1eb0af 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -614,6 +614,8 @@ INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) +INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_FLAGS_None) // Scalar instructions in SSE4.2 INST3(crc32, "crc32", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF2, 0x0F, 0x38, 0xF0), INS_FLAGS_None) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 23fb5ee3c9ff5..ad54ce7153f58 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -297,8 +297,9 @@ CONFIG_INTEGER(AltJitAssertOnNYI, W("AltJitAssertOnNYI"), 1) // Controls the Alt CONFIG_INTEGER(EnableEHWriteThru, W("EnableEHWriteThru"), 1) // Enable the register allocator to support EH-write thru: // partial enregistration of vars exposed on EH boundaries -CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1) // Enable the enregistration of locals that are - // defined or used in a multireg context. +CONFIG_INTEGER(EnableMultiRegLocals, W("EnableMultiRegLocals"), 1) // Enable the enregistration of locals that are + // defined or used in a multireg context. +CONFIG_INTEGER(JitStressEVEXEncoding, W("JitStressEVEXEncoding"), 0) // Enable EVEX encoding for SIMD instructions. // clang-format off
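
For reviewers: a minimal sketch of how the pieces added here are intended to compose on the
emitOutputRRR() path once DOTNET_JitStressEVEXEncoding=1 (or the COMPlus_ prefix on older
runtimes) forces the stress mode. Function names are from this patch; the surrounding emitter
plumbing, register encoding of the other operands, and error handling are elided.

    // Sketch only; mirrors the emitOutputRRR() changes above.
    code_t code = insCodeRM(ins);                          // opcode bits from the instruction table
    code        = AddSimdPrefixIfNeeded(ins, code, size);  // EVEX if TakesEvexPrefix(), else VEX
    code        = insEncodeRMreg(ins, code);
    if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))
    {
        code = AddRexWPrefix(ins, code);                   // EVEX.W is set in non-inverted form
    }
    code = insEncodeReg3456(ins, src1, size, code);        // vvvv, stored in 1's complement form
    dst += emitOutputSimdPrefixIfNeeded(ins, dst, code);   // writes the 4-byte 62 xx xx xx prefix for EVEX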