Expose various Convert intrinsics for Avx512F, Avx512BW, and Avx512DQ (…

…#85281) * Expose various Convert intrinsics for Avx512F and Avx512DQ * Expose various integer conversion APIs for Avx512F and Avx512BW * Ensure special instructions are handled in codegen * Apply formatting patch * Ensure the AVX512F_VL variant is picked for simdSize=16/32 * Ensure conversion instructions are handled in PERFSCORE * Ensure instructions use the right tuple type * Removing an invalid API and fix more PERFSCORE entries * Resolve additional failures masked by #85056 * Ensure TieredCompilation=0 is also passing * Apply formatting patch * Fixing some more test edge cases * Ensure uint64->double and uint64->Float masks the input
dotnet · Apr 25, 2023 · f1a4cdd · f1a4cdd
1 parent 5b9848f
commit f1a4cdd
Show file tree

Hide file tree

Showing 20 changed files with 2,963 additions and 302 deletions.
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
@@ -5658,18 +5658,47 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
                             break;
                         }
 
+                        case NI_AVX512F_ConvertToVector256Int32:
+                        case NI_AVX512F_ConvertToVector256UInt32:
+                        case NI_AVX512F_VL_ConvertToVector128UInt32:
+                        case NI_AVX512F_VL_ConvertToVector128UInt32WithSaturation:
+                        {
+                            assert(!varTypeIsFloating(baseType));
+                            FALLTHROUGH;
+                        }
+
+                        case NI_AVX512F_ConvertToVector128Byte:
+                        case NI_AVX512F_ConvertToVector128ByteWithSaturation:
                         case NI_AVX512F_ConvertToVector128Int16:
-                        case NI_AVX512F_ConvertToVector128Int32:
+                        case NI_AVX512F_ConvertToVector128Int16WithSaturation:
+                        case NI_AVX512F_ConvertToVector128SByte:
+                        case NI_AVX512F_ConvertToVector128SByteWithSaturation:
                         case NI_AVX512F_ConvertToVector128UInt16:
-                        case NI_AVX512F_ConvertToVector128UInt32:
+                        case NI_AVX512F_ConvertToVector128UInt16WithSaturation:
                         case NI_AVX512F_ConvertToVector256Int16:
-                        case NI_AVX512F_ConvertToVector256Int32:
+                        case NI_AVX512F_ConvertToVector256Int16WithSaturation:
+                        case NI_AVX512F_ConvertToVector256Int32WithSaturation:
                         case NI_AVX512F_ConvertToVector256UInt16:
-                        case NI_AVX512F_ConvertToVector256UInt32:
-                        case NI_AVX512BW_ConvertToVector128Byte:
-                        case NI_AVX512BW_ConvertToVector128SByte:
+                        case NI_AVX512F_ConvertToVector256UInt16WithSaturation:
+                        case NI_AVX512F_ConvertToVector256UInt32WithSaturation:
+                        case NI_AVX512F_VL_ConvertToVector128Byte:
+                        case NI_AVX512F_VL_ConvertToVector128ByteWithSaturation:
+                        case NI_AVX512F_VL_ConvertToVector128Int16:
+                        case NI_AVX512F_VL_ConvertToVector128Int16WithSaturation:
+                        case NI_AVX512F_VL_ConvertToVector128Int32:
+                        case NI_AVX512F_VL_ConvertToVector128Int32WithSaturation:
+                        case NI_AVX512F_VL_ConvertToVector128SByte:
+                        case NI_AVX512F_VL_ConvertToVector128SByteWithSaturation:
+                        case NI_AVX512F_VL_ConvertToVector128UInt16:
+                        case NI_AVX512F_VL_ConvertToVector128UInt16WithSaturation:
                         case NI_AVX512BW_ConvertToVector256Byte:
+                        case NI_AVX512BW_ConvertToVector256ByteWithSaturation:
                         case NI_AVX512BW_ConvertToVector256SByte:
+                        case NI_AVX512BW_ConvertToVector256SByteWithSaturation:
+                        case NI_AVX512BW_VL_ConvertToVector128Byte:
+                        case NI_AVX512BW_VL_ConvertToVector128ByteWithSaturation:
+                        case NI_AVX512BW_VL_ConvertToVector128SByte:
+                        case NI_AVX512BW_VL_ConvertToVector128SByteWithSaturation:
                         {
                             // These intrinsics are "ins reg/mem, xmm"
                             ins  = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);

diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
@@ -1905,7 +1905,7 @@ class emitter
     ssize_t emitGetInsCIdisp(instrDesc* id);
     unsigned emitGetInsCIargs(instrDesc* id);
 
-    inline static emitAttr emitGetMemOpSize(instrDesc* id);
+    inline emitAttr emitGetMemOpSize(instrDesc* id) const;
 
     // Return the argument count for a direct call "id".
     int emitGetInsCDinfo(instrDesc* id);
@@ -3456,11 +3456,12 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
 //  Arguments:
 //       id - Instruction descriptor
 //
-/* static */ emitAttr emitter::emitGetMemOpSize(instrDesc* id)
+emitAttr emitter::emitGetMemOpSize(instrDesc* id) const
 {
-    emitAttr defaultSize = id->idOpSize();
+    emitAttr    defaultSize = id->idOpSize();
+    instruction ins         = id->idIns();
 
-    switch (id->idIns())
+    switch (ins)
     {
         case INS_pextrb:
         case INS_pinsrb:
@@ -3570,9 +3571,6 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
 
         case INS_cvtdq2pd:
         case INS_cvtps2pd:
-        case INS_vpmovdw:
-        case INS_vpmovqd:
-        case INS_vpmovwb:
         {
             if (defaultSize == 64)
             {
@@ -3589,6 +3587,57 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
             }
         }
 
+        case INS_vpmovdb:
+        case INS_vpmovdw:
+        case INS_vpmovqb:
+        case INS_vpmovqd:
+        case INS_vpmovqw:
+        case INS_vpmovwb:
+        case INS_vpmovsdb:
+        case INS_vpmovsdw:
+        case INS_vpmovsqb:
+        case INS_vpmovsqd:
+        case INS_vpmovsqw:
+        case INS_vpmovswb:
+        case INS_vpmovusdb:
+        case INS_vpmovusdw:
+        case INS_vpmovusqb:
+        case INS_vpmovusqd:
+        case INS_vpmovusqw:
+        case INS_vpmovuswb:
+        {
+            insTupleType tupleType = insTupleTypeInfo(ins);
+            unsigned     memSize   = 0;
+
+            switch (tupleType)
+            {
+                case INS_TT_HALF_MEM:
+                {
+                    memSize = defaultSize / 2;
+                    break;
+                }
+
+                case INS_TT_QUARTER_MEM:
+                {
+                    memSize = defaultSize / 4;
+                    break;
+                }
+
+                case INS_TT_EIGHTH_MEM:
+                {
+                    memSize = defaultSize / 8;
+                    break;
+                }
+
+                default:
+                {
+                    unreached();
+                }
+            }
+
+            return EA_ATTR(memSize);
+        }
+
         case INS_vbroadcastf128:
         case INS_vbroadcasti128:
         case INS_vextractf128:
@@ -3613,7 +3662,11 @@ inline unsigned emitter::emitGetInsCIargs(instrDesc* id)
 
         case INS_movddup:
         {
-            if (defaultSize == 32)
+            if (defaultSize == 64)
+            {
+                return EA_64BYTE;
+            }
+            else if (defaultSize == 32)
             {
                 return EA_32BYTE;
             }

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
@@ -1362,6 +1362,10 @@ bool emitter::TakesRexWPrefix(const instrDesc* id) const
             case INS_shlx:
             case INS_shrx:
 #endif // TARGET_AMD64
+            case INS_vcvtsd2usi:
+            case INS_vcvtss2usi:
+            case INS_vcvttsd2usi:
+            case INS_vcvttss2usi:
             {
                 if (attr == EA_8BYTE)
                 {
@@ -2582,6 +2586,10 @@ bool emitter::emitInsCanOnlyWriteSSE2OrAVXReg(instrDesc* id)
         case INS_sarx:
         case INS_shrx:
 #endif
+        case INS_vcvtsd2usi:
+        case INS_vcvtss2usi:
+        case INS_vcvttsd2usi:
+        case INS_vcvttss2usi:
         {
             // These SSE instructions write to a general purpose integer register.
             return false;
@@ -3010,7 +3018,7 @@ inline bool hasTupleTypeInfo(instruction ins)
 // Return Value:
 //    the tuple type info for a given CPU instruction.
 //
-inline insTupleType insTupleTypeInfo(instruction ins)
+insTupleType emitter::insTupleTypeInfo(instruction ins) const
 {
     assert((unsigned)ins < ArrLen(insTupleTypeInfos));
     assert(insTupleTypeInfos[ins] != INS_TT_NONE);
@@ -3020,9 +3028,9 @@ inline insTupleType insTupleTypeInfo(instruction ins)
 // Return true if the instruction uses the SSE38 or SSE3A macro in instrsXArch.h.
 bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
 {
-    const size_t SSE38 = 0x0F660038;
-    const size_t SSE3A = 0x0F66003A;
-    const size_t MASK  = 0xFFFF00FF;
+    const size_t SSE38 = 0x0F000038;
+    const size_t SSE3A = 0x0F00003A;
+    const size_t MASK  = 0xFF0000FF;
 
     size_t insCode = 0;
 
@@ -3044,8 +3052,19 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const
         insCode = insCodeMR(ins);
     }
 
-    insCode &= MASK;
-    return insCode == SSE38 || insCode == SSE3A;
+    size_t mskCode = insCode & MASK;
+
+    if ((mskCode != SSE38) && (mskCode != SSE3A))
+    {
+        return false;
+    }
+
+#if defined(DEBUG)
+    insCode = (insCode >> 16) & 0xFF;
+    assert((insCode == 0x66) || (insCode == 0xF2) || (insCode == 0xF3));
+#endif // DEBUG
+
+    return true;
 }
 
 /*****************************************************************************
@@ -11214,6 +11233,10 @@ void emitter::emitDispIns(
                 case INS_cvtss2si:
                 case INS_cvtsd2si:
                 case INS_cvttss2si:
+                case INS_vcvtsd2usi:
+                case INS_vcvtss2usi:
+                case INS_vcvttsd2usi:
+                case INS_vcvttss2usi:
                 {
                     printf(" %s, %s", emitRegName(id->idReg1(), attr), emitRegName(id->idReg2(), EA_16BYTE));
                     break;
@@ -15528,9 +15551,9 @@ ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspI
             disp8Compression = inputSize * 4;
             break;
         case INS_TT_TUPLE8:
-            // N = input size in bytes * 4, 32bit for 512 only
+            // N = input size in bytes * 8, 32bit for 512 only
             assert((inputSize == 4 && vectorLength >= 64));
-            disp8Compression = inputSize * 4;
+            disp8Compression = inputSize * 8;
             break;
         case INS_TT_HALF_MEM:
             // N = vector length in bytes / 2
@@ -17825,11 +17848,39 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_cvttps2dq:
         case INS_cvtps2dq:
         case INS_cvtdq2ps:
+        case INS_vcvtpd2qq:
+        case INS_vcvtpd2uqq:
+        case INS_vcvtps2udq:
+        case INS_vcvtqq2pd:
+        case INS_vcvttps2udq:
+        case INS_vcvtudq2ps:
+        case INS_vcvttpd2qq:
+        case INS_vcvttpd2uqq:
+        case INS_vcvtuqq2pd:
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency += PERFSCORE_LATENCY_4C;
+            break;
+
+        case INS_vpmovdb:
         case INS_vpmovdw:
+        case INS_vpmovqb:
         case INS_vpmovqd:
+        case INS_vpmovqw:
+        case INS_vpmovsdb:
+        case INS_vpmovsdw:
+        case INS_vpmovsqb:
+        case INS_vpmovsqd:
+        case INS_vpmovsqw:
+        case INS_vpmovswb:
+        case INS_vpmovusdb:
+        case INS_vpmovusdw:
+        case INS_vpmovusqb:
+        case INS_vpmovusqd:
+        case INS_vpmovusqw:
+        case INS_vpmovuswb:
         case INS_vpmovwb:
-            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
-            result.insLatency += PERFSCORE_LATENCY_4C;
+            result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+            result.insLatency += (opSize == EA_16BYTE) ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_4C;
             break;
 
         case INS_haddps:
@@ -17892,12 +17943,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_cvtsi2ss32:
         case INS_cvtsi2sd64:
         case INS_cvtsi2ss64:
+        case INS_vcvtsd2usi:
+        case INS_vcvttsd2usi:
+        case INS_vcvtusi2sd32:
+        case INS_vcvtusi2sd64:
+        case INS_vcvtusi2ss32:
+        case INS_vcvtusi2ss64:
             result.insThroughput = PERFSCORE_THROUGHPUT_1C;
             result.insLatency += PERFSCORE_LATENCY_7C;
             break;
 
         case INS_cvttss2si:
         case INS_cvtss2si:
+        case INS_vcvtss2usi:
+        case INS_vcvttss2usi:
             result.insThroughput = PERFSCORE_THROUGHPUT_1C;
             result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C;
             break;
@@ -18241,6 +18300,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_cvtdq2pd:
         case INS_cvtpd2ps:
         case INS_cvttpd2dq:
+        case INS_vcvtpd2udq:
+        case INS_vcvtps2qq:
+        case INS_vcvtps2uqq:
+        case INS_vcvtqq2ps:
+        case INS_vcvttpd2udq:
+        case INS_vcvttps2qq:
+        case INS_vcvttps2uqq:
+        case INS_vcvtudq2pd:
+        case INS_vcvtuqq2ps:
             result.insThroughput = PERFSCORE_THROUGHPUT_1C;
             result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_7C : PERFSCORE_LATENCY_5C;
             break;
@@ -18282,17 +18350,25 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
         case INS_vpbroadcastq_gpr:
         case INS_vbroadcasti128:
         case INS_vbroadcastf128:
+        case INS_vbroadcastf64x2:
+        case INS_vbroadcasti64x2:
+        case INS_vbroadcastf64x4:
+        case INS_vbroadcasti64x4:
+        case INS_vbroadcastf32x2:
+        case INS_vbroadcasti32x2:
+        case INS_vbroadcastf32x8:
+        case INS_vbroadcasti32x8:
         case INS_vbroadcastss:
         case INS_vbroadcastsd:
             if (memAccessKind == PERFSCORE_MEMORY_NONE)
             {
                 result.insThroughput = PERFSCORE_THROUGHPUT_1C;
-                result.insLatency    = opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C;
+                result.insLatency    = opSize == EA_16BYTE ? PERFSCORE_LATENCY_1C : PERFSCORE_LATENCY_3C;
             }
             else
             {
                 result.insThroughput = PERFSCORE_THROUGHPUT_2X;
-                result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C;
+                result.insLatency += opSize == EA_16BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_3C;
                 if (ins == INS_vpbroadcastb || ins == INS_vpbroadcastw)
                 {
                     result.insLatency += PERFSCORE_LATENCY_1C;

diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
@@ -183,6 +183,8 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr
     return code;
 }
 
+insTupleType insTupleTypeInfo(instruction ins) const;
+
 //------------------------------------------------------------------------
 // HasKMaskRegisterDest: Temporary check to identify instructions that can
 // be Evex encoded but require Opmask(KMask) register support.