Cleanup some xarch emit logic (#85536)

* Ensure floating-point codegen uses the VEX aware path
* Fix `IF_RRW_RRW_CNS` to be `IF_RWR_RRD_CNS`
* Fixup emitfmtsxarch.h to have a more consistent layout
* Allow querying the scheduling info for an insFormat
* Ensure the new insFormats are handled
* Ensure we consistently use `emitInsModeFormat`
* Ensure instructions which write to a mask register are EVEX only
* Improve REX.W handling for EVEX only instructions
* Ensure that instructions use the right update mode and tuple type
* Apply formatting patch
* Ensure DstSrcSrc is still handled correctly
* Ensure BLSI/BLSR are still handled in emitOutputAM
* Use static_assert_no_msg
* Fixing the disassembly for IF_RRW_SHF
* Fixing the IF check for shld/shrd on x86
* Use the correct name: inst_RV_TT_IV
* Ensure the 4 operand insFormats include the necessary constant
* Resolve an insFormat check on x86
* Ensure other SIMD code paths are VEX aware
* Improve throughput by using a less expensive emitSizeOfInsDsc
* Apply formatting patch
* Ensure emitSizeOfInsDsc_CNS is used for RWR_RRD_*RD_CNS
* Ensure genSimd12UpperClear uses `andps` for the pre-SSE4.1 path
dotnet · May 2, 2023 · da0aa0c · da0aa0c
1 parent b02d7a1
commit da0aa0c
Show file tree

Hide file tree

Showing 17 changed files with 1,644 additions and 1,311 deletions.
diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
@@ -46,6 +46,9 @@ class CodeGen final : public CodeGenInterface
     CORINFO_FIELD_HANDLE absBitmaskFlt;
     CORINFO_FIELD_HANDLE absBitmaskDbl;
 
+    // Bit mask used in zeroing the 3rd element of a SIMD12
+    CORINFO_FIELD_HANDLE zroSimd12Elm3;
+
     // Bit mask used in U8 -> double conversion to adjust the result.
     CORINFO_FIELD_HANDLE u8ToDblBitmask;
 
@@ -925,6 +928,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     void genSimdUpperSave(GenTreeIntrinsic* node);
     void genSimdUpperRestore(GenTreeIntrinsic* node);
 
+    void genSimd12UpperClear(regNumber tgtReg);
+
     // TYP_SIMD12 (i.e Vector3 of size 12 bytes) is not a hardware supported size and requires
     // two reads/writes on 64-bit targets. These routines abstract reading/writing of Vector3
     // values through an indirection. Note that Vector3 locals allocated on stack would have
@@ -1532,6 +1537,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival);
     void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival);
     void inst_RV_RV_TT(instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, bool isRMW);
+    void inst_RV_RV_TT_IV(
+        instruction ins, emitAttr size, regNumber targetReg, regNumber op1Reg, GenTree* op2, int8_t ival, bool isRMW);
 #endif
 
     void inst_set_SV_var(GenTree* tree);

diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
@@ -4474,7 +4474,7 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP&
                 }
 #elif defined(TARGET_XARCH)
                 // XORPS is the fastest and smallest way to initialize a XMM register to zero.
-                inst_RV_RV(INS_xorps, reg, reg, TYP_DOUBLE);
+                GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg);
                 dblInitReg = reg;
 #elif defined(TARGET_ARM64)
                 // We will just zero out the entire vector register. This sets it to a double/float zero value
@@ -4514,7 +4514,7 @@ void CodeGen::genZeroInitFltRegs(const regMaskTP& initFltRegs, const regMaskTP&
                 }
 #elif defined(TARGET_XARCH)
                 // XORPS is the fastest and smallest way to initialize a XMM register to zero.
-                inst_RV_RV(INS_xorps, reg, reg, TYP_DOUBLE);
+                GetEmitter()->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, reg, reg, reg);
                 fltInitReg = reg;
 #elif defined(TARGET_ARM64)
                 // We will just zero out the entire vector register. This sets it to a double/float zero value