From a2b7648d3b23fa00442ebc24a12d255895a0945e Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Tue, 18 May 2021 03:33:52 -0700 Subject: [PATCH] Port SIMDIntrinsicGetItem and SIMDIntrinsicSetItem to be implemented via HWIntrinsics (#52288) * Port SIMDIntrinsicGetItem and SIMDIntrinsicSetItem to be implemented using SimdAsHWIntrinsic * Apply suggestions from code review Co-authored-by: Egor Chesakov * Resolving mismerge * Added a comment explaining why we sometimes return and sometimes do containment checks * Update src/coreclr/jit/lsraarm64.cpp Co-authored-by: Egor Chesakov Co-authored-by: Egor Chesakov --- src/coreclr/jit/codegen.h | 2 - src/coreclr/jit/codegenarm64.cpp | 247 ------ src/coreclr/jit/compiler.h | 23 +- src/coreclr/jit/decomposelongs.cpp | 120 +-- src/coreclr/jit/decomposelongs.h | 7 +- src/coreclr/jit/emitxarch.cpp | 135 ++- src/coreclr/jit/gentree.cpp | 164 ++++ src/coreclr/jit/hwintrinsic.cpp | 77 +- src/coreclr/jit/hwintrinsicarm64.cpp | 55 +- src/coreclr/jit/hwintrinsiccodegenarm64.cpp | 122 ++- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 194 ++++- src/coreclr/jit/hwintrinsiclistarm64.h | 4 +- src/coreclr/jit/hwintrinsiclistxarch.h | 4 +- src/coreclr/jit/hwintrinsicxarch.cpp | 349 +------- src/coreclr/jit/lclmorph.cpp | 4 +- src/coreclr/jit/lower.h | 2 + src/coreclr/jit/lowerarmarch.cpp | 58 +- src/coreclr/jit/lowerxarch.cpp | 781 ++++++++++++++++-- src/coreclr/jit/lsraarm64.cpp | 69 +- src/coreclr/jit/lsraxarch.cpp | 119 +-- src/coreclr/jit/morph.cpp | 67 +- src/coreclr/jit/simd.cpp | 121 +-- src/coreclr/jit/simdashwintrinsic.cpp | 52 ++ src/coreclr/jit/simdashwintrinsic.h | 9 + src/coreclr/jit/simdashwintrinsiclistarm64.h | 1 + src/coreclr/jit/simdashwintrinsiclistxarch.h | 2 + src/coreclr/jit/simdcodegenxarch.cpp | 394 --------- src/coreclr/jit/simdintrinsiclist.h | 13 - .../tests/GenericVectorTests.cs | 2 +- .../src/System/Numerics/Vector_1.cs | 2 +- src/tests/JIT/SIMD/VectorGet.cs | 8 +- 31 files changed, 1663 insertions(+), 1544 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 99cc72d8b128b..92242ca8f349d 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -977,8 +977,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode); void genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode); void genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode); - void genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode); - void genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode); void genSIMDIntrinsicShuffleSSE2(GenTreeSIMD* simdNode); void genSIMDIntrinsicUpperSave(GenTreeSIMD* simdNode); void genSIMDIntrinsicUpperRestore(GenTreeSIMD* simdNode); diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 14213cbcdc9a1..f0377bfe6a69f 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3873,17 +3873,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicBinOp(simdNode); break; - case SIMDIntrinsicGetItem: - genSIMDIntrinsicGetItem(simdNode); - break; - - case SIMDIntrinsicSetX: - case SIMDIntrinsicSetY: - case SIMDIntrinsicSetZ: - case SIMDIntrinsicSetW: - genSIMDIntrinsicSetItem(simdNode); - break; - case SIMDIntrinsicUpperSave: genSIMDIntrinsicUpperSave(simdNode); break; @@ -4346,242 +4335,6 @@ void CodeGen::genSIMDIntrinsicBinOp(GenTreeSIMD* simdNode) genProduceReg(simdNode); } -//------------------------------------------------------------------------------------ -// 
genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i. -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Return Value: -// None. -// -void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode) -{ - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem); - - GenTree* op1 = simdNode->gtGetOp1(); - GenTree* op2 = simdNode->gtGetOp2(); - var_types simdType = op1->TypeGet(); - assert(varTypeIsSIMD(simdType)); - - // op1 of TYP_SIMD12 should be considered as TYP_SIMD16 - if (simdType == TYP_SIMD12) - { - simdType = TYP_SIMD16; - } - - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types targetType = simdNode->TypeGet(); - assert(targetType == genActualType(baseType)); - - // GetItem has 2 operands: - // - the source of SIMD type (op1) - // - the index of the value to be returned. - genConsumeOperands(simdNode); - - emitAttr baseTypeSize = emitTypeSize(baseType); - unsigned baseTypeScale = genLog2(EA_SIZE_IN_BYTES(baseTypeSize)); - - if (op2->IsCnsIntOrI()) - { - assert(op2->isContained()); - - ssize_t index = op2->AsIntCon()->gtIconVal; - - // We only need to generate code for the get if the index is valid - // If the index is invalid, previously generated for the range check will throw - if (GetEmitter()->isValidVectorIndex(emitTypeSize(simdType), baseTypeSize, index)) - { - if (op1->isContained()) - { - int offset = (int)index * genTypeSize(baseType); - instruction ins = ins_Load(baseType); - - assert(!op1->isUsedFromReg()); - - if (op1->OperIsLocal()) - { - unsigned varNum = op1->AsLclVarCommon()->GetLclNum(); - - GetEmitter()->emitIns_R_S(ins, emitActualTypeSize(baseType), targetReg, varNum, offset); - } - else - { - assert(op1->OperGet() == GT_IND); - - GenTree* addr = op1->AsIndir()->Addr(); - assert(!addr->isContained()); - regNumber baseReg = addr->GetRegNum(); - - // ldr targetReg, [baseReg, #offset] - GetEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(baseType), targetReg, baseReg, offset); - } - } - else - { - assert(op1->isUsedFromReg()); - regNumber srcReg = op1->GetRegNum(); - - instruction ins; - if (varTypeIsFloating(baseType)) - { - assert(genIsValidFloatReg(targetReg)); - // dup targetReg, srcReg[#index] - ins = INS_dup; - } - else - { - assert(genIsValidIntReg(targetReg)); - if (varTypeIsUnsigned(baseType) || (baseTypeSize == EA_8BYTE)) - { - // umov targetReg, srcReg[#index] - ins = INS_umov; - } - else - { - // smov targetReg, srcReg[#index] - ins = INS_smov; - } - } - GetEmitter()->emitIns_R_R_I(ins, baseTypeSize, targetReg, srcReg, index); - } - } - } - else - { - assert(!op2->isContained()); - - regNumber baseReg = REG_NA; - regNumber indexReg = op2->GetRegNum(); - - if (op1->isContained()) - { - // Optimize the case of op1 is in memory and trying to access ith element. - assert(!op1->isUsedFromReg()); - if (op1->OperIsLocal()) - { - unsigned varNum = op1->AsLclVarCommon()->GetLclNum(); - - baseReg = simdNode->ExtractTempReg(); - - // Load the address of varNum - GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, baseReg, varNum, 0); - } - else - { - // Require GT_IND addr to be not contained. 
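[Illustration] The constant-index path above picks the extract instruction from the base type: INS_dup for floating-point lanes, INS_umov (zero-extend) for unsigned or 8-byte lanes, and INS_smov (sign-extend) for narrow signed lanes. A minimal standalone C++ sketch of that selection, assuming a 16 x byte vector; names are illustrative, not JIT code:

    #include <cstdint>

    // Extract lane `index` into a 64-bit "general register", mirroring the
    // smov (sign-extend) vs. umov (zero-extend) choice made above.
    static int64_t extractLane(const int8_t (&lanes)[16], int index, bool baseTypeIsUnsigned)
    {
        const int8_t lane = lanes[index];
        return baseTypeIsUnsigned
                   ? static_cast<int64_t>(static_cast<uint8_t>(lane)) // umov targetReg, srcReg[#index]
                   : static_cast<int64_t>(lane);                      // smov targetReg, srcReg[#index]
    }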
- assert(op1->OperGet() == GT_IND); - - GenTree* addr = op1->AsIndir()->Addr(); - assert(!addr->isContained()); - - baseReg = addr->GetRegNum(); - } - } - else - { - assert(op1->isUsedFromReg()); - regNumber srcReg = op1->GetRegNum(); - - unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum; - noway_assert(compiler->lvaSIMDInitTempVarNum != BAD_VAR_NUM); - - baseReg = simdNode->ExtractTempReg(); - - // Load the address of simdInitTempVarNum - GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, baseReg, simdInitTempVarNum, 0); - - // Store the vector to simdInitTempVarNum - GetEmitter()->emitIns_R_R(INS_str, emitTypeSize(simdType), srcReg, baseReg); - } - - assert(genIsValidIntReg(indexReg)); - assert(genIsValidIntReg(baseReg)); - assert(baseReg != indexReg); - - // Load item at baseReg[index] - GetEmitter()->emitIns_R_R_R_Ext(ins_Load(baseType), baseTypeSize, targetReg, baseReg, indexReg, INS_OPTS_LSL, - baseTypeScale); - } - - genProduceReg(simdNode); -} - -//------------------------------------------------------------------------------------ -// genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i. -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Return Value: -// None. -// -void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode) -{ - // Determine index based on intrinsic ID - int index = -1; - switch (simdNode->gtSIMDIntrinsicID) - { - case SIMDIntrinsicSetX: - index = 0; - break; - case SIMDIntrinsicSetY: - index = 1; - break; - case SIMDIntrinsicSetZ: - index = 2; - break; - case SIMDIntrinsicSetW: - index = 3; - break; - - default: - unreached(); - } - assert(index != -1); - - // op1 is the SIMD vector - // op2 is the value to be set - GenTree* op1 = simdNode->gtGetOp1(); - GenTree* op2 = simdNode->gtGetOp2(); - - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types targetType = simdNode->TypeGet(); - assert(varTypeIsSIMD(targetType)); - - assert(op2->TypeGet() == baseType); - assert(simdNode->GetSimdSize() >= ((index + 1) * genTypeSize(baseType))); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - - assert(genIsValidFloatReg(targetReg)); - assert(genIsValidFloatReg(op1Reg)); - assert(genIsValidIntReg(op2Reg) || genIsValidFloatReg(op2Reg)); - assert(targetReg != op2Reg); - - emitAttr attr = emitTypeSize(baseType); - - // Insert mov if register assignment requires it - GetEmitter()->emitIns_Mov(INS_mov, EA_16BYTE, targetReg, op1Reg, /* canSkip */ false); - - if (genIsValidIntReg(op2Reg)) - { - GetEmitter()->emitIns_R_R_I(INS_ins, attr, targetReg, op2Reg, index); - } - else - { - GetEmitter()->emitIns_R_R_I_I(INS_ins, attr, targetReg, op2Reg, index, 0); - } - - genProduceReg(simdNode); -} - //----------------------------------------------------------------------------- // genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD16 vector to // the given register, if any, or to memory. 
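[Illustration] Both the removed ARM64 paths above and their HWIntrinsic replacements handle a non-constant index the same way: materialize the vector in memory (reusing lvaSIMDInitTempVarNum when it lives in a register), then perform a scaled indexed load. A standalone C++ sketch of that strategy, assuming a 4 x int32 vector and an already range-checked index; not JIT code:

    #include <cstdint>
    #include <cstring>

    static int32_t getElementVariableIndex(const uint8_t (&vectorReg)[16], uint64_t index)
    {
        // str q0, [baseReg]: spill the vector to the stack temp (lvaSIMDInitTempVarNum analogue)
        alignas(16) uint8_t simdTemp[16];
        std::memcpy(simdTemp, vectorReg, sizeof(simdTemp));

        // ldr w0, [baseReg, xIndex, LSL #2]: scaled indexed load of the element
        int32_t element;
        std::memcpy(&element, simdTemp + (index << 2), sizeof(element));
        return element;
    }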
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 56f7b44d669cf..0efcbdf5cedd0 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -2906,6 +2906,21 @@ class Compiler GenTreeHWIntrinsic* gtNewSimdCreateBroadcastNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic); + GenTreeHWIntrinsic* gtNewSimdGetElementNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + + GenTreeHWIntrinsic* gtNewSimdWithElementNode(var_types type, + GenTree* op1, + GenTree* op2, + GenTree* op3, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + GenTreeHWIntrinsic* gtNewSimdAsHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID, CorInfoType simdBaseJitType, @@ -4116,6 +4131,7 @@ class Compiler GenTree* impNonConstFallback(NamedIntrinsic intrinsic, var_types simdType, CorInfoType simdBaseJitType); GenTree* addRangeCheckIfNeeded( NamedIntrinsic intrinsic, GenTree* immOp, bool mustExpand, int immLowerBound, int immUpperBound); + GenTree* addRangeCheckForHWIntrinsic(GenTree* immOp, int immLowerBound, int immUpperBound); #ifdef TARGET_XARCH GenTree* impBaseIntrinsic(NamedIntrinsic intrinsic, @@ -5899,8 +5915,8 @@ class Compiler unsigned* indexOut, unsigned* simdSizeOut, bool ignoreUsedInSIMDIntrinsic = false); - GenTree* fgMorphFieldAssignToSIMDIntrinsicSet(GenTree* tree); - GenTree* fgMorphFieldToSIMDIntrinsicGet(GenTree* tree); + GenTree* fgMorphFieldAssignToSimdSetElement(GenTree* tree); + GenTree* fgMorphFieldToSimdGetElement(GenTree* tree); bool fgMorphCombineSIMDFieldAssignments(BasicBlock* block, Statement* stmt); void impMarkContiguousSIMDFieldAssignments(Statement* stmt); @@ -8545,9 +8561,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // Normalizes TYP_STRUCT value in case of GT_CALL, GT_RET_EXPR and arg nodes. GenTree* impSIMDPopStack(var_types type, bool expectAddr = false, CORINFO_CLASS_HANDLE structType = nullptr); - // Create a GT_SIMD tree for a Get property of SIMD vector with a fixed index. - GenTreeSIMD* impSIMDGetFixed(var_types simdType, CorInfoType simdBaseJitType, unsigned simdSize, int index); - // Transforms operands and returns the SIMD intrinsic to be applied on // transformed operands to obtain given relop result. SIMDIntrinsicID impSIMDRelOp(SIMDIntrinsicID relOpIntrinsicId, diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp index 882168e0224c7..3ad6ce9b9ff96 100644 --- a/src/coreclr/jit/decomposelongs.cpp +++ b/src/coreclr/jit/decomposelongs.cpp @@ -240,11 +240,11 @@ GenTree* DecomposeLongs::DecomposeNode(GenTree* tree) nextNode = DecomposeRotate(use); break; -#ifdef FEATURE_SIMD - case GT_SIMD: - nextNode = DecomposeSimd(use); +#ifdef FEATURE_HW_INTRINSICS + case GT_HWINTRINSIC: + nextNode = DecomposeHWIntrinsic(use); break; -#endif // FEATURE_SIMD +#endif // FEATURE_HW_INTRINSICS case GT_LOCKADD: case GT_XORR: @@ -1622,10 +1622,10 @@ GenTree* DecomposeLongs::DecomposeUMod(LIR::Use& use) return FinalizeDecomposition(use, loResult, hiResult, hiResult); } -#ifdef FEATURE_SIMD +#ifdef FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ -// DecomposeSimd: Decompose GT_SIMD. +// DecomposeHWIntrinsic: Decompose GT_HWINTRINSIC. // // Arguments: // use - the LIR::Use object for the def that needs to be decomposed. 
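[Illustration] The gtNewSimdWithElementNode helper declared in the compiler.h hunk above produces a vector with a single lane replaced; at the value level the operation is copy-then-insert. A standalone C++ sketch over a 4 x float vector (the JIT helper itself builds IR nodes instead):

    #include <cstring>

    static void withElement(const float (&vector)[4], int imm8, float value, float (&result)[4])
    {
        std::memcpy(result, vector, sizeof(result)); // start from a copy of the source vector
        result[imm8] = value;                        // replace lane imm8; 0 <= imm8 < count is asserted by the helper
    }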
@@ -1633,22 +1633,21 @@ GenTree* DecomposeLongs::DecomposeUMod(LIR::Use& use) // Return Value: // The next node to process. // -GenTree* DecomposeLongs::DecomposeSimd(LIR::Use& use) +GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use) { - GenTree* tree = use.Def(); - genTreeOps oper = tree->OperGet(); - - assert(oper == GT_SIMD); + GenTree* tree = use.Def(); + assert(tree->OperIs(GT_HWINTRINSIC)); - GenTreeSIMD* simdTree = tree->AsSIMD(); + GenTreeHWIntrinsic* hwintrinsicTree = tree->AsHWIntrinsic(); - switch (simdTree->gtSIMDIntrinsicID) + switch (hwintrinsicTree->gtHWIntrinsicId) { - case SIMDIntrinsicGetItem: - return DecomposeSimdGetItem(use); + case NI_Vector128_GetElement: + case NI_Vector256_GetElement: + return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree); default: - noway_assert(!"unexpected GT_SIMD node in long decomposition"); + noway_assert(!"unexpected GT_HWINTRINSIC node in long decomposition"); break; } @@ -1656,72 +1655,75 @@ GenTree* DecomposeLongs::DecomposeSimd(LIR::Use& use) } //------------------------------------------------------------------------ -// DecomposeSimdGetItem: Decompose GT_SIMD -- SIMDIntrinsicGetItem. +// DecomposeHWIntrinsicGetElement: Decompose GT_HWINTRINSIC -- NI_Vector*_GetElement. // -// Decompose a get[i] node on Vector. For: +// Decompose a get[i] node on Vector*. For: // -// GT_SIMD{get_item}[long](simd_var, index) +// GT_HWINTRINSIC{GetElement}[long](simd_var, index) // // create: // // tmp_simd_var = simd_var // tmp_index = index -// loResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2) -// hiResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1) +// loResult = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, tmp_index * 2) +// hiResult = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, tmp_index * 2 + 1) // return: GT_LONG(loResult, hiResult) // -// This isn't optimal codegen, since SIMDIntrinsicGetItem sometimes requires +// This isn't optimal codegen, since NI_Vector*_GetElement sometimes requires // temps that could be shared, for example. // // Arguments: // use - the LIR::Use object for the def that needs to be decomposed. +// node - the hwintrinsic node to decompose // // Return Value: // The next node to process. 
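[Illustration] A value-level sketch of the decomposition described above, in standalone C++ (assumes the little-endian lane layout of the targets this pass runs on; not JIT code): long element i is read as int elements 2*i and 2*i+1 and recombined, which is what GT_LONG(loResult, hiResult) represents:

    #include <cstdint>

    static int64_t getElementInt64(const int32_t (&vectorAsInts)[4], int index)
    {
        const uint32_t loResult = static_cast<uint32_t>(vectorAsInts[index * 2]);     // GetElement[int](v, i * 2)
        const uint32_t hiResult = static_cast<uint32_t>(vectorAsInts[index * 2 + 1]); // GetElement[int](v, i * 2 + 1)
        return (static_cast<int64_t>(hiResult) << 32) | loResult;                     // GT_LONG(loResult, hiResult)
    }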
// -GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use) +GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node) { - GenTree* tree = use.Def(); - genTreeOps oper = tree->OperGet(); - - assert(oper == GT_SIMD); + assert(node == use.Def()); + assert(varTypeIsLong(node)); + assert((node->gtHWIntrinsicId == NI_Vector128_GetElement) || (node->gtHWIntrinsicId == NI_Vector256_GetElement)); - GenTreeSIMD* simdTree = tree->AsSIMD(); - var_types baseType = simdTree->GetSimdBaseType(); - unsigned simdSize = simdTree->GetSimdSize(); + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); - assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicGetItem); - assert(varTypeIsLong(baseType)); - assert(varTypeIsLong(simdTree)); - assert(varTypeIsSIMD(simdTree->AsOp()->gtOp1->gtType)); - assert(simdTree->AsOp()->gtOp2->gtType == TYP_INT); + assert(varTypeIsLong(simdBaseType)); + assert(varTypeIsSIMD(op1->TypeGet())); + assert(op2->TypeIs(TYP_INT)); - bool indexIsConst = simdTree->AsOp()->gtOp2->IsCnsIntOrI(); + bool indexIsConst = op2->OperIsConst(); ssize_t index = 0; + if (indexIsConst) { - index = simdTree->AsOp()->gtOp2->AsIntCon()->gtIconVal; + index = op2->AsIntCon()->IconValue(); } - GenTree* simdTmpVar = RepresentOpAsLocalVar(simdTree->AsOp()->gtOp1, simdTree, &simdTree->AsOp()->gtOp1); + GenTree* simdTmpVar = RepresentOpAsLocalVar(op1, node, &node->gtOp1); unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum(); - JITDUMP("[DecomposeSimdGetItem]: Saving op1 tree to a temp var:\n"); + JITDUMP("[DecomposeHWIntrinsicGetElement]: Saving op1 tree to a temp var:\n"); DISPTREERANGE(Range(), simdTmpVar); Range().Remove(simdTmpVar); + op1 = node->gtGetOp1(); GenTree* indexTmpVar = nullptr; unsigned indexTmpVarNum = 0; + if (!indexIsConst) { - indexTmpVar = RepresentOpAsLocalVar(simdTree->AsOp()->gtOp2, simdTree, &simdTree->AsOp()->gtOp2); + indexTmpVar = RepresentOpAsLocalVar(op2, node, &node->gtOp2); indexTmpVarNum = indexTmpVar->AsLclVarCommon()->GetLclNum(); - JITDUMP("[DecomposeSimdGetItem]: Saving op2 tree to a temp var:\n"); + JITDUMP("[DecomposeHWIntrinsicGetElement]: Saving op2 tree to a temp var:\n"); DISPTREERANGE(Range(), indexTmpVar); Range().Remove(indexTmpVar); + op2 = node->gtGetOp2(); } // Create: - // loResult = GT_SIMD{get_item}[int](tmp_simd_var, index * 2) + // loResult = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, index * 2) GenTree* simdTmpVar1 = simdTmpVar; GenTree* indexTimesTwo1; @@ -1729,34 +1731,34 @@ GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use) if (indexIsConst) { // Reuse the existing index constant node. 
- indexTimesTwo1 = simdTree->AsOp()->gtOp2; + indexTimesTwo1 = op2; Range().Remove(indexTimesTwo1); - indexTimesTwo1->AsIntCon()->gtIconVal = index * 2; + indexTimesTwo1->AsIntCon()->SetIconValue(index * 2); - Range().InsertBefore(simdTree, simdTmpVar1, indexTimesTwo1); + Range().InsertBefore(node, simdTmpVar1, indexTimesTwo1); } else { GenTree* indexTmpVar1 = indexTmpVar; GenTree* two1 = m_compiler->gtNewIconNode(2, TYP_INT); indexTimesTwo1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar1, two1); - Range().InsertBefore(simdTree, simdTmpVar1, indexTmpVar1, two1, indexTimesTwo1); + Range().InsertBefore(node, simdTmpVar1, indexTmpVar1, two1, indexTimesTwo1); } - GenTree* loResult = m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar1, indexTimesTwo1, SIMDIntrinsicGetItem, - CORINFO_TYPE_INT, simdSize); - Range().InsertBefore(simdTree, loResult); + GenTree* loResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar1, indexTimesTwo1, + node->gtHWIntrinsicId, CORINFO_TYPE_INT, simdSize); + Range().InsertBefore(node, loResult); // Create: - // hiResult = GT_SIMD{get_item}[int](tmp_simd_var, index * 2 + 1) + // hiResult = GT_HWINTRINSIC{GetElement}[int](tmp_simd_var, index * 2 + 1) - GenTree* simdTmpVar2 = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->AsOp()->gtOp1->gtType); + GenTree* simdTmpVar2 = m_compiler->gtNewLclLNode(simdTmpVarNum, op1->TypeGet()); GenTree* indexTimesTwoPlusOne; if (indexIsConst) { indexTimesTwoPlusOne = m_compiler->gtNewIconNode(index * 2 + 1, TYP_INT); - Range().InsertBefore(simdTree, simdTmpVar2, indexTimesTwoPlusOne); + Range().InsertBefore(node, simdTmpVar2, indexTimesTwoPlusOne); } else { @@ -1765,22 +1767,22 @@ GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use) GenTree* indexTimesTwo2 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar2, two2); GenTree* one = m_compiler->gtNewIconNode(1, TYP_INT); indexTimesTwoPlusOne = m_compiler->gtNewOperNode(GT_ADD, TYP_INT, indexTimesTwo2, one); - Range().InsertBefore(simdTree, simdTmpVar2, indexTmpVar2, two2, indexTimesTwo2); - Range().InsertBefore(simdTree, one, indexTimesTwoPlusOne); + Range().InsertBefore(node, simdTmpVar2, indexTmpVar2, two2, indexTimesTwo2); + Range().InsertBefore(node, one, indexTimesTwoPlusOne); } - GenTree* hiResult = m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar2, indexTimesTwoPlusOne, SIMDIntrinsicGetItem, - CORINFO_TYPE_INT, simdSize); - Range().InsertBefore(simdTree, hiResult); + GenTree* hiResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar2, indexTimesTwoPlusOne, + node->gtHWIntrinsicId, CORINFO_TYPE_INT, simdSize); + Range().InsertBefore(node, hiResult); // Done with the original tree; remove it. 
- Range().Remove(simdTree); + Range().Remove(node); return FinalizeDecomposition(use, loResult, hiResult, hiResult); } -#endif // FEATURE_SIMD +#endif // FEATURE_HW_INTRINSICS //------------------------------------------------------------------------ // StoreNodeToVar: Check if the user is a STORE_LCL_VAR, and if it isn't, diff --git a/src/coreclr/jit/decomposelongs.h b/src/coreclr/jit/decomposelongs.h index cc3bddab1287f..42ea8ada70d3c 100644 --- a/src/coreclr/jit/decomposelongs.h +++ b/src/coreclr/jit/decomposelongs.h @@ -55,8 +55,11 @@ class DecomposeLongs GenTree* DecomposeRotate(LIR::Use& use); GenTree* DecomposeMul(LIR::Use& use); GenTree* DecomposeUMod(LIR::Use& use); - GenTree* DecomposeSimd(LIR::Use& use); - GenTree* DecomposeSimdGetItem(LIR::Use& use); + +#ifdef FEATURE_HW_INTRINSICS + GenTree* DecomposeHWIntrinsic(LIR::Use& use); + GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node); +#endif // FEATURE_HW_INTRINSICS // Helper functions GenTree* FinalizeDecomposition(LIR::Use& use, GenTree* loResult, GenTree* hiResult, GenTree* insertResultAfter); diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 5fd16415d4f9c..c22d52bdc49ae 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -8499,8 +8499,58 @@ void emitter::emitDispIns( } else { - attr = id->idOpSize(); - sstr = codeGen->genSizeStr(attr); + emitAttr sizeAttr = id->idOpSize(); + attr = sizeAttr; + + switch (ins) + { + case INS_vextractf128: + case INS_vextracti128: + case INS_vinsertf128: + case INS_vinserti128: + { + sizeAttr = EA_16BYTE; + break; + } + + case INS_pextrb: + case INS_pinsrb: + { + sizeAttr = EA_1BYTE; + break; + } + + case INS_pextrw: + case INS_pextrw_sse41: + case INS_pinsrw: + { + sizeAttr = EA_2BYTE; + break; + } + + case INS_extractps: + case INS_insertps: + case INS_pextrd: + case INS_pinsrd: + { + sizeAttr = EA_4BYTE; + break; + } + + case INS_pextrq: + case INS_pinsrq: + { + sizeAttr = EA_8BYTE; + break; + } + + default: + { + break; + } + } + + sstr = codeGen->genSizeStr(sizeAttr); if (ins == INS_lea) { @@ -9031,6 +9081,36 @@ void emitter::emitDispIns( assert(IsThreeOperandAVXInstruction(ins)); printf("%s, ", emitRegName(id->idReg1(), attr)); printf("%s, ", emitRegName(id->idReg2(), attr)); + + switch (ins) + { + case INS_vinsertf128: + case INS_vinserti128: + { + attr = EA_16BYTE; + break; + } + + case INS_pinsrb: + case INS_pinsrw: + case INS_pinsrd: + { + attr = EA_4BYTE; + break; + } + + case INS_pinsrq: + { + attr = EA_8BYTE; + break; + } + + default: + { + break; + } + } + printf("%s, ", emitRegName(id->idReg3(), attr)); val = emitGetInsSC(id); goto PRINT_CONSTANT; @@ -9044,7 +9124,55 @@ void emitter::emitDispIns( printf("%s", emitRegName(id->idReg4(), attr)); break; case IF_RRW_RRW_CNS: - printf("%s,", emitRegName(id->idReg1(), attr)); + { + emitAttr tgtAttr = attr; + + switch (ins) + { + case INS_vextractf128: + case INS_vextracti128: + { + tgtAttr = EA_16BYTE; + break; + } + + case INS_extractps: + case INS_pextrb: + case INS_pextrw: + case INS_pextrw_sse41: + case INS_pextrd: + { + tgtAttr = EA_4BYTE; + break; + } + + case INS_pextrq: + { + tgtAttr = EA_8BYTE; + break; + } + + case INS_pinsrb: + case INS_pinsrw: + case INS_pinsrd: + { + attr = EA_4BYTE; + break; + } + + case INS_pinsrq: + { + attr = EA_8BYTE; + break; + } + + default: + { + break; + } + } + + printf("%s,", emitRegName(id->idReg1(), tgtAttr)); printf(" %s", emitRegName(id->idReg2(), attr)); val = emitGetInsSC(id); #ifdef TARGET_AMD64 @@ 
-9061,6 +9189,7 @@ void emitter::emitDispIns( goto PRINT_CONSTANT; } break; + } case IF_RRD: case IF_RWR: diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index b55131194b71a..cb19c0600abbb 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19266,6 +19266,170 @@ GenTreeHWIntrinsic* Compiler::gtNewSimdCreateBroadcastNode( return gtNewSimdHWIntrinsicNode(type, op1, hwIntrinsicID, simdBaseJitType, simdSize); } +GenTreeHWIntrinsic* Compiler::gtNewSimdGetElementNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic) +{ + NamedIntrinsic intrinsicId = NI_Vector128_GetElement; + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + + assert(varTypeIsArithmetic(simdBaseType)); + +#if defined(TARGET_XARCH) + switch (simdBaseType) + { + // Using software fallback if simdBaseType is not supported by hardware + case TYP_BYTE: + case TYP_UBYTE: + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: + assert(compIsaSupportedDebugOnly(InstructionSet_SSE41)); + break; + + case TYP_DOUBLE: + case TYP_FLOAT: + case TYP_SHORT: + case TYP_USHORT: + assert(compIsaSupportedDebugOnly(InstructionSet_SSE2)); + break; + + default: + unreached(); + } + + if (simdSize == 32) + { + intrinsicId = NI_Vector256_GetElement; + } +#elif defined(TARGET_ARM64) + if (simdSize == 8) + { + intrinsicId = NI_Vector64_GetElement; + } +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 + + int immUpperBound = getSIMDVectorLength(simdSize, simdBaseType) - 1; + bool rangeCheckNeeded = !op2->OperIsConst(); + + if (!rangeCheckNeeded) + { + ssize_t imm8 = op2->AsIntCon()->IconValue(); + rangeCheckNeeded = (imm8 < 0) || (imm8 > immUpperBound); + } + + if (rangeCheckNeeded) + { + op2 = addRangeCheckForHWIntrinsic(op2, 0, immUpperBound); + } + + if (isSimdAsHWIntrinsic) + { + return gtNewSimdAsHWIntrinsicNode(type, op1, op2, intrinsicId, simdBaseJitType, simdSize); + } + + return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsicId, simdBaseJitType, simdSize); +} + +GenTreeHWIntrinsic* Compiler::gtNewSimdWithElementNode(var_types type, + GenTree* op1, + GenTree* op2, + GenTree* op3, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic) +{ + NamedIntrinsic hwIntrinsicID = NI_Vector128_WithElement; + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + + assert(varTypeIsArithmetic(simdBaseType)); + assert(op2->OperIsConst()); + + ssize_t imm8 = op2->AsIntCon()->IconValue(); + ssize_t count = simdSize / genTypeSize(simdBaseType); + + assert(0 <= imm8 && imm8 < count); + +#if defined(TARGET_XARCH) + switch (simdBaseType) + { + // Using software fallback if simdBaseType is not supported by hardware + case TYP_BYTE: + case TYP_UBYTE: + case TYP_INT: + case TYP_UINT: + assert(compIsaSupportedDebugOnly(InstructionSet_SSE41)); + break; + + case TYP_LONG: + case TYP_ULONG: + assert(compIsaSupportedDebugOnly(InstructionSet_SSE41_X64)); + break; + + case TYP_DOUBLE: + case TYP_FLOAT: + case TYP_SHORT: + case TYP_USHORT: + assert(compIsaSupportedDebugOnly(InstructionSet_SSE2)); + break; + + default: + unreached(); + } + + if (simdSize == 32) + { + hwIntrinsicID = NI_Vector256_WithElement; + } +#elif defined(TARGET_ARM64) + switch (simdBaseType) + { + case TYP_LONG: + case TYP_ULONG: + case TYP_DOUBLE: + if (simdSize == 8) + { + if (isSimdAsHWIntrinsic) + { + return gtNewSimdAsHWIntrinsicNode(type, op3, NI_Vector64_Create, 
simdBaseJitType, simdSize); + } + + return gtNewSimdHWIntrinsicNode(type, op3, NI_Vector64_Create, simdBaseJitType, simdSize); + } + break; + + case TYP_FLOAT: + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + break; + + default: + unreached(); + } + + hwIntrinsicID = NI_AdvSimd_Insert; +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 + + if (isSimdAsHWIntrinsic) + { + return gtNewSimdAsHWIntrinsicNode(type, op1, op2, op3, hwIntrinsicID, simdBaseJitType, simdSize); + } + + return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, hwIntrinsicID, simdBaseJitType, simdSize); +} + GenTreeHWIntrinsic* Compiler::gtNewScalarHWIntrinsicNode(var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID) { SetOpLclRelatedToSIMDIntrinsic(op1); diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index ea13b509e54ce..f659dd40a669b 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -578,36 +578,7 @@ GenTree* Compiler::addRangeCheckIfNeeded( assert(!immOp->IsCnsIntOrI()); assert(varTypeIsUnsigned(immOp)); - // Bounds check for value of an immediate operand - // (immLowerBound <= immOp) && (immOp <= immUpperBound) - // - // implemented as a single comparison in the form of - // - // if ((immOp - immLowerBound) >= (immUpperBound - immLowerBound + 1)) - // { - // throw new ArgumentOutOfRangeException(); - // } - // - // The value of (immUpperBound - immLowerBound + 1) is denoted as adjustedUpperBound. - - const ssize_t adjustedUpperBound = (ssize_t)immUpperBound - immLowerBound + 1; - GenTree* adjustedUpperBoundNode = gtNewIconNode(adjustedUpperBound, TYP_INT); - - GenTree* immOpDup = nullptr; - - immOp = impCloneExpr(immOp, &immOpDup, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL, - nullptr DEBUGARG("Clone an immediate operand for immediate value bounds check")); - - if (immLowerBound != 0) - { - immOpDup = gtNewOperNode(GT_SUB, TYP_INT, immOpDup, gtNewIconNode(immLowerBound, TYP_INT)); - } - - GenTreeBoundsChk* hwIntrinsicChk = new (this, GT_HW_INTRINSIC_CHK) - GenTreeBoundsChk(GT_HW_INTRINSIC_CHK, TYP_VOID, immOpDup, adjustedUpperBoundNode, SCK_RNGCHK_FAIL); - hwIntrinsicChk->gtThrowKind = SCK_ARG_RNG_EXCPN; - - return gtNewOperNode(GT_COMMA, immOp->TypeGet(), hwIntrinsicChk, immOp); + return addRangeCheckForHWIntrinsic(immOp, immLowerBound, immUpperBound); } else { @@ -615,6 +586,52 @@ GenTree* Compiler::addRangeCheckIfNeeded( } } +//------------------------------------------------------------------------ +// addRangeCheckForHWIntrinsic: add a GT_HW_INTRINSIC_CHK node for an intrinsic +// +// Arguments: +// immOp -- the immediate operand of the intrinsic +// immLowerBound -- lower incl. bound for a value of the immediate operand (for a non-full-range imm-intrinsic) +// immUpperBound -- upper incl. 
bound for a value of the immediate operand (for a non-full-range imm-intrinsic) +// +// Return Value: +// add a GT_HW_INTRINSIC_CHK node for non-full-range imm-intrinsic, which would throw ArgumentOutOfRangeException +// when the imm-argument is not in the valid range +// +GenTree* Compiler::addRangeCheckForHWIntrinsic(GenTree* immOp, int immLowerBound, int immUpperBound) +{ + // Bounds check for value of an immediate operand + // (immLowerBound <= immOp) && (immOp <= immUpperBound) + // + // implemented as a single comparison in the form of + // + // if ((immOp - immLowerBound) >= (immUpperBound - immLowerBound + 1)) + // { + // throw new ArgumentOutOfRangeException(); + // } + // + // The value of (immUpperBound - immLowerBound + 1) is denoted as adjustedUpperBound. + + const ssize_t adjustedUpperBound = (ssize_t)immUpperBound - immLowerBound + 1; + GenTree* adjustedUpperBoundNode = gtNewIconNode(adjustedUpperBound, TYP_INT); + + GenTree* immOpDup = nullptr; + + immOp = impCloneExpr(immOp, &immOpDup, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone an immediate operand for immediate value bounds check")); + + if (immLowerBound != 0) + { + immOpDup = gtNewOperNode(GT_SUB, TYP_INT, immOpDup, gtNewIconNode(immLowerBound, TYP_INT)); + } + + GenTreeBoundsChk* hwIntrinsicChk = new (this, GT_HW_INTRINSIC_CHK) + GenTreeBoundsChk(GT_HW_INTRINSIC_CHK, TYP_VOID, immOpDup, adjustedUpperBoundNode, SCK_RNGCHK_FAIL); + hwIntrinsicChk->gtThrowKind = SCK_ARG_RNG_EXCPN; + + return gtNewOperNode(GT_COMMA, immOp->TypeGet(), hwIntrinsicChk, immOp); +} + //------------------------------------------------------------------------ // compSupportsHWIntrinsic: check whether a given instruction is enabled via configuration // diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 4f57bc38aceb5..41c96e3156d2f 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -256,8 +256,6 @@ void HWIntrinsicInfo::lookupImmBounds( case NI_AdvSimd_StoreSelectedScalar: case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128: case NI_AdvSimd_Arm64_InsertSelectedScalar: - case NI_Vector64_GetElement: - case NI_Vector128_GetElement: immUpperBound = Compiler::getSIMDVectorLength(simdSize, baseType) - 1; break; @@ -418,6 +416,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector64_GetElement: + case NI_Vector128_GetElement: + { + assert(!sig->hasThis()); + assert(numArgs == 2); + + if (!featureSIMD || !compExactlyDependsOn(InstructionSet_AdvSimd)) + { + return nullptr; + } + + op2 = impPopStack().val; + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + + const bool isSimdAsHWIntrinsic = true; + retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + break; + } + case NI_Vector64_get_Zero: case NI_Vector64_get_AllBitsSet: case NI_Vector128_get_Zero: @@ -454,38 +471,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, impPopStack(); // pop the indexOp that we already have. 
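[Illustration] The comment in addRangeCheckForHWIntrinsic above describes folding the two-sided bounds check into one comparison: (immLowerBound <= immOp) && (immOp <= immUpperBound) becomes a single unsigned compare against the adjusted upper bound. A standalone C++ sketch of the trick (not the JIT helper, which builds a GT_HW_INTRINSIC_CHK node instead of throwing directly):

    #include <cstdint>
    #include <stdexcept>

    static int32_t rangeCheckedImm(int32_t immOp, int32_t immLowerBound, int32_t immUpperBound)
    {
        const uint32_t adjustedUpperBound = static_cast<uint32_t>(immUpperBound - immLowerBound + 1);

        // immOp < immLowerBound wraps to a large unsigned value, so one
        // comparison catches both bound violations.
        if (static_cast<uint32_t>(immOp - immLowerBound) >= adjustedUpperBound)
        {
            throw std::out_of_range("imm-argument out of range"); // SCK_ARG_RNG_EXCPN analogue
        }
        return immOp;
    }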
GenTree* vectorOp = impSIMDPopStack(getSIMDTypeForSize(simdSize)); - switch (simdBaseType) - { - case TYP_LONG: - case TYP_ULONG: - case TYP_DOUBLE: - if (simdSize == 16) - { - retNode = gtNewSimdHWIntrinsicNode(retType, vectorOp, gtNewIconNode(imm8), valueOp, - NI_AdvSimd_Insert, simdBaseJitType, simdSize); - } - else - { - retNode = - gtNewSimdHWIntrinsicNode(retType, valueOp, NI_Vector64_Create, simdBaseJitType, simdSize); - } - break; - - case TYP_FLOAT: - case TYP_BYTE: - case TYP_UBYTE: - case TYP_SHORT: - case TYP_USHORT: - case TYP_INT: - case TYP_UINT: - retNode = gtNewSimdHWIntrinsicNode(retType, vectorOp, gtNewIconNode(imm8), valueOp, - NI_AdvSimd_Insert, simdBaseJitType, simdSize); - break; - - default: - return nullptr; - } - + retNode = gtNewSimdWithElementNode(retType, vectorOp, indexOp, valueOp, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); break; } diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index 8ae7457dbc200..3352c9ba59571 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -796,15 +796,127 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Vector64_GetElement: case NI_Vector128_GetElement: - case NI_Vector64_ToScalar: - case NI_Vector128_ToScalar: { - ssize_t indexValue = 0; - if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement)) + assert(intrin.numOperands == 2); + + var_types simdType = Compiler::getSIMDTypeForSize(node->GetSimdSize()); + + if (simdType == TYP_SIMD12) + { + // op1 of TYP_SIMD12 should be considered as TYP_SIMD16 + simdType = TYP_SIMD16; + } + + if (!intrin.op2->OperIsConst()) + { + assert(!intrin.op2->isContained()); + + emitAttr baseTypeSize = emitTypeSize(intrin.baseType); + unsigned baseTypeScale = genLog2(EA_SIZE_IN_BYTES(baseTypeSize)); + + regNumber baseReg; + regNumber indexReg = op2Reg; + + // Optimize the case of op1 is in memory and trying to access ith element. + if (!intrin.op1->isUsedFromReg()) + { + assert(intrin.op1->isContained()); + + if (intrin.op1->OperIsLocal()) + { + unsigned varNum = intrin.op1->AsLclVarCommon()->GetLclNum(); + baseReg = node->ExtractTempReg(); + + // Load the address of varNum + GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, baseReg, varNum, 0); + } + else + { + // Require GT_IND addr to be not contained. 
+                        assert(intrin.op1->OperIs(GT_IND));
+
+                        GenTree* addr = intrin.op1->AsIndir()->Addr();
+                        assert(!addr->isContained());
+                        baseReg = addr->GetRegNum();
+                    }
+                }
+                else
+                {
+                    unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
+                    noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
+
+                    baseReg = node->ExtractTempReg();
+
+                    // Load the address of simdInitTempVarNum
+                    GetEmitter()->emitIns_R_S(INS_lea, EA_PTRSIZE, baseReg, simdInitTempVarNum, 0);
+
+                    // Store the vector to simdInitTempVarNum
+                    GetEmitter()->emitIns_R_R(INS_str, emitTypeSize(simdType), op1Reg, baseReg);
+                }
+
+                assert(genIsValidIntReg(indexReg));
+                assert(genIsValidIntReg(baseReg));
+                assert(baseReg != indexReg);
+
+                // Load item at baseReg[index]
+                GetEmitter()->emitIns_R_R_R_Ext(ins_Load(intrin.baseType), baseTypeSize, targetReg, baseReg,
+                                                indexReg, INS_OPTS_LSL, baseTypeScale);
+            }
+            else if (!GetEmitter()->isValidVectorIndex(emitTypeSize(simdType), emitTypeSize(intrin.baseType),
+                                                       intrin.op2->AsIntCon()->IconValue()))
+            {
+                // We only need to generate code for the get if the index is valid
+                // If the index is invalid, the code previously generated for the range check will throw
+            }
+            else if (!intrin.op1->isUsedFromReg())
+            {
+                assert(intrin.op1->isContained());
+                assert(intrin.op2->IsCnsIntOrI());
+
+                int         offset = (int)intrin.op2->AsIntCon()->IconValue() * genTypeSize(intrin.baseType);
+                instruction ins    = ins_Load(intrin.baseType);
+
+                assert(!intrin.op1->isUsedFromReg());
+
+                if (intrin.op1->OperIsLocal())
+                {
+                    unsigned varNum = intrin.op1->AsLclVarCommon()->GetLclNum();
+                    GetEmitter()->emitIns_R_S(ins, emitActualTypeSize(intrin.baseType), targetReg, varNum, offset);
+                }
+                else
+                {
+                    assert(intrin.op1->OperIs(GT_IND));
+
+                    GenTree* addr = intrin.op1->AsIndir()->Addr();
+                    assert(!addr->isContained());
+                    regNumber baseReg = addr->GetRegNum();
+
+                    // ldr targetReg, [baseReg, #offset]
+                    GetEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(intrin.baseType), targetReg, baseReg,
+                                                offset);
+                }
+            }
+            else
             {
                 assert(intrin.op2->IsCnsIntOrI());
-                indexValue = intrin.op2->AsIntCon()->gtIconVal;
+                ssize_t indexValue = intrin.op2->AsIntCon()->IconValue();
+
+                // no-op if vector is float/double, targetReg == op1Reg and fetching for 0th index.
+                if ((varTypeIsFloating(intrin.baseType) && (targetReg == op1Reg) && (indexValue == 0)))
+                {
+                    break;
+                }
+
+                GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(intrin.baseType), targetReg, op1Reg, indexValue,
+                                            INS_OPTS_NONE);
             }
+            break;
+        }
+
+        case NI_Vector64_ToScalar:
+        case NI_Vector128_ToScalar:
+        {
+            const ssize_t indexValue = 0;
 
             // no-op if vector is float/double, targetReg == op1Reg and fetching for 0th index.
if ((varTypeIsFloating(intrin.baseType) && (targetReg == op1Reg) && (indexValue == 0))) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 9680e91fde384..be6e5779269ad 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -226,13 +226,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2)) { assert(ival == -1); - - if (intrinsicId == NI_SSE2_Extract) - { - // extract instructions return to GP-registers, so it needs int size as the emitsize - simdSize = emitTypeSize(TYP_INT); - } - auto emitSwCase = [&](int8_t i) { genHWIntrinsic_R_RM_I(node, ins, i); }; if (op2->IsCnsIntOrI()) @@ -1146,15 +1139,15 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node) assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE)); GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); genConsumeHWIntrinsicOperands(node); regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->GetRegNum(); - assert(node->gtGetOp2() == nullptr); - - emitter* emit = GetEmitter(); - emitAttr attr = emitActualTypeSize(Compiler::getSIMDTypeForSize(node->GetSimdSize())); - instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); + emitter* emit = GetEmitter(); + var_types simdType = Compiler::getSIMDTypeForSize(node->GetSimdSize()); + emitAttr attr = emitActualTypeSize(simdType); + instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); switch (intrinsicId) { @@ -1184,6 +1177,160 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Vector128_GetElement: + case NI_Vector256_GetElement: + { + if (simdType == TYP_SIMD12) + { + // op1 of TYP_SIMD12 should be considered as TYP_SIMD16 + simdType = TYP_SIMD16; + } + + // Optimize the case of op1 is in memory and trying to access ith element. + if (!op1->isUsedFromReg()) + { + assert(op1->isContained()); + + regNumber baseReg; + regNumber indexReg; + int offset = 0; + + if (op1->OperIsLocal()) + { + // There are three parts to the total offset here: + // {offset of local} + {offset of vector field (lclFld only)} + {offset of element within vector}. + bool isEBPbased; + unsigned varNum = op1->AsLclVarCommon()->GetLclNum(); + offset += compiler->lvaFrameAddress(varNum, &isEBPbased); + +#if !FEATURE_FIXED_OUT_ARGS + if (!isEBPbased) + { + // Adjust the offset by the amount currently pushed on the CPU stack + offset += genStackLevel; + } +#else + assert(genStackLevel == 0); +#endif // !FEATURE_FIXED_OUT_ARGS + + if (op1->OperIs(GT_LCL_FLD)) + { + offset += op1->AsLclFld()->GetLclOffs(); + } + baseReg = (isEBPbased) ? REG_EBP : REG_ESP; + } + else + { + // Require GT_IND addr to be not contained. + assert(op1->OperIs(GT_IND)); + + GenTree* addr = op1->AsIndir()->Addr(); + assert(!addr->isContained()); + baseReg = addr->GetRegNum(); + } + + if (op2->OperIsConst()) + { + assert(op2->isContained()); + indexReg = REG_NA; + offset += (int)op2->AsIntCon()->IconValue() * genTypeSize(baseType); + } + else + { + indexReg = op2->GetRegNum(); + assert(genIsValidIntReg(indexReg)); + } + + // Now, load the desired element. 
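[Illustration] The memory path above assembles the address for the load below from three parts. A standalone C++ sketch of that arithmetic (parameter names are illustrative, not JIT fields):

    static int elementStackOffset(int frameOffsetOfLocal, // lvaFrameAddress(varNum, &isEBPbased), plus any genStackLevel adjustment
                                  int vectorFieldOffset,  // AsLclFld()->GetLclOffs(); 0 for a plain local or GT_IND base
                                  int elementIndex,       // contained constant op2; 0 when the index stays in a register
                                  int elementSize)        // genTypeSize(baseType)
    {
        // {offset of local} + {offset of vector field (lclFld only)} + {offset of element within vector}
        return frameOffsetOfLocal + vectorFieldOffset + elementIndex * elementSize;
    }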
+ GetEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load + emitTypeSize(baseType), // Of the vector baseType + targetReg, // To targetReg + baseReg, // Base Reg + indexReg, // Indexed + genTypeSize(baseType), // by the size of the baseType + offset); + } + else if (op2->OperIsConst()) + { + assert(intrinsicId == NI_Vector128_GetElement); + assert(varTypeIsFloating(baseType)); + assert(op1Reg != REG_NA); + + ssize_t ival = op2->AsIntCon()->IconValue(); + + if (baseType == TYP_FLOAT) + { + if (ival == 1) + { + if (compiler->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + { + emit->emitIns_R_R(INS_movshdup, attr, targetReg, op1Reg); + } + else + { + emit->emitIns_SIMD_R_R_R_I(INS_shufps, attr, targetReg, op1Reg, op1Reg, + static_cast(0x55)); + } + } + else if (ival == 2) + { + emit->emitIns_SIMD_R_R_R(INS_unpckhps, attr, targetReg, op1Reg, op1Reg); + } + else + { + assert(ival == 3); + emit->emitIns_SIMD_R_R_R_I(INS_shufps, attr, targetReg, op1Reg, op1Reg, + static_cast(0xFF)); + } + } + else + { + assert(baseType == TYP_DOUBLE); + assert(ival == 1); + emit->emitIns_SIMD_R_R_R(INS_unpckhpd, attr, targetReg, op1Reg, op1Reg); + } + } + else + { + // We don't have an instruction to implement this intrinsic if the index is not a constant. + // So we will use the SIMD temp location to store the vector, and the load the desired element. + // The range check will already have been performed, so at this point we know we have an index + // within the bounds of the vector. + + unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum; + noway_assert(simdInitTempVarNum != BAD_VAR_NUM); + + bool isEBPbased; + unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased); + +#if !FEATURE_FIXED_OUT_ARGS + if (!isEBPbased) + { + // Adjust the offset by the amount currently pushed on the CPU stack + offs += genStackLevel; + } +#else + assert(genStackLevel == 0); +#endif // !FEATURE_FIXED_OUT_ARGS + + regNumber indexReg = op2->GetRegNum(); + + // Store the vector to the temp location. + GetEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)), + emitTypeSize(simdType), op1Reg, simdInitTempVarNum, 0); + + // Now, load the desired element. + GetEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load + emitTypeSize(baseType), // Of the vector baseType + targetReg, // To targetReg + (isEBPbased) ? 
REG_EBP : REG_ESP, // Stack-based + indexReg, // Indexed + genTypeSize(baseType), // by the size of the baseType + offs); + } + break; + } + case NI_Vector128_ToScalar: case NI_Vector256_ToScalar: { @@ -1543,25 +1690,12 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node) case NI_SSE41_Extract: case NI_SSE41_X64_Extract: { - regNumber tmpTargetReg = REG_NA; - instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); - if (baseType == TYP_FLOAT) - { - tmpTargetReg = node->ExtractTempReg(); - } + assert(!varTypeIsFloating(baseType)); - auto emitSwCase = [&](int8_t i) { - if (baseType == TYP_FLOAT) - { - // extract instructions return to GP-registers, so it needs int size as the emitsize - inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i); - emit->emitIns_Mov(INS_movd, EA_4BYTE, targetReg, tmpTargetReg, /* canSkip */ false); - } - else - { - inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i); - } - }; + instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); + emitAttr attr = emitActualTypeSize(node->TypeGet()); + + auto emitSwCase = [&](int8_t i) { inst_RV_TT_IV(ins, attr, targetReg, op1, i); }; if (op2->IsCnsIntOrI()) { diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index da9f788db4989..71c3f56121a1e 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -32,7 +32,7 @@ HARDWARE_INTRINSIC(Vector64, Dot, HARDWARE_INTRINSIC(Vector64, get_AllBitsSet, 8, 0, {INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, get_Count, 8, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, get_Zero, 8, 0, {INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Vector64, GetElement, 8, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_NoJmpTableIMM|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector64, GetElement, 8, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector64, op_Equality, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, op_Inequality, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, ToScalar, 8, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen) @@ -65,7 +65,7 @@ HARDWARE_INTRINSIC(Vector128, Dot, HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni}, 
HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_NoJmpTableIMM|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, GetLower, 16, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector128, GetUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, op_Equality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index e5aec99897e4c..82d9a4356b2cd 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -48,7 +48,7 @@ HARDWARE_INTRINSIC(Vector128, Dot, HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, op_Equality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) 
HARDWARE_INTRINSIC(Vector128, op_Inequality, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -80,7 +80,7 @@ HARDWARE_INTRINSIC(Vector256, get_Zero, HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, GetLower, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, op_Equality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, op_Inequality, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 73a43ab0584c7..0155e4120b50e 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -978,14 +978,29 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_WithElement: { assert(sig->numArgs == 3); - GenTree* indexOp = impStackTop(1).val; - if (!compExactlyDependsOn(InstructionSet_SSE2) || !varTypeIsArithmetic(simdBaseType) || - !indexOp->OperIsConst()) + + if (!compExactlyDependsOn(InstructionSet_SSE2) || !varTypeIsArithmetic(simdBaseType)) { // Using software fallback if // 1. JIT/hardware don't support SSE2 instructions - // 2. simdBaseType is not a numeric type (throw execptions) - // 3. index is not a constant + // 2. 
simdBaseType is not a numeric type (throw exceptions) + return nullptr; + } + + GenTree* indexOp = impStackTop(1).val; + + if (!indexOp->OperIsConst()) + { + // Index is not a constant, use the software fallback + return nullptr; + } + + ssize_t imm8 = indexOp->AsIntCon()->IconValue(); + ssize_t count = simdSize / genTypeSize(simdBaseType); + + if (imm8 >= count || imm8 < 0) + { + // Using software fallback if index is out of range (throw exeception) return nullptr; } @@ -1018,177 +1033,15 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; default: - return nullptr; - } - - ssize_t imm8 = indexOp->AsIntCon()->IconValue(); - ssize_t cachedImm8 = imm8; - ssize_t count = simdSize / genTypeSize(simdBaseType); - - if (imm8 >= count || imm8 < 0) - { - // Using software fallback if index is out of range (throw exeception) - return nullptr; + unreached(); } GenTree* valueOp = impPopStack().val; - impPopStack(); // pops the indexOp that we already have. + impPopStack(); // Pop the indexOp now that we know its valid GenTree* vectorOp = impSIMDPopStack(getSIMDTypeForSize(simdSize)); - GenTree* clonedVectorOp = nullptr; - - if (simdSize == 32) - { - // Extract the half vector that will be modified - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); - - // copy `vectorOp` to accept the modified half vector - vectorOp = impCloneExpr(vectorOp, &clonedVectorOp, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL, - nullptr DEBUGARG("Clone Vector for Vector256.WithElement")); - - if (imm8 >= count / 2) - { - imm8 -= count / 2; - vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(1), NI_AVX_ExtractVector128, - simdBaseJitType, simdSize); - } - else - { - vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, NI_Vector256_GetLower, simdBaseJitType, - simdSize); - } - } - - GenTree* immNode = gtNewIconNode(imm8); - - switch (simdBaseType) - { - case TYP_LONG: - case TYP_ULONG: - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, immNode, NI_SSE41_X64_Insert, - simdBaseJitType, 16); - break; - - case TYP_FLOAT: - { - if (!compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - // Emulate Vector128.WithElement by SSE instructions - if (imm8 == 0) - { - // vector.WithElement(0, value) - // => - // movss xmm0, xmm1 (xmm0 = vector, xmm1 = value) - valueOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe, - CORINFO_TYPE_FLOAT, 16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, NI_SSE_MoveScalar, - CORINFO_TYPE_FLOAT, 16); - } - else if (imm8 == 1) - { - // vector.WithElement(1, value) - // => - // shufps xmm1, xmm0, 0 (xmm0 = vector, xmm1 = value) - // shufps xmm1, xmm0, 226 - GenTree* tmpOp = - gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe, - CORINFO_TYPE_FLOAT, 16); - GenTree* dupVectorOp = nullptr; - vectorOp = impCloneExpr(vectorOp, &dupVectorOp, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL, - nullptr DEBUGARG("Clone Vector for Vector128.WithElement")); - tmpOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmpOp, vectorOp, gtNewIconNode(0), - NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmpOp, dupVectorOp, gtNewIconNode(226), - NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16); - } - else - { - ssize_t controlBits1 = 0; - ssize_t controlBits2 = 0; - if (imm8 == 2) - { - controlBits1 = 48; - controlBits2 = 132; - } - else - { - controlBits1 = 32; - controlBits2 = 36; - } - // vector.WithElement(2, value) - // => - // shufps xmm1, xmm0, 48 (xmm0 
= vector, xmm1 = value) - // shufps xmm0, xmm1, 132 - // - // vector.WithElement(3, value) - // => - // shufps xmm1, xmm0, 32 (xmm0 = vector, xmm1 = value) - // shufps xmm0, xmm1, 36 - GenTree* tmpOp = - gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe, - CORINFO_TYPE_FLOAT, 16); - GenTree* dupVectorOp = nullptr; - vectorOp = impCloneExpr(vectorOp, &dupVectorOp, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL, - nullptr DEBUGARG("Clone Vector for Vector128.WithElement")); - valueOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, tmpOp, gtNewIconNode(controlBits1), - NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16); - retNode = - gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, dupVectorOp, gtNewIconNode(controlBits2), - NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16); - } - break; - } - else - { - valueOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe, - CORINFO_TYPE_FLOAT, 16); - immNode->AsIntCon()->SetIconValue(imm8 * 16); - FALLTHROUGH; - } - } - - case TYP_BYTE: - case TYP_UBYTE: - case TYP_INT: - case TYP_UINT: - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, immNode, NI_SSE41_Insert, - simdBaseJitType, 16); - break; - - case TYP_SHORT: - case TYP_USHORT: - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, immNode, NI_SSE2_Insert, - simdBaseJitType, 16); - break; - - case TYP_DOUBLE: - { - // vector.WithElement(0, value) - // => - // movsd xmm0, xmm1 (xmm0 = vector, xmm1 = value) - // - // vector.WithElement(1, value) - // => - // unpcklpd xmm0, xmm1 (xmm0 = vector, xmm1 = value) - valueOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, valueOp, NI_Vector128_CreateScalarUnsafe, - CORINFO_TYPE_DOUBLE, 16); - NamedIntrinsic in = (imm8 == 0) ? NI_SSE2_MoveScalar : NI_SSE2_UnpackLow; - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, valueOp, in, CORINFO_TYPE_DOUBLE, 16); - break; - } - - default: - return nullptr; - } - - if (simdSize == 32) - { - assert(clonedVectorOp); - int upperOrLower = (cachedImm8 >= count / 2) ? 1 : 0; - retNode = gtNewSimdHWIntrinsicNode(retType, clonedVectorOp, retNode, gtNewIconNode(upperOrLower), - NI_AVX_InsertVector128, simdBaseJitType, simdSize); - } - + retNode = gtNewSimdWithElementNode(retType, vectorOp, indexOp, valueOp, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); break; } @@ -1205,14 +1058,12 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_GetElement: { assert(sig->numArgs == 2); - GenTree* indexOp = impStackTop().val; - if (!compExactlyDependsOn(InstructionSet_SSE2) || !varTypeIsArithmetic(simdBaseType) || - !indexOp->OperIsConst()) + + if (!compExactlyDependsOn(InstructionSet_SSE2) || !varTypeIsArithmetic(simdBaseType)) { // Using software fallback if // 1. JIT/hardware don't support SSE2 instructions // 2. simdBaseType is not a numeric type (throw execptions) - // 3. 
index is not a constant return nullptr; } @@ -1223,15 +1074,9 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case TYP_UBYTE: case TYP_INT: case TYP_UINT: - if (!compExactlyDependsOn(InstructionSet_SSE41)) - { - return nullptr; - } - break; - case TYP_LONG: case TYP_ULONG: - if (!compExactlyDependsOn(InstructionSet_SSE41_X64)) + if (!compExactlyDependsOn(InstructionSet_SSE41)) { return nullptr; } @@ -1245,144 +1090,14 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; default: - break; - } - - ssize_t imm8 = indexOp->AsIntCon()->IconValue(); - ssize_t count = simdSize / genTypeSize(simdBaseType); - - if (imm8 >= count || imm8 < 0) - { - // Using software fallback if index is out of range (throw exeception) - return nullptr; - } - - impPopStack(); - GenTree* vectorOp = impSIMDPopStack(getSIMDTypeForSize(simdSize)); - NamedIntrinsic resIntrinsic = NI_Illegal; - - if (simdSize == 32) - { - assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); - - if (imm8 >= count / 2) - { - imm8 -= count / 2; - vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(1), NI_AVX_ExtractVector128, - simdBaseJitType, simdSize); - } - else - { - vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, NI_Vector256_GetLower, simdBaseJitType, - simdSize); - } - } - - if (imm8 == 0 && (genTypeSize(simdBaseType) >= 4)) - { - switch (simdBaseType) - { - case TYP_LONG: - resIntrinsic = NI_SSE2_X64_ConvertToInt64; - break; - - case TYP_ULONG: - resIntrinsic = NI_SSE2_X64_ConvertToUInt64; - break; - - case TYP_INT: - resIntrinsic = NI_SSE2_ConvertToInt32; - break; - - case TYP_UINT: - resIntrinsic = NI_SSE2_ConvertToUInt32; - break; - - case TYP_FLOAT: - case TYP_DOUBLE: - resIntrinsic = NI_Vector128_ToScalar; - break; - - default: - return nullptr; - } - - return gtNewSimdHWIntrinsicNode(retType, vectorOp, resIntrinsic, simdBaseJitType, 16); + unreached(); } - GenTree* immNode = gtNewIconNode(imm8); - - switch (simdBaseType) - { - case TYP_LONG: - case TYP_ULONG: - retNode = - gtNewSimdHWIntrinsicNode(retType, vectorOp, immNode, NI_SSE41_X64_Extract, simdBaseJitType, 16); - break; - - case TYP_FLOAT: - { - if (!compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - assert(imm8 >= 1); - assert(imm8 <= 3); - // Emulate Vector128.GetElement(i) by SSE instructions - // vector.GetElement(i) - // => - // shufps xmm0, xmm0, control - // (xmm0 = vector, control = i + 228) - immNode->AsIntCon()->SetIconValue(228 + imm8); - GenTree* clonedVectorOp = nullptr; - vectorOp = impCloneExpr(vectorOp, &clonedVectorOp, NO_CLASS_HANDLE, (unsigned)CHECK_SPILL_ALL, - nullptr DEBUGARG("Clone Vector for Vector128.GetElement")); - vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, clonedVectorOp, immNode, - NI_SSE_Shuffle, CORINFO_TYPE_FLOAT, 16); - return gtNewSimdHWIntrinsicNode(retType, vectorOp, NI_Vector128_ToScalar, CORINFO_TYPE_FLOAT, - 16); - } - FALLTHROUGH; - } - - case TYP_UBYTE: - case TYP_INT: - case TYP_UINT: - retNode = - gtNewSimdHWIntrinsicNode(retType, vectorOp, immNode, NI_SSE41_Extract, simdBaseJitType, 16); - break; - - case TYP_BYTE: - // We do not have SSE41/SSE2 Extract APIs on signed small int, so need a CAST on the result - retNode = gtNewSimdHWIntrinsicNode(TYP_UBYTE, vectorOp, immNode, NI_SSE41_Extract, - CORINFO_TYPE_UBYTE, 16); - retNode = gtNewCastNode(TYP_INT, retNode, true, TYP_BYTE); - break; - - case TYP_SHORT: - case TYP_USHORT: - // We do not have SSE41/SSE2 Extract APIs on signed small int, so need a CAST on the result - retNode = 
gtNewSimdHWIntrinsicNode(TYP_USHORT, vectorOp, immNode, NI_SSE2_Extract, - CORINFO_TYPE_USHORT, 16); - if (simdBaseType == TYP_SHORT) - { - retNode = gtNewCastNode(TYP_INT, retNode, true, TYP_SHORT); - } - break; - - case TYP_DOUBLE: - assert(imm8 == 1); - // vector.GetElement(1) - // => - // pshufd xmm1, xmm0, 0xEE (xmm0 = vector) - vectorOp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(0xEE), NI_SSE2_Shuffle, - CORINFO_TYPE_INT, 16); - retNode = - gtNewSimdHWIntrinsicNode(TYP_DOUBLE, vectorOp, NI_Vector128_ToScalar, CORINFO_TYPE_DOUBLE, 16); - break; - - default: - return nullptr; - } + GenTree* op2 = impPopStack().val; + GenTree* op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize)); + retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); break; } diff --git a/src/coreclr/jit/lclmorph.cpp b/src/coreclr/jit/lclmorph.cpp index db3e134b60790..9e49f30bd70bd 100644 --- a/src/coreclr/jit/lclmorph.cpp +++ b/src/coreclr/jit/lclmorph.cpp @@ -917,7 +917,7 @@ class LocalAddressVisitor final : public GenTreeVisitor // a variable into a LCL_FLD but that blocks enregistration so we need to // detect those case where we can use LCL_VAR instead, perhaps in conjuction // with CAST and/or BITCAST. - // Also skip SIMD variables for now, fgMorphFieldAssignToSIMDIntrinsicSet and + // Also skip SIMD variables for now, fgMorphFieldAssignToSimdSetElement and // others need to be updated to recognize LCL_FLDs. return; } @@ -958,7 +958,7 @@ class LocalAddressVisitor final : public GenTreeVisitor if (varTypeIsSIMD(indir->TypeGet())) { // TODO-ADDR: Skip SIMD indirs for now, SIMD typed LCL_FLDs works most of the time - // but there are exceptions - fgMorphFieldAssignToSIMDIntrinsicSet for example. + // but there are exceptions - fgMorphFieldAssignToSimdSetElement for example. // And more importantly, SIMD call args have to be wrapped in OBJ nodes currently. return; } diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 83ce67d2685dd..bc061bf7548f0 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -330,6 +330,8 @@ class Lowering final : public Phase #if defined(TARGET_XARCH) void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); void LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node); + void LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node); + void LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node); #elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index c1845995dc993..6774a4ef1766b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1659,48 +1659,21 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) { switch (simdNode->gtSIMDIntrinsicID) { - GenTree* op1; - GenTree* op2; - case SIMDIntrinsicInit: - op1 = simdNode->AsOp()->gtOp1; + { + GenTree* op1 = simdNode->AsOp()->gtOp1; if (op1->IsIntegralConst(0)) { MakeSrcContained(simdNode, op1); } break; + } case SIMDIntrinsicInitArray: // We have an array and an index, which may be contained. CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2()); break; - case SIMDIntrinsicGetItem: - { - // This implements get_Item method. The sources are: - // - the source SIMD struct - // - index (which element to get) - // The result is simdBaseType of SIMD struct. 
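The signed small-int extract paths above need that extra cast because pextrw/pextrb zero-extend into the scalar register. A minimal C# sketch of the observable difference (illustrative only, not part of the patch):

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    Vector128<short> v = Vector128.Create((short)-1, 2, 3, 4, 5, 6, 7, 8);
    ushort raw = Sse2.Extract(v.AsUInt16(), 0); // pextrw zero-extends: 0xFFFF (65535)
    short elem = v.GetElement(0);               // -1: the JIT inserts a sign-extending cast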
- op1 = simdNode->AsOp()->gtOp1; - op2 = simdNode->AsOp()->gtOp2; - - // If the index is a constant, mark it as contained. - if (op2->IsCnsIntOrI()) - { - MakeSrcContained(simdNode, op2); - } - - if (IsContainableMemoryOp(op1)) - { - MakeSrcContained(simdNode, op1); - if (op1->OperGet() == GT_IND) - { - op1->AsIndir()->Addr()->ClearContained(); - } - } - break; - } - default: break; } @@ -1765,8 +1738,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AdvSimd_InsertScalar: case NI_AdvSimd_LoadAndInsertScalar: case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128: - case NI_Vector64_GetElement: - case NI_Vector128_GetElement: assert(hasImmediateOperand); assert(varTypeIsIntegral(intrin.op2)); if (intrin.op2->IsCnsIntOrI()) @@ -1832,6 +1803,29 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) } break; + case NI_Vector64_GetElement: + case NI_Vector128_GetElement: + { + assert(hasImmediateOperand); + assert(varTypeIsIntegral(intrin.op2)); + + if (intrin.op2->IsCnsIntOrI()) + { + MakeSrcContained(node, intrin.op2); + } + + if (IsContainableMemoryOp(intrin.op1)) + { + MakeSrcContained(node, intrin.op1); + + if (intrin.op1->OperIs(GT_IND)) + { + intrin.op1->AsIndir()->Addr()->ClearContained(); + } + } + break; + } + default: unreached(); } diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 31279fb8b435a..c9b0d6ef7ae89 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -728,26 +728,6 @@ void Lowering::LowerSIMD(GenTreeSIMD* simdNode) } } -#ifdef TARGET_XARCH - if ((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem) && (simdNode->gtGetOp1()->OperGet() == GT_IND)) - { - // If SIMD vector is already in memory, we force its - // addr to be evaluated into a reg. This would allow - // us to generate [regBase] or [regBase+offset] or - // [regBase+sizeOf(SIMD vector simdBaseType)*regIndex] - // to access the required SIMD vector element directly - // from memory. - // - // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we - // might be able update GT_LEA to fold the regIndex - // or offset in some cases. Instead with this - // approach we always evaluate GT_LEA into a reg. - // Ideally, we should be able to lower GetItem intrinsic - // into GT_IND(newAddr) where newAddr combines - // the addr of SIMD vector with the given index. - simdNode->gtOp1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; - } -#endif ContainCheckSIMD(simdNode); } #endif // FEATURE_SIMD @@ -950,6 +930,33 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) return; } + case NI_Vector128_GetElement: + case NI_Vector256_GetElement: + { + LowerHWIntrinsicGetElement(node); + + if ((node->gtHWIntrinsicId == NI_Vector128_GetElement) || + (node->gtHWIntrinsicId == NI_Vector256_GetElement)) + { + // Most NI_Vector*_GetElement intrinsics are lowered to + // alternative nodes, such as the Extract intrinsics, + // which are themselves lowered. + // + // However, certain types may not have a direct equivalent + // in which case we specially handle them directly as GetElement + // and want to do the relevant containment checks. 
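As the new comment above notes, most GetElement calls are rewritten to Extract-family intrinsics during lowering. Where SSE4.1 is available, the managed shape of the rewrite for an upper-half Vector256 element read is roughly the following (a sketch, same usings as the earlier example; not part of the patch):

    Vector256<int> v = Vector256.Create(0, 1, 2, 3, 4, 5, 6, 7);
    Vector128<int> hi = Avx.ExtractVector128(v, 1); // index 5 lives in the upper 128 bits
    int e5 = Sse41.Extract(hi, 1);                  // == v.GetElement(5)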
+ break; + } + return; + } + + case NI_Vector128_WithElement: + case NI_Vector256_WithElement: + { + LowerHWIntrinsicWithElement(node); + return; + } + case NI_Vector128_op_Equality: case NI_Vector256_op_Equality: { @@ -971,6 +978,38 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_SSE41_Extract: + { + if (varTypeIsFloating(node->GetSimdBaseType())) + { + assert(node->GetSimdBaseType() == TYP_FLOAT); + assert(node->gtOp1 != nullptr); + assert(node->gtOp2 != nullptr); + assert(node->GetSimdSize() == 16); + + GenTree* op2 = node->gtGetOp2(); + + if (!op2->OperIsConst()) + { + // Extract allows the full range while GetElement only allows + // 0-3, so we need to mask the index here so codegen works. + + GenTree* msk = comp->gtNewIconNode(3, TYP_INT); + BlockRange().InsertAfter(op2, msk); + + GenTree* tmp = comp->gtNewOperNode(GT_AND, TYP_INT, op2, msk); + BlockRange().InsertAfter(msk, tmp); + LowerNode(tmp); + + node->gtOp2 = tmp; + } + + node->gtHWIntrinsicId = NI_Vector128_GetElement; + LowerNode(node); + } + break; + } + case NI_SSE2_Insert: case NI_SSE41_Insert: case NI_SSE41_X64_Insert: @@ -2538,6 +2577,611 @@ void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) } } +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicGetElement: Lowers a Vector128 or Vector256 GetElement call +// +// Arguments: +// node - The hardware intrinsic node. +// +void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + var_types simdType = node->gtType; + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + assert(!varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(simdBaseType)); + assert(simdSize != 0); + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + assert(op1 != nullptr); + assert(op2 != nullptr); + + if (op1->OperIs(GT_IND)) + { + // If the vector is already in memory, we force its + // addr to be evaluated into a reg. This would allow + // us to generate [regBase] or [regBase + offset] or + // [regBase + sizeOf(simdBaseType) * regIndex] to access + // the required vector element directly from memory. + // + // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we + // might be able to update GT_LEA to fold the regIndex + // or offset in some cases. Instead with this + // approach we always evaluate GT_LEA into a reg. + // Ideally, we should be able to lower GetItem intrinsic + // into GT_IND(newAddr) where newAddr combines + // the addr of the vector with the given index. + op1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG; + } + + if (!op2->OperIsConst()) + { + // We will specially handle GetElement in codegen when op2 isn't a constant + return; + } + + // We should have a bounds check inserted for any index outside the allowed range + // but we need to generate some code anyway, and so we'll simply mask here for simplicity. 
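+ // + // For example: with simdBaseType TYP_FLOAT and simdSize 16, count is 4, so a + // constant index of 7 (which the range check rejects at run time) is masked + // to 7 % 4 == 3 before we pick an extraction sequence.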
+ + ssize_t count = simdSize / genTypeSize(simdBaseType); + ssize_t imm8 = static_cast<ssize_t>(op2->AsIntCon()->IconValue()) % count; + + assert(0 <= imm8 && imm8 < count); + + if (IsContainableMemoryOp(op1)) + { + // We will specially handle GetElement in codegen when op1 is already in memory + op2->AsIntCon()->SetIconValue(imm8); + return; + } + + switch (simdBaseType) + { + // Using software fallback if simdBaseType is not supported by hardware + case TYP_BYTE: + case TYP_UBYTE: + case TYP_INT: + case TYP_UINT: + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41)); + break; + + case TYP_LONG: + case TYP_ULONG: + // We either support TYP_LONG or we have been decomposed into two TYP_INT inserts + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41_X64)); + break; + + case TYP_DOUBLE: + case TYP_FLOAT: + case TYP_SHORT: + case TYP_USHORT: + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + break; + + default: + unreached(); + } + + // Remove the index node up front to simplify downstream logic + BlockRange().Remove(op2); + + // Spare GenTrees to be used for the lowering logic below + // Defined upfront to avoid naming conflicts, etc... + GenTree* idx = nullptr; + GenTree* tmp1 = nullptr; + GenTree* tmp2 = nullptr; + + if (intrinsicId == NI_Vector256_GetElement) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + + if (imm8 >= count / 2) + { + // idx = CNS_INT int 1 + // /--* op1 simd32 + // +--* idx int + // op1 = * HWINTRINSIC simd32 T ExtractVector128 + + // This is roughly the following managed code: + // ... + // op1 = Avx.ExtractVector128(op1, 0x01); + + imm8 -= count / 2; + + idx = comp->gtNewIconNode(1); + BlockRange().InsertBefore(node, idx); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, idx, NI_AVX_ExtractVector128, simdBaseJitType, + simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + } + else + { + // /--* op1 simd32 + // op1 = * HWINTRINSIC simd32 T GetLower + + // This is roughly the following managed code: + // ... 
+ // op1 = op1.GetLower(); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, 16); + BlockRange().InsertBefore(node, tmp1); + LowerNode(tmp1); + } + + op1 = tmp1; + } + + NamedIntrinsic resIntrinsic = NI_Illegal; + + if (imm8 == 0 && (genTypeSize(simdBaseType) >= 4)) + { + switch (simdBaseType) + { + case TYP_LONG: + resIntrinsic = NI_SSE2_X64_ConvertToInt64; + break; + + case TYP_ULONG: + resIntrinsic = NI_SSE2_X64_ConvertToUInt64; + break; + + case TYP_INT: + resIntrinsic = NI_SSE2_ConvertToInt32; + break; + + case TYP_UINT: + resIntrinsic = NI_SSE2_ConvertToUInt32; + break; + + case TYP_FLOAT: + case TYP_DOUBLE: + resIntrinsic = NI_Vector128_ToScalar; + break; + + default: + unreached(); + } + + op2 = nullptr; + } + else + { + op2 = comp->gtNewIconNode(imm8); + BlockRange().InsertBefore(node, op2); + + switch (simdBaseType) + { + case TYP_LONG: + case TYP_ULONG: + { + resIntrinsic = NI_SSE41_X64_Extract; + break; + } + + case TYP_FLOAT: + case TYP_DOUBLE: + { + // We specially handle float and double for more efficient codegen + resIntrinsic = NI_Vector128_GetElement; + break; + } + + case TYP_BYTE: + case TYP_UBYTE: + case TYP_INT: + case TYP_UINT: + { + resIntrinsic = NI_SSE41_Extract; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + resIntrinsic = NI_SSE2_Extract; + break; + } + + default: + unreached(); + } + } + + assert(resIntrinsic != NI_Illegal); + + node->gtHWIntrinsicId = resIntrinsic; + node->gtOp1 = op1; + node->gtOp2 = op2; + node->SetSimdSize(16); + + if (!varTypeIsFloating(simdBaseType)) + { + assert(node->gtHWIntrinsicId != intrinsicId); + LowerNode(node); + } + + if ((simdBaseType == TYP_BYTE) || (simdBaseType == TYP_SHORT)) + { + // The intrinsic zeros the upper bits, so we need an explicit + // cast to ensure the result is properly sign extended + + LIR::Use use; + bool foundUse = BlockRange().TryGetUse(node, &use); + + GenTreeCast* cast = comp->gtNewCastNode(TYP_INT, node, /* isUnsigned */ true, simdBaseType); + BlockRange().InsertAfter(node, cast); + + if (foundUse) + { + use.ReplaceWith(comp, cast); + } + LowerNode(cast); + } +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicWithElement: Lowers a Vector128 or Vector256 WithElement call +// +// Arguments: +// node - The hardware intrinsic node. 
+// +void Lowering::LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + var_types simdType = node->gtType; + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(simdBaseType)); + assert(simdSize != 0); + + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = nullptr; + GenTree* op3 = nullptr; + + assert(op1->OperIsList()); + assert(node->gtGetOp2() == nullptr); + + GenTreeArgList* argList = op1->AsArgList(); + + op1 = argList->Current(); + argList = argList->Rest(); + + op2 = argList->Current(); + argList = argList->Rest(); + + op3 = argList->Current(); + argList = argList->Rest(); + + assert(op1 != nullptr); + assert(op2 != nullptr); + assert(op3 != nullptr); + + assert(op2->OperIsConst()); + assert(argList == nullptr); + + ssize_t imm8 = op2->AsIntCon()->IconValue(); + ssize_t cachedImm8 = imm8; + ssize_t count = simdSize / genTypeSize(simdBaseType); + + assert(0 <= imm8 && imm8 < count); + + switch (simdBaseType) + { + // Using software fallback if simdBaseType is not supported by hardware + case TYP_BYTE: + case TYP_UBYTE: + case TYP_INT: + case TYP_UINT: + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41)); + break; + + case TYP_LONG: + case TYP_ULONG: + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41_X64)); + break; + + case TYP_DOUBLE: + case TYP_FLOAT: + case TYP_SHORT: + case TYP_USHORT: + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + break; + + default: + unreached(); + } + + // Remove the index node up front to simplify downstream logic + BlockRange().Remove(op2); + + // Spare GenTrees to be used for the lowering logic below + // Defined upfront to avoid naming conflicts, etc... + GenTree* idx = nullptr; + GenTree* tmp1 = nullptr; + GenTree* tmp2 = nullptr; + GenTree* tmp3 = nullptr; + GenTree* tmpv = nullptr; + + if (intrinsicId == NI_Vector256_WithElement) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + + // We will be constructing the following parts: + // ... + // /--* op1 simd32 + // * STORE_LCL_VAR simd32 + // tmpv = LCL_VAR simd32 + // op1 = LCL_VAR simd32 + + node->gtOp1 = op1; + LIR::Use op1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(op1Use); + tmpv = node->gtOp1; + + op1 = comp->gtClone(tmpv); + BlockRange().InsertBefore(op3, op1); + + if (imm8 >= count / 2) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 1 + // /--* op1 simd32 + // +--* idx int + // op1 = * HWINTRINSIC simd32 T ExtractVector128 + + // This is roughly the following managed code: + // ... + // op1 = Avx.ExtractVector128(op1, 0x01); + + imm8 -= count / 2; + + idx = comp->gtNewIconNode(1); + BlockRange().InsertAfter(op1, idx); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, idx, NI_AVX_ExtractVector128, simdBaseJitType, + simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + } + else + { + // We will be constructing the following parts: + // ... + // /--* op1 simd32 + // op1 = * HWINTRINSIC simd32 T GetLower + + // This is roughly the following managed code: + // ... 
+ // op1 = op1.GetLower(); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + } + + op1 = tmp1; + } + + NamedIntrinsic resIntrinsic = NI_Illegal; + + idx = comp->gtNewIconNode(imm8); + BlockRange().InsertBefore(node, idx); + + switch (simdBaseType) + { + case TYP_LONG: + case TYP_ULONG: + { + op2 = idx; + resIntrinsic = NI_SSE41_X64_Insert; + break; + } + + case TYP_FLOAT: + { + // We will be constructing the following parts: + // ... + // /--* op3 float + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + + // This is roughly the following managed code: + // ... + // tmp1 = Vector128.CreateScalarUnsafe(op3); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op3, NI_Vector128_CreateScalarUnsafe, CORINFO_TYPE_FLOAT, + 16); + BlockRange().InsertBefore(idx, tmp1); + LowerNode(tmp1); + + if (!comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + if (imm8 == 0) + { + // We will be constructing the following parts: + // ... + // /--* op1 simd16 + // +--* op2 simd16 + // node = * HWINTRINSIC simd16 T MoveScalar + + // This is roughly the following managed code: + // ... + // node = Sse.MoveScalar(op1, op2); + + op2 = tmp1; + resIntrinsic = NI_SSE_MoveScalar; + } + else + { + // We will be constructing the following parts: + // ... + // /--* op1 simd16 + // * STORE_LCL_VAR simd16 + // op2 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // +--* idx int + // op1 = * HWINTRINSIC simd16 T Shuffle + // idx = CNS_INT int 226 + // /--* op1 simd16 + // +--* tmp2 simd16 + // +--* idx int + // op1 = * HWINTRINSIC simd16 T Shuffle + + // This is roughly the following managed code: + // ... + // tmp2 = Sse.Shuffle(tmp1, op1, 0 or 48 or 32); + // node = Sse.Shuffle(tmp2, op1, 226 or 132 or 36); + + node->gtOp1 = op1; + LIR::Use op1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(op1Use); + op2 = node->gtOp1; + + tmp2 = comp->gtClone(op2); + BlockRange().InsertAfter(tmp1, tmp2); + + ssize_t controlBits1; + ssize_t controlBits2; + + switch (imm8) + { + case 1: + { + controlBits1 = 0; + controlBits2 = 226; + break; + } + + case 2: + { + controlBits1 = 48; + controlBits2 = 132; + break; + } + + case 3: + { + controlBits1 = 32; + controlBits2 = 36; + break; + } + + default: + unreached(); + } + + idx = comp->gtNewIconNode(controlBits1); + BlockRange().InsertAfter(tmp2, idx); + + if (imm8 == 1) + { + std::swap(tmp1, tmp2); + } + + op1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp2, tmp1, idx, NI_SSE_Shuffle, + CORINFO_TYPE_FLOAT, 16); + BlockRange().InsertAfter(idx, op1); + LowerNode(op1); + + idx = comp->gtNewIconNode(controlBits2); + BlockRange().InsertAfter(op1, idx); + + op1 = comp->gtNewArgList(op1, op2, idx); + op2 = nullptr; + resIntrinsic = NI_SSE_Shuffle; + } + break; + } + else + { + op3 = tmp1; + idx->AsIntCon()->SetIconValue(imm8 * 16); + FALLTHROUGH; + } + } + + case TYP_BYTE: + case TYP_UBYTE: + case TYP_INT: + case TYP_UINT: + { + op1 = comp->gtNewArgList(op1, op3, idx); + op2 = nullptr; + resIntrinsic = NI_SSE41_Insert; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + op1 = comp->gtNewArgList(op1, op3, idx); + op2 = nullptr; + resIntrinsic = NI_SSE2_Insert; + break; + } + + case TYP_DOUBLE: + { + // We will be constructing the following parts: + // ... 
+ // /--* op3 double + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + + // This is roughly the following managed code: + // ... + // tmp1 = Vector128.CreateScalarUnsafe(op3); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op3, NI_Vector128_CreateScalarUnsafe, CORINFO_TYPE_DOUBLE, + 16); + BlockRange().InsertBefore(idx, tmp1); + LowerNode(tmp1); + + op2 = tmp1; + resIntrinsic = (imm8 == 0) ? NI_SSE2_MoveScalar : NI_SSE2_UnpackLow; + break; + } + + default: + unreached(); + } + + assert(resIntrinsic != NI_Illegal); + + if (tmpv != nullptr) + { + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, resIntrinsic, simdBaseJitType, 16); + BlockRange().InsertBefore(node, tmp1); + LowerNode(tmp1); + + idx = comp->gtNewIconNode((cachedImm8 >= count / 2) ? 1 : 0); + BlockRange().InsertAfter(tmp1, idx); + + op1 = comp->gtNewArgList(tmpv, tmp1, idx); + op2 = nullptr; + resIntrinsic = NI_AVX_InsertVector128; + } + + node->gtHWIntrinsicId = resIntrinsic; + node->gtOp1 = op1; + node->gtOp2 = op2; + + assert(node->gtHWIntrinsicId != intrinsicId); + LowerNode(node); +} + //---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsicDot: Lowers a Vector128 or Vector256 Dot call // @@ -4668,12 +5312,9 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) { switch (simdNode->gtSIMDIntrinsicID) { - GenTree* op1; - GenTree* op2; - case SIMDIntrinsicInit: { - op1 = simdNode->AsOp()->gtOp1; + GenTree* op1 = simdNode->AsOp()->gtOp1; #ifndef TARGET_64BIT if (op1->OperGet() == GT_LONG) { @@ -4712,34 +5353,6 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) CheckImmedAndMakeContained(simdNode, simdNode->gtGetOp2()); break; - case SIMDIntrinsicGetItem: - { - // This implements get_Item method. The sources are: - // - the source SIMD struct - // - index (which element to get) - // The result is simdBaseType of SIMD struct. - op1 = simdNode->AsOp()->gtOp1; - op2 = simdNode->AsOp()->gtOp2; - - if (op1->OperGet() == GT_IND) - { - assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0); - op1->AsIndir()->Addr()->ClearContained(); - } - // If the index is a constant, mark it as contained. - CheckImmedAndMakeContained(simdNode, op2); - - if (IsContainableMemoryOp(op1)) - { - MakeSrcContained(simdNode, op1); - if (op1->OperGet() == GT_IND) - { - op1->AsIndir()->Addr()->ClearContained(); - } - } - } - break; - case SIMDIntrinsicShuffleSSE2: // Second operand is an integer constant and marked as contained. 
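The TYP_DOUBLE arm of LowerHWIntrinsicWithElement above maps the two possible lane indices onto two SSE2 instructions; in managed form (a sketch, same usings as the earlier examples; not part of the patch):

    Vector128<double> v = Vector128.Create(1.0, 2.0);
    Vector128<double> s = Vector128.CreateScalarUnsafe(9.0);
    Vector128<double> r0 = Sse2.MoveScalar(v, s); // v.WithElement(0, 9.0) -> <9.0, 2.0>
    Vector128<double> r1 = Sse2.UnpackLow(v, s);  // v.WithElement(1, 9.0) -> <1.0, 9.0>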
assert(simdNode->AsOp()->gtOp2->IsCnsIntOrI()); @@ -5212,9 +5825,13 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) if ((node->GetSimdSize() == 8) || (node->GetSimdSize() == 12)) { - // TODO-XArch-CQ: Ideally we would key this off of the size containingNode - // expects vs the size node actually is or would be if spilled to the stack - return; + // We want to handle GetElement still for Vector2/3 + if ((intrinsicId != NI_Vector128_GetElement) && (intrinsicId != NI_Vector256_GetElement)) + { + // TODO-XArch-CQ: Ideally we would key this off of the size containingNode + // expects vs the size node actually is or would be if spilled to the stack + return; + } } // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained @@ -5390,8 +6007,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) switch (intrinsicId) { case NI_SSE2_Extract: - case NI_SSE41_Extract: - case NI_SSE41_X64_Extract: case NI_AVX_ExtractVector128: case NI_AVX2_ExtractVector128: { @@ -5444,6 +6059,15 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_SSE41_Extract: + case NI_SSE41_X64_Extract: + { + assert(!varTypeIsFloating(simdBaseType)); + // TODO-XARCH-CQ: These intrinsics are "ins reg/mem, xmm, imm8" and don't + // currently support containment. + break; + } + case NI_AVX_Permute: { // These intrinsics can have op2 be imm or reg/mem @@ -5515,6 +6139,49 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case HW_Category_Helper: + { + // We don't currently have any IMM intrinsics which are also commutative + assert(!isCommutative); + + switch (intrinsicId) + { + case NI_Vector128_GetElement: + case NI_Vector256_GetElement: + { + if (op1->OperIs(GT_IND)) + { + assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0); + op1->AsIndir()->Addr()->ClearContained(); + } + + if (op2->OperIsConst()) + { + MakeSrcContained(node, op2); + } + + if (IsContainableMemoryOp(op1)) + { + MakeSrcContained(node, op1); + + if (op1->OperIs(GT_IND)) + { + op1->AsIndir()->Addr()->ClearContained(); + } + } + break; + } + + default: + { + assert(!"Unhandled containment for helper binary hardware intrinsic"); + break; + } + } + + break; + } + default: { unreached(); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index d7b51ff42bbe4..223dc906badbf 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -809,44 +809,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) // No special handling required. break; - case SIMDIntrinsicGetItem: - { - op1 = simdTree->gtGetOp1(); - op2 = simdTree->gtGetOp2(); - - // We have an object and an index, either of which may be contained. - bool setOp2DelayFree = false; - if (!op2->IsCnsIntOrI() && (!op1->isContained() || op1->OperIsLocal())) - { - // If the index is not a constant and the object is not contained or is a local - // we will need a general purpose register to calculate the address - // internal register must not clobber input index - // TODO-Cleanup: An internal register will never clobber a source; this code actually - // ensures that the index (op2) doesn't interfere with the target. 
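Returning to the pre-SSE4.1 float fallback in LowerHWIntrinsicWithElement: its correctness follows from shufps semantics, where Sse.Shuffle(a, b, imm) yields (a[imm & 3], a[(imm >> 2) & 3], b[(imm >> 4) & 3], b[(imm >> 6) & 3]). Checking the imm8 == 1 pair of control bytes (a sketch, same usings; not part of the patch):

    Vector128<float> vec = Vector128.Create(10f, 11f, 12f, 13f);
    Vector128<float> val = Vector128.CreateScalarUnsafe(99f);
    Vector128<float> t = Sse.Shuffle(val, vec, 0);   // <99, 99, 10, 10>
    Vector128<float> r = Sse.Shuffle(t, vec, 226);   // <10, 99, 12, 13> == vec.WithElement(1, 99f)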
- buildInternalIntRegisterDefForNode(simdTree); - setOp2DelayFree = true; - } - srcCount += BuildOperandUses(op1); - if (!op2->isContained()) - { - RefPosition* op2Use = BuildUse(op2); - if (setOp2DelayFree) - { - setDelayFree(op2Use); - } - srcCount++; - } - - if (!op2->IsCnsIntOrI() && (!op1->isContained())) - { - // If vector is not already in memory (contained) and the index is not a constant, - // we will use the SIMD temp location to store the vector. - compiler->getSIMDInitTempVarNum(); - } - buildUses = false; - } - break; - case SIMDIntrinsicSub: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: @@ -854,10 +816,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) // No special handling required. break; - case SIMDIntrinsicSetX: - case SIMDIntrinsicSetY: - case SIMDIntrinsicSetZ: - case SIMDIntrinsicSetW: case SIMDIntrinsicNarrow: { // Op1 will write to dst before Op2 is free @@ -904,10 +862,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicCopyToArray: case SIMDIntrinsicCopyToArrayX: case SIMDIntrinsicNone: - case SIMDIntrinsicGetX: - case SIMDIntrinsicGetY: - case SIMDIntrinsicGetZ: - case SIMDIntrinsicGetW: case SIMDIntrinsicHWAccel: case SIMDIntrinsicWiden: case SIMDIntrinsicInvalid: @@ -1202,6 +1156,29 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) } } + if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement)) + { + assert(!op2DelayFree); + + if (!intrin.op2->IsCnsIntOrI() && (!intrin.op1->isContained() || intrin.op1->OperIsLocal())) + { + // If the index is not a constant and the object is not contained or is a local + // we will need a general purpose register to calculate the address + // internal register must not clobber input index + // TODO-Cleanup: An internal register will never clobber a source; this code actually + // ensures that the index (op2) doesn't interfere with the target. + buildInternalIntRegisterDefForNode(intrinsicTree); + op2DelayFree = true; + } + + if (!intrin.op2->IsCnsIntOrI() && !intrin.op1->isContained()) + { + // If the index is not a constant or op1 is in register, + // we will use the SIMD temp location to store the vector. + compiler->getSIMDInitTempVarNum(); + } + } + srcCount += op2DelayFree ? BuildDelayFreeUses(intrin.op2) : BuildOperandUses(intrin.op2); if (intrin.op3 != nullptr) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index ad750fad953c8..5c76005fc8682 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1956,96 +1956,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicEqual: break; - case SIMDIntrinsicGetItem: - { - // This implements get_Item method. The sources are: - // - the source SIMD struct - // - index (which element to get) - // The result is baseType of SIMD struct. - // op1 may be a contained memory op, but if so we will consume its address. - // op2 may be a contained constant. - op1 = simdTree->gtGetOp1(); - op2 = simdTree->gtGetOp2(); - - if (!op1->isContained()) - { - // If the index is not a constant, we will use the SIMD temp location to store the vector. - // Otherwise, if the baseType is floating point, the targetReg will be a xmm reg and we - // can use that in the process of extracting the element. - // - // If the index is a constant and base type is a small int we can use pextrw, but on AVX - // we will need a temp if are indexing into the upper half of the AVX register. 
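The getSIMDInitTempVarNum reservations above exist because no SSE or NEON instruction extracts a lane by a variable index: the vector is stored to a stack temp and the element is loaded back with an indexed address. Conceptually (a sketch; not part of the patch):

    using System.Numerics;

    Vector<float> v = new Vector<float>(2f);
    int i = 3; // stands in for a non-constant index
    float[] tmp = new float[Vector<float>.Count];
    v.CopyTo(tmp);    // spill the whole vector to the temp slot
    float x = tmp[i]; // indexed scalar load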
- // In all other cases with constant index, we need a temp xmm register to extract the - // element if index is other than zero. - - if (!op2->IsCnsIntOrI()) - { - (void)compiler->getSIMDInitTempVarNum(); - } - else if (!varTypeIsFloating(simdTree->GetSimdBaseType())) - { - bool needFloatTemp; - if (varTypeIsSmallInt(simdTree->GetSimdBaseType()) && - (compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported)) - { - int byteShiftCnt = (int)op2->AsIntCon()->gtIconVal * genTypeSize(simdTree->GetSimdBaseType()); - needFloatTemp = (byteShiftCnt >= 16); - } - else - { - needFloatTemp = !op2->IsIntegralConst(0); - } - - if (needFloatTemp) - { - buildInternalFloatRegisterDefForNode(simdTree); - } - } -#ifdef TARGET_X86 - // This logic is duplicated from genSIMDIntrinsicGetItem(). - // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to - // generate a movzx/movsx. On x86, these require byteable registers. So figure out which - // cases will require this, so the non-byteable registers can be excluded. - - var_types baseType = simdTree->GetSimdBaseType(); - if (op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType)) - { - bool ZeroOrSignExtnReqd = true; - unsigned baseSize = genTypeSize(baseType); - if (baseSize == 1) - { - if ((op2->AsIntCon()->gtIconVal % 2) == 1) - { - ZeroOrSignExtnReqd = (baseType == TYP_BYTE); - } - } - else - { - assert(baseSize == 2); - ZeroOrSignExtnReqd = (baseType == TYP_SHORT); - } - if (ZeroOrSignExtnReqd) - { - dstCandidates = allByteRegs(); - } - } -#endif // TARGET_X86 - } - } - break; - - case SIMDIntrinsicSetX: - case SIMDIntrinsicSetY: - case SIMDIntrinsicSetZ: - case SIMDIntrinsicSetW: - // We need an internal integer register for SSE2 codegen - if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported) - { - buildInternalIntRegisterDefForNode(simdTree); - } - - break; - case SIMDIntrinsicCast: break; @@ -2122,13 +2032,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) assert(simdTree->gtGetOp2()->isContainedIntOrIImmed()); break; - case SIMDIntrinsicGetX: - case SIMDIntrinsicGetY: - case SIMDIntrinsicGetZ: - case SIMDIntrinsicGetW: - assert(!"Get intrinsics should not be seen during Lowering."); - unreached(); - default: noway_assert(!"Unimplemented SIMD node type."); unreached(); @@ -2281,6 +2184,20 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) break; } + case NI_Vector128_GetElement: + case NI_Vector256_GetElement: + { + assert(numArgs == 2); + + if (!op2->OperIsConst() && !op1->isContained()) + { + // If the index is not a constant or op1 is in register, + // we will use the SIMD temp location to store the vector. + compiler->getSIMDInitTempVarNum(); + } + break; + } + case NI_Vector128_ToVector256: case NI_Vector128_ToVector256Unsafe: case NI_Vector256_GetLower: @@ -2342,12 +2259,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) case NI_SSE41_Extract: { - if (baseType == TYP_FLOAT) - { - buildInternalIntRegisterDefForNode(intrinsicTree); - } + assert(!varTypeIsFloating(baseType)); + #ifdef TARGET_X86 - else if (varTypeIsByte(baseType)) + if (varTypeIsByte(baseType)) { dstCandidates = allByteRegs(); } diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 8dbbbebf72aac..2aac4cb13604e 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -6014,7 +6014,7 @@ GenTree* Compiler::fgMorphField(GenTree* tree, MorphAddrContext* mac) // if this field belongs to simd struct, translate it to simd intrinsic. 
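The morph that follows rewrites SIMD-typed field accesses into element intrinsics. In source-level terms (a sketch; not part of the patch):

    using System.Numerics;

    Vector3 v = new Vector3(1f, 2f, 3f);
    float y = v.Y; // morphs to roughly GetElement(v, 1)
    v.Z = 5f;      // morphs to roughly v = WithElement(v, 2, 5f)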
if (mac == nullptr) { - GenTree* newTree = fgMorphFieldToSIMDIntrinsicGet(tree); + GenTree* newTree = fgMorphFieldToSimdGetElement(tree); if (newTree != tree) { newTree = fgMorphSmpOp(newTree); @@ -12027,29 +12027,33 @@ GenTree* Compiler::getSIMDStructFromField(GenTree* tree, } /***************************************************************************** -* If a read operation tries to access simd struct field, then transform the -* operation to the SIMD intrinsic SIMDIntrinsicGetItem, and return the new tree. -* Otherwise, return the old tree. +* If a read operation tries to access simd struct field, then transform the operation +* to the SimdGetElementNode, and return the new tree. Otherwise, return the old tree. * Argument: * tree - GenTree*. If this pointer points to simd struct which is used for simd -* intrinsic, we will morph it as simd intrinsic SIMDIntrinsicGetItem. +* intrinsic, we will morph it as simd intrinsic NI_Vector128_GetElement. * Return: * A GenTree* which points to the new tree. If the tree is not for simd intrinsic, * return nullptr. */ -GenTree* Compiler::fgMorphFieldToSIMDIntrinsicGet(GenTree* tree) +GenTree* Compiler::fgMorphFieldToSimdGetElement(GenTree* tree) { unsigned index = 0; CorInfoType simdBaseJitType = CORINFO_TYPE_UNDEF; unsigned simdSize = 0; GenTree* simdStructNode = getSIMDStructFromField(tree, &simdBaseJitType, &index, &simdSize); + if (simdStructNode != nullptr) { var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + GenTree* op2 = gtNewIconNode(index, TYP_INT); + + assert(simdSize <= 16); assert(simdSize >= ((index + 1) * genTypeSize(simdBaseType))); - GenTree* op2 = gtNewIconNode(index); - tree = gtNewSIMDNode(simdBaseType, simdStructNode, op2, SIMDIntrinsicGetItem, simdBaseJitType, simdSize); + + tree = gtNewSimdGetElementNode(simdBaseType, simdStructNode, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); #ifdef DEBUG tree->gtDebugFlags |= GTF_DEBUG_NODE_MORPHED; #endif @@ -12058,9 +12062,8 @@ GenTree* Compiler::fgMorphFieldToSIMDIntrinsicGet(GenTree* tree) } /***************************************************************************** -* Transform an assignment of a SIMD struct field to SIMD intrinsic -* SIMDIntrinsicSet*, and return a new tree. If it is not such an assignment, -* then return the old tree. +* Transform an assignment of a SIMD struct field to SimdWithElementNode, and +* return a new tree. If it is not such an assignment, then return the old tree. * Argument: * tree - GenTree*. If this pointer points to simd struct which is used for simd * intrinsic, we will morph it as simd intrinsic set. @@ -12069,46 +12072,32 @@ GenTree* Compiler::fgMorphFieldToSIMDIntrinsicGet(GenTree* tree) * return nullptr. 
*/ -GenTree* Compiler::fgMorphFieldAssignToSIMDIntrinsicSet(GenTree* tree) +GenTree* Compiler::fgMorphFieldAssignToSimdSetElement(GenTree* tree) { assert(tree->OperGet() == GT_ASG); - GenTree* op1 = tree->gtGetOp1(); - GenTree* op2 = tree->gtGetOp2(); unsigned index = 0; CorInfoType simdBaseJitType = CORINFO_TYPE_UNDEF; unsigned simdSize = 0; - GenTree* simdOp1Struct = getSIMDStructFromField(op1, &simdBaseJitType, &index, &simdSize); - if (simdOp1Struct != nullptr) + GenTree* simdStructNode = getSIMDStructFromField(tree->gtGetOp1(), &simdBaseJitType, &index, &simdSize); + + if (simdStructNode != nullptr) { + var_types simdType = simdStructNode->gtType; var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); - // Generate the simd set intrinsic + assert(simdSize <= 16); assert(simdSize >= ((index + 1) * genTypeSize(simdBaseType))); - SIMDIntrinsicID simdIntrinsicID = SIMDIntrinsicInvalid; - switch (index) - { - case 0: - simdIntrinsicID = SIMDIntrinsicSetX; - break; - case 1: - simdIntrinsicID = SIMDIntrinsicSetY; - break; - case 2: - simdIntrinsicID = SIMDIntrinsicSetZ; - break; - case 3: - simdIntrinsicID = SIMDIntrinsicSetW; - break; - default: - noway_assert(!"There is no set intrinsic for index bigger than 3"); - } + GenTree* op2 = gtNewIconNode(index, TYP_INT); + GenTree* op3 = tree->gtGetOp2(); + NamedIntrinsic intrinsicId = NI_Vector128_WithElement; - GenTree* target = gtClone(simdOp1Struct); + GenTree* target = gtClone(simdStructNode); assert(target != nullptr); - var_types simdType = target->gtType; - GenTree* simdTree = gtNewSIMDNode(simdType, simdOp1Struct, op2, simdIntrinsicID, simdBaseJitType, simdSize); + + GenTree* simdTree = gtNewSimdWithElementNode(simdType, simdStructNode, op2, op3, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); tree->AsOp()->gtOp1 = target; tree->AsOp()->gtOp2 = simdTree; @@ -12258,7 +12247,7 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, MorphAddrContext* mac) // We should check whether op2 should be assigned to a SIMD field or not. // If it is, we should tranlate the tree to simd intrinsic. assert(!fgGlobalMorph || ((tree->gtDebugFlags & GTF_DEBUG_NODE_MORPHED) == 0)); - GenTree* newTree = fgMorphFieldAssignToSIMDIntrinsicSet(tree); + GenTree* newTree = fgMorphFieldAssignToSimdSetElement(tree); typ = tree->TypeGet(); op1 = tree->gtGetOp1(); op2 = tree->gtGetOp2(); diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index ecded3a159a61..0cb2947cff0de 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -1167,7 +1167,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in switch (intrinsicId) { case SIMDIntrinsicInit: - case SIMDIntrinsicGetItem: case SIMDIntrinsicSub: case SIMDIntrinsicEqual: case SIMDIntrinsicBitwiseAnd: @@ -1212,7 +1211,8 @@ GenTree* Compiler::impSIMDPopStack(var_types type, bool expectAddr, CORINFO_CLAS // SIMD type struct that it points to. if (expectAddr) { - assert(tree->TypeGet() == TYP_BYREF); + assert(tree->TypeIs(TYP_BYREF, TYP_I_IMPL)); + if (tree->OperGet() == GT_ADDR) { tree = tree->gtGetOp1(); @@ -1285,29 +1285,6 @@ GenTree* Compiler::impSIMDPopStack(var_types type, bool expectAddr, CORINFO_CLAS return tree; } -// impSIMDGetFixed: Create a GT_SIMD tree for a Get property of SIMD vector with a fixed index. -// -// Arguments: -// simdBaseJitType - The base (element) JIT type of the SIMD vector. -// simdSize - The total size in bytes of the SIMD vector. -// index - The index of the field to get. 
-// -// Return Value: -// Returns a GT_SIMD node with the SIMDIntrinsicGetItem intrinsic id. -// -GenTreeSIMD* Compiler::impSIMDGetFixed(var_types simdType, CorInfoType simdBaseJitType, unsigned simdSize, int index) -{ - var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); - assert(simdSize >= ((index + 1) * genTypeSize(simdBaseType))); - - // op1 is a SIMD source. - GenTree* op1 = impSIMDPopStack(simdType, true); - - GenTree* op2 = gtNewIconNode(index); - GenTreeSIMD* simdTree = gtNewSIMDNode(simdBaseType, op1, op2, SIMDIntrinsicGetItem, simdBaseJitType, simdSize); - return simdTree; -} - #ifdef TARGET_XARCH // impSIMDLongRelOpEqual: transforms operands and returns the SIMD intrinsic to be applied on // transformed operands to obtain == comparison result. @@ -2306,11 +2283,13 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, simdTree = op2; if (op3 != nullptr) { - simdTree = gtNewSIMDNode(simdType, simdTree, op3, SIMDIntrinsicSetZ, simdBaseJitType, size); + simdTree = gtNewSimdWithElementNode(simdType, simdTree, gtNewIconNode(2, TYP_INT), op3, simdBaseJitType, + size, /* isSimdAsHWIntrinsic */ true); } if (op4 != nullptr) { - simdTree = gtNewSIMDNode(simdType, simdTree, op4, SIMDIntrinsicSetW, simdBaseJitType, size); + simdTree = gtNewSimdWithElementNode(simdType, simdTree, gtNewIconNode(3, TYP_INT), op4, simdBaseJitType, + size, /* isSimdAsHWIntrinsic */ true); } copyBlkDst = op1; @@ -2343,94 +2322,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, } break; - case SIMDIntrinsicGetItem: - { - // op1 is a SIMD variable that is "this" arg - // op2 is an index of TYP_INT - op2 = impSIMDPopStack(TYP_INT); - op1 = impSIMDPopStack(simdType, instMethod); - int vectorLength = getSIMDVectorLength(size, simdBaseType); - if (!op2->IsCnsIntOrI() || op2->AsIntCon()->gtIconVal >= vectorLength || op2->AsIntCon()->gtIconVal < 0) - { - // We need to bounds-check the length of the vector. - // For that purpose, we need to clone the index expression. - GenTree* index = op2; - if ((index->gtFlags & GTF_SIDE_EFFECT) != 0) - { - op2 = fgInsertCommaFormTemp(&index); - } - else - { - op2 = gtCloneExpr(index); - } - - // For the non-constant case, we don't want to CSE the SIMD value, as we will just need to store - // it to the stack to do the indexing anyway. - op1->gtFlags |= GTF_DONT_CSE; - - GenTree* lengthNode = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, vectorLength); - GenTreeBoundsChk* simdChk = - new (this, GT_SIMD_CHK) GenTreeBoundsChk(GT_SIMD_CHK, TYP_VOID, index, lengthNode, SCK_RNGCHK_FAIL); - - // Create a GT_COMMA tree for the bounds check. 
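The removed importer above preserved the indexer's checked semantics for non-constant indices by wrapping the access in a range check. The guarded form it builds behaves like the following (a sketch; GetIndexSomehow is a hypothetical stand-in for a non-constant index; not part of the patch):

    using System.Numerics;

    Vector<int> v = new Vector<int>(42);
    int i = GetIndexSomehow();
    if ((uint)i >= (uint)Vector<int>.Count)
        throw new IndexOutOfRangeException(); // the SCK_RNGCHK_FAIL path
    int x = v[i];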
- op2 = gtNewOperNode(GT_COMMA, op2->TypeGet(), simdChk, op2); - } - - assert(op1->TypeGet() == simdType); - assert(op2->TypeGet() == TYP_INT); - - simdTree = gtNewSIMDNode(genActualType(callType), op1, op2, simdIntrinsicID, simdBaseJitType, size); - retVal = simdTree; - } - break; - - case SIMDIntrinsicGetW: - retVal = impSIMDGetFixed(simdType, simdBaseJitType, size, 3); - break; - - case SIMDIntrinsicGetZ: - retVal = impSIMDGetFixed(simdType, simdBaseJitType, size, 2); - break; - - case SIMDIntrinsicGetY: - retVal = impSIMDGetFixed(simdType, simdBaseJitType, size, 1); - break; - - case SIMDIntrinsicGetX: - retVal = impSIMDGetFixed(simdType, simdBaseJitType, size, 0); - break; - - case SIMDIntrinsicSetW: - case SIMDIntrinsicSetZ: - case SIMDIntrinsicSetY: - case SIMDIntrinsicSetX: - { - // op2 is the value to be set at indexTemp position - // op1 is SIMD vector that is going to be modified, which is a byref - - // If op1 has a side-effect, then don't make it an intrinsic. - // It would be in-efficient to read the entire vector into xmm reg, - // modify it and write back entire xmm reg. - // - // TODO-CQ: revisit this later. - op1 = impStackTop(1).val; - if ((op1->gtFlags & GTF_SIDE_EFFECT) != 0) - { - return nullptr; - } - - op2 = impSIMDPopStack(simdBaseType); - op1 = impSIMDPopStack(simdType, instMethod); - - GenTree* src = gtCloneExpr(op1); - assert(src != nullptr); - simdTree = gtNewSIMDNode(simdType, src, op2, simdIntrinsicID, simdBaseJitType, size); - - copyBlkDst = gtNewOperNode(GT_ADDR, TYP_BYREF, op1); - doCopyBlk = true; - } - break; - // Unary operators that take and return a Vector. case SIMDIntrinsicCast: case SIMDIntrinsicConvertToSingle: diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index c093fb05f756c..7369172c5c989 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -230,6 +230,12 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic intrinsic, isInstanceMethod = true; argClass = clsHnd; + + if (SimdAsHWIntrinsicInfo::BaseTypeFromThisArg(intrinsic)) + { + assert(simdBaseJitType == CORINFO_TYPE_UNDEF); + simdBaseJitType = getBaseJitTypeAndSizeOfSIMDType(clsHnd, &simdSize); + } } else if ((clsHnd == m_simdHandleCache->SIMDVectorHandle) && (numArgs != 0)) { @@ -435,6 +441,39 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, break; } +#if defined(TARGET_XARCH) + case NI_VectorT256_get_Item: + case NI_VectorT128_get_Item: + { + switch (simdBaseType) + { + // Using software fallback if simdBaseType is not supported by hardware + case TYP_BYTE: + case TYP_UBYTE: + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: + if (!compExactlyDependsOn(InstructionSet_SSE41)) + { + return nullptr; + } + break; + + case TYP_DOUBLE: + case TYP_FLOAT: + case TYP_SHORT: + case TYP_USHORT: + // short/ushort/float/double is supported by SSE2 + break; + + default: + unreached(); + } + break; + } +#endif // TARGET_XARCH + #if defined(TARGET_XARCH) case NI_VectorT128_Dot: { @@ -737,6 +776,13 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, break; } + case NI_VectorT128_get_Item: + case NI_VectorT256_get_Item: + { + return gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_Vector2_op_Division: case NI_Vector3_op_Division: { @@ -1058,6 +1104,12 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, break; } + case NI_VectorT128_get_Item: + { + return 
gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_Max: case NI_VectorT128_Min: { diff --git a/src/coreclr/jit/simdashwintrinsic.h b/src/coreclr/jit/simdashwintrinsic.h index 41dd1c5a53b38..176507d0b6653 100644 --- a/src/coreclr/jit/simdashwintrinsic.h +++ b/src/coreclr/jit/simdashwintrinsic.h @@ -26,6 +26,9 @@ enum class SimdAsHWIntrinsicFlag : unsigned int // Indicates the operands should be swapped in importation. NeedsOperandsSwapped = 0x04, + + // Base type should come from the this argument + BaseTypeFromThisArg = 0x08, }; inline SimdAsHWIntrinsicFlag operator~(SimdAsHWIntrinsicFlag value) @@ -124,6 +127,12 @@ struct SimdAsHWIntrinsicInfo SimdAsHWIntrinsicFlag flags = lookupFlags(id); return (flags & SimdAsHWIntrinsicFlag::NeedsOperandsSwapped) == SimdAsHWIntrinsicFlag::NeedsOperandsSwapped; } + + static bool BaseTypeFromThisArg(NamedIntrinsic id) + { + SimdAsHWIntrinsicFlag flags = lookupFlags(id); + return (flags & SimdAsHWIntrinsicFlag::BaseTypeFromThisArg) == SimdAsHWIntrinsicFlag::BaseTypeFromThisArg; + } }; #endif // _SIMD_AS_HWINTRINSIC_H_ diff --git a/src/coreclr/jit/simdashwintrinsiclistarm64.h b/src/coreclr/jit/simdashwintrinsiclistarm64.h index fc75eca9f3fc3..4eba54135a54b 100644 --- a/src/coreclr/jit/simdashwintrinsiclistarm64.h +++ b/src/coreclr/jit/simdashwintrinsiclistarm64.h @@ -112,6 +112,7 @@ SIMD_AS_HWINTRINSIC_NM(VectorT128, EqualsInstance, "Equals", SIMD_AS_HWINTRINSIC_ID(VectorT128, Floor, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Floor, NI_AdvSimd_Arm64_Floor}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_AllBitsSet, 0, {NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Count, 0, {NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Item, 2, {NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item}, SimdAsHWIntrinsicFlag::InstanceMethod | SimdAsHWIntrinsicFlag::BaseTypeFromThisArg) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_One, 0, {NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Zero, 0, {NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, GreaterThan, 2, {NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, 
NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan, NI_AdvSimd_CompareGreaterThan, NI_AdvSimd_Arm64_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None)
diff --git a/src/coreclr/jit/simdashwintrinsiclistxarch.h b/src/coreclr/jit/simdashwintrinsiclistxarch.h
index 99e5c29ff8a9c..af75fb75fae6d 100644
--- a/src/coreclr/jit/simdashwintrinsiclistxarch.h
+++ b/src/coreclr/jit/simdashwintrinsiclistxarch.h
@@ -112,6 +112,7 @@ SIMD_AS_HWINTRINSIC_NM(VectorT128, EqualsInstance, "Equals",
 SIMD_AS_HWINTRINSIC_ID(VectorT128, Floor, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE41_Floor, NI_SSE41_Floor}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128, get_AllBitsSet, 0, {NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet, NI_Vector128_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Count, 0, {NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count, NI_VectorT128_get_Count}, SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Item, 2, {NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item, NI_VectorT128_get_Item}, SimdAsHWIntrinsicFlag::InstanceMethod | SimdAsHWIntrinsicFlag::BaseTypeFromThisArg)
 SIMD_AS_HWINTRINSIC_ID(VectorT128, get_One, 0, {NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One, NI_VectorT128_get_One}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128, get_Zero, 0, {NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero, NI_Vector128_get_Zero}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128, GreaterThan, 2, {NI_SSE2_CompareGreaterThan, NI_VectorT128_GreaterThan, NI_SSE2_CompareGreaterThan, NI_VectorT128_GreaterThan, NI_SSE2_CompareGreaterThan, NI_VectorT128_GreaterThan, NI_VectorT128_GreaterThan, NI_VectorT128_GreaterThan, NI_SSE_CompareGreaterThan, NI_SSE2_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None)
@@ -149,6 +150,7 @@ SIMD_AS_HWINTRINSIC_NM(VectorT256, EqualsInstance, "Equals",
 SIMD_AS_HWINTRINSIC_ID(VectorT256, Floor, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AVX_Floor, NI_AVX_Floor}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256, get_AllBitsSet, 0, {NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet, NI_Vector256_get_AllBitsSet}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Count, 0, {NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count, NI_VectorT256_get_Count}, SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Item, 2, {NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item, NI_VectorT256_get_Item}, SimdAsHWIntrinsicFlag::InstanceMethod | SimdAsHWIntrinsicFlag::BaseTypeFromThisArg)
 SIMD_AS_HWINTRINSIC_ID(VectorT256, get_One, 0, {NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One, NI_VectorT256_get_One}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256, get_Zero, 0, {NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero, NI_Vector256_get_Zero}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT256, GreaterThan, 2, {NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX2_CompareGreaterThan, NI_VectorT256_GreaterThan, NI_AVX_CompareGreaterThan, NI_AVX_CompareGreaterThan}, SimdAsHWIntrinsicFlag::None)
diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp
index 4b0b6cfd0b94e..4523fe48a896e 100644
--- a/src/coreclr/jit/simdcodegenxarch.cpp
+++ b/src/coreclr/jit/simdcodegenxarch.cpp
@@ -1563,389 +1563,6 @@ void CodeGen::genSIMDIntrinsicRelOp(GenTreeSIMD* simdNode)
 genProduceReg(simdNode);
 }
 
-//------------------------------------------------------------------------------------
-// genSIMDIntrinsicGetItem: Generate code for SIMD Intrinsic get element at index i.
-//
-// Arguments:
-// simdNode - The GT_SIMD node
-//
-// Return Value:
-// None.
-//
-void CodeGen::genSIMDIntrinsicGetItem(GenTreeSIMD* simdNode)
-{
- assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
-
- GenTree* op1 = simdNode->gtGetOp1();
- GenTree* op2 = simdNode->gtGetOp2();
- var_types simdType = op1->TypeGet();
- assert(varTypeIsSIMD(simdType));
-
- // op1 of TYP_SIMD12 should be considered as TYP_SIMD16,
- // since it is in XMM register.
- if (simdType == TYP_SIMD12)
- {
- simdType = TYP_SIMD16;
- }
-
- var_types baseType = simdNode->GetSimdBaseType();
- regNumber targetReg = simdNode->GetRegNum();
- assert(targetReg != REG_NA);
- var_types targetType = simdNode->TypeGet();
- assert(targetType == genActualType(baseType));
-
- // GetItem has 2 operands:
- // - the source of SIMD type (op1)
- // - the index of the value to be returned.
- genConsumeOperands(simdNode);
- regNumber srcReg = op1->GetRegNum();
-
- // Optimize the case of op1 is in memory and trying to access ith element.
- if (!op1->isUsedFromReg())
- {
- assert(op1->isContained());
-
- regNumber baseReg;
- regNumber indexReg;
- int offset = 0;
-
- if (op1->OperIsLocal())
- {
- // There are three parts to the total offset here:
- // {offset of local} + {offset of SIMD Vector field (lclFld only)} + {offset of element within SIMD vector}.
- bool isEBPbased;
- unsigned varNum = op1->AsLclVarCommon()->GetLclNum();
- offset += compiler->lvaFrameAddress(varNum, &isEBPbased);
-
-#if !FEATURE_FIXED_OUT_ARGS
- if (!isEBPbased)
- {
- // Adjust the offset by the amount currently pushed on the CPU stack
- offset += genStackLevel;
- }
-#else
- assert(genStackLevel == 0);
-#endif // !FEATURE_FIXED_OUT_ARGS
-
- if (op1->OperGet() == GT_LCL_FLD)
- {
- offset += op1->AsLclFld()->GetLclOffs();
- }
- baseReg = (isEBPbased) ? REG_EBP : REG_ESP;
- }
- else
- {
- // Require GT_IND addr to be not contained.
- assert(op1->OperGet() == GT_IND);
-
- GenTree* addr = op1->AsIndir()->Addr();
- assert(!addr->isContained());
- baseReg = addr->GetRegNum();
- }
-
- if (op2->isContainedIntOrIImmed())
- {
- indexReg = REG_NA;
- offset += (int)op2->AsIntConCommon()->IconValue() * genTypeSize(baseType);
- }
- else
- {
- indexReg = op2->GetRegNum();
- assert(genIsValidIntReg(indexReg));
- }
-
- // Now, load the desired element.
- GetEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
- emitTypeSize(baseType), // Of the vector baseType
- targetReg, // To targetReg
- baseReg, // Base Reg
- indexReg, // Indexed
- genTypeSize(baseType), // by the size of the baseType
- offset);
- genProduceReg(simdNode);
- return;
- }
-
- // SSE2 doesn't have an instruction to implement this intrinsic if the index is not a constant.
- // For the non-constant case, we will use the SIMD temp location to store the vector, and
- // the load the desired element.
- // The range check will already have been performed, so at this point we know we have an index
- // within the bounds of the vector.
- if (!op2->IsCnsIntOrI())
- {
- unsigned simdInitTempVarNum = compiler->lvaSIMDInitTempVarNum;
- noway_assert(simdInitTempVarNum != BAD_VAR_NUM);
- bool isEBPbased;
- unsigned offs = compiler->lvaFrameAddress(simdInitTempVarNum, &isEBPbased);
-
-#if !FEATURE_FIXED_OUT_ARGS
- if (!isEBPbased)
- {
- // Adjust the offset by the amount currently pushed on the CPU stack
- offs += genStackLevel;
- }
-#else
- assert(genStackLevel == 0);
-#endif // !FEATURE_FIXED_OUT_ARGS
-
- regNumber indexReg = op2->GetRegNum();
-
- // Store the vector to the temp location.
- GetEmitter()->emitIns_S_R(ins_Store(simdType, compiler->isSIMDTypeLocalAligned(simdInitTempVarNum)),
- emitTypeSize(simdType), srcReg, simdInitTempVarNum, 0);
-
- // Now, load the desired element.
- GetEmitter()->emitIns_R_ARX(ins_Move_Extend(baseType, false), // Load
- emitTypeSize(baseType), // Of the vector baseType
- targetReg, // To targetReg
- (isEBPbased) ? REG_EBP : REG_ESP, // Stack-based
- indexReg, // Indexed
- genTypeSize(baseType), // by the size of the baseType
- offs);
- genProduceReg(simdNode);
- return;
- }
-
- noway_assert(op2->isContained());
- noway_assert(op2->IsCnsIntOrI());
- unsigned int index = (unsigned int)op2->AsIntCon()->gtIconVal;
- unsigned int byteShiftCnt = index * genTypeSize(baseType);
-
- // In general we shouldn't have an index greater than or equal to the length of the vector.
- // However, if we have an out-of-range access, under minOpts it will not be optimized
- // away. The code will throw before we reach this point, but we still need to generate
- // code. In that case, we will simply mask off the upper bits.
- if (byteShiftCnt >= compiler->getSIMDVectorRegisterByteLength())
- {
- byteShiftCnt &= (compiler->getSIMDVectorRegisterByteLength() - 1);
- index = byteShiftCnt / genTypeSize(baseType);
- }
-
- regNumber tmpReg = REG_NA;
- if (simdNode->AvailableTempRegCount() != 0)
- {
- tmpReg = simdNode->GetSingleTempReg();
- }
- else
- {
- assert((byteShiftCnt == 0) || varTypeIsFloating(baseType) ||
- (varTypeIsSmallInt(baseType) && (byteShiftCnt < 16)));
- }
-
- if (byteShiftCnt >= 16)
- {
- assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
- byteShiftCnt -= 16;
- regNumber newSrcReg;
- if (varTypeIsFloating(baseType))
- {
- newSrcReg = targetReg;
- }
- else
- {
- // Integer types
- assert(tmpReg != REG_NA);
- newSrcReg = tmpReg;
- }
- GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, newSrcReg, srcReg, 0x01);
-
- srcReg = newSrcReg;
- }
-
- // Generate the following sequence:
- // 1) baseType is floating point
- // movaps targetReg, srcReg
- // psrldq targetReg, byteShiftCnt <-- not generated if accessing zero'th element
- //
- // 2) baseType is not floating point
- // movaps tmpReg, srcReg <-- not generated if accessing zero'th element
- // OR if tmpReg == srcReg
- // psrldq tmpReg, byteShiftCnt <-- not generated if accessing zero'th element
- // mov_xmm2i targetReg, tmpReg
- if (varTypeIsFloating(baseType))
- {
- inst_Mov(simdType, targetReg, srcReg, /* canSkip */ true);
-
- if (byteShiftCnt != 0)
- {
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
- assert((byteShiftCnt > 0) && (byteShiftCnt < 32));
- GetEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), targetReg, byteShiftCnt);
- }
- }
- else
- {
- if (varTypeIsSmallInt(baseType))
- {
- // Note that pextrw extracts 16-bit value by index and zero extends it to 32-bits.
- // In case of vector<short> we also need to sign extend the 16-bit value in targetReg
- // Vector<byte> - index/2 will give the index of the 16-bit value to extract. Shift right
- // by 8-bits if index is odd. In case of Vector<sbyte> also sign extend targetReg.
-
- unsigned baseSize = genTypeSize(baseType);
- if (baseSize == 1)
- {
- index /= 2;
- }
- // We actually want index % 8 for the AVX case (for SSE it will never be > 8).
- // Note that this doesn't matter functionally, because the instruction uses just the
- // low 3 bits of index, but it's better to use the right value.
- if (index > 8)
- {
- assert(compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported);
- index -= 8;
- }
-
- assert((index >= 0) && (index <= 8));
- GetEmitter()->emitIns_R_R_I(INS_pextrw, emitTypeSize(TYP_INT), targetReg, srcReg, index);
-
- bool ZeroOrSignExtnReqd = true;
- if (baseSize == 1)
- {
- if ((op2->AsIntCon()->gtIconVal % 2) == 1)
- {
- // Right shift extracted word by 8-bits if index is odd if we are extracting a byte sized element.
- inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, targetReg, 8);
-
- // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_BYTE
- ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
- }
- // else - we just need to zero/sign extend the byte since pextrw extracted 16-bits
- }
- else
- {
- // Since Pextrw zero extends to 32-bits, we need sign extension in case of TYP_SHORT
- assert(baseSize == 2);
- ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
- }
-
- if (ZeroOrSignExtnReqd)
- {
- // Zero/sign extend the byte/short to 32-bits
- inst_Mov_Extend(baseType, /* srcInReg */ false, targetReg, targetReg, /* canSkip */ false,
- emitTypeSize(baseType));
- }
- }
- else
- {
- // We need a temp xmm register if the baseType is not floating point and
- // accessing non-zero'th element.
- if (byteShiftCnt != 0)
- {
- assert(tmpReg != REG_NA);
-
- inst_Mov(simdType, tmpReg, srcReg, /* canSkip */ true);
-
- assert((byteShiftCnt > 0) && (byteShiftCnt <= 32));
- instruction ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16);
- GetEmitter()->emitIns_R_I(ins, emitActualTypeSize(simdType), tmpReg, byteShiftCnt);
- }
- else
- {
- tmpReg = srcReg;
- }
-
- assert(tmpReg != REG_NA);
- inst_Mov(baseType, targetReg, tmpReg, /* canSkip */ false);
- }
- }
-
- genProduceReg(simdNode);
-}
-
-//------------------------------------------------------------------------------------
-// genSIMDIntrinsicSetItem: Generate code for SIMD Intrinsic set element at index i.
-//
-// Arguments:
-// simdNode - The GT_SIMD node
-//
-// Return Value:
-// None.
-//
-// TODO-CQ: Use SIMDIntrinsicShuffleSSE2 for the SSE2 case.
-//
-void CodeGen::genSIMDIntrinsicSetItem(GenTreeSIMD* simdNode)
-{
- // Determine index based on intrinsic ID
- int index = -1;
- switch (simdNode->gtSIMDIntrinsicID)
- {
- case SIMDIntrinsicSetX:
- index = 0;
- break;
- case SIMDIntrinsicSetY:
- index = 1;
- break;
- case SIMDIntrinsicSetZ:
- index = 2;
- break;
- case SIMDIntrinsicSetW:
- index = 3;
- break;
-
- default:
- unreached();
- }
- assert(index != -1);
-
- // op1 is the SIMD vector
- // op2 is the value to be set
- GenTree* op1 = simdNode->gtGetOp1();
- GenTree* op2 = simdNode->gtGetOp2();
-
- var_types baseType = simdNode->GetSimdBaseType();
- regNumber targetReg = simdNode->GetRegNum();
- assert(targetReg != REG_NA);
- var_types targetType = simdNode->TypeGet();
- assert(varTypeIsSIMD(targetType));
-
- // the following assert must hold.
- // supported only on vector2f/3f/4f right now
- noway_assert(baseType == TYP_FLOAT);
- assert(op2->TypeGet() == baseType);
- assert(simdNode->GetSimdSize() >= ((index + 1) * genTypeSize(baseType)));
-
- genConsumeOperands(simdNode);
- regNumber op1Reg = op1->GetRegNum();
- regNumber op2Reg = op2->GetRegNum();
-
- // TODO-CQ: For AVX we don't need to do a copy because it supports 3 operands plus immediate.
- inst_Mov(targetType, targetReg, op1Reg, /* canSkip */ true);
-
- // Right now this intrinsic is supported only for float base type vectors.
- // If in future need to support on other base type vectors, the below
- // logic needs modification.
- noway_assert(baseType == TYP_FLOAT);
-
- if (compiler->getSIMDSupportLevel() == SIMD_SSE2_Supported)
- {
- // We need one additional int register as scratch
- regNumber tmpReg = simdNode->GetSingleTempReg();
- assert(genIsValidIntReg(tmpReg));
-
- // Move the value from xmm reg to an int reg
- inst_Mov(TYP_INT, tmpReg, op2Reg, /* canSkip */ false, emitActualTypeSize(baseType));
-
- assert((index >= 0) && (index <= 15));
-
- // First insert the lower 16-bits of tmpReg in targetReg at 2*index position
- // since every float has two 16-bit words.
- GetEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index);
-
- // Logical right shift tmpReg by 16-bits and insert in targetReg at 2*index + 1 position
- inst_RV_SH(INS_SHIFT_RIGHT_LOGICAL, EA_4BYTE, tmpReg, 16);
- GetEmitter()->emitIns_R_R_I(INS_pinsrw, emitTypeSize(TYP_INT), targetReg, tmpReg, 2 * index + 1);
- }
- else
- {
- unsigned int insertpsImm = (INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index));
- assert((insertpsImm >= 0) && (insertpsImm <= 255));
- inst_RV_RV_IV(INS_insertps, EA_16BYTE, targetReg, op2Reg, (int8_t)insertpsImm);
- }
-
- genProduceReg(simdNode);
-}
-
 //------------------------------------------------------------------------
 // genSIMDIntrinsicShuffleSSE2: Generate code for SIMD Intrinsic shuffle.
 //
@@ -2357,21 +1974,10 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode)
 genSIMDIntrinsicRelOp(simdNode);
 break;
 
- case SIMDIntrinsicGetItem:
- genSIMDIntrinsicGetItem(simdNode);
- break;
-
 case SIMDIntrinsicShuffleSSE2:
 genSIMDIntrinsicShuffleSSE2(simdNode);
 break;
 
- case SIMDIntrinsicSetX:
- case SIMDIntrinsicSetY:
- case SIMDIntrinsicSetZ:
- case SIMDIntrinsicSetW:
- genSIMDIntrinsicSetItem(simdNode);
- break;
-
 case SIMDIntrinsicUpperSave:
 genSIMDIntrinsicUpperSave(simdNode);
 break;
diff --git a/src/coreclr/jit/simdintrinsiclist.h b/src/coreclr/jit/simdintrinsiclist.h
index fb806804a569a..258fecfdd6578 100644
--- a/src/coreclr/jit/simdintrinsiclist.h
+++ b/src/coreclr/jit/simdintrinsiclist.h
@@ -57,19 +57,6 @@ SIMD_INTRINSIC(".ctor", true, InitFixed,
 SIMD_INTRINSIC("CopyTo", true, CopyToArray, "CopyToArray", TYP_VOID, 2, {TYP_BYREF, TYP_REF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
 SIMD_INTRINSIC("CopyTo", true, CopyToArrayX, "CopyToArray", TYP_VOID, 3, {TYP_BYREF, TYP_REF, TYP_INT }, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
 
-// Get operations
-SIMD_INTRINSIC("get_Item", true, GetItem, "get[i]", TYP_UNKNOWN, 2, {TYP_BYREF, TYP_INT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
-SIMD_INTRINSIC("get_X", true, GetX, "getX", TYP_UNKNOWN, 1, {TYP_BYREF, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("get_Y", true, GetY, "getY", TYP_UNKNOWN, 1, {TYP_BYREF, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("get_Z", true, GetZ, "getZ", TYP_UNKNOWN, 1, {TYP_BYREF, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("get_W", true, GetW, "getW", TYP_UNKNOWN, 1, {TYP_BYREF, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-
-// Set operations
-SIMD_INTRINSIC("set_X", true, SetX, "setX", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("set_Y", true, SetY, "setY", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("set_Z", true, SetZ, "setZ", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-SIMD_INTRINSIC("set_W", true, SetW, "setW", TYP_VOID, 2, {TYP_BYREF, TYP_UNKNOWN, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF})
-
 // Arithmetic Operations
 SIMD_INTRINSIC("op_Subtraction", false, Sub, "-", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_ULONG})
diff --git a/src/libraries/System.Numerics.Vectors/tests/GenericVectorTests.cs b/src/libraries/System.Numerics.Vectors/tests/GenericVectorTests.cs
index 59763ef9d9004..6124cd6294e74 100644
--- a/src/libraries/System.Numerics.Vectors/tests/GenericVectorTests.cs
+++ b/src/libraries/System.Numerics.Vectors/tests/GenericVectorTests.cs
@@ -487,7 +487,7 @@ private void TestConstructorWithUnsupportedTypes<T>() where T : struct
 private void TestIndexerOutOfRange<T>() where T : struct
 {
 Vector<T> vector = Vector<T>.One;
- Assert.Throws<IndexOutOfRangeException>(() =>
+ Assert.Throws<ArgumentOutOfRangeException>(() =>
 {
 T value = vector[Vector<T>.Count];
 });
diff --git a/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector_1.cs b/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector_1.cs
index d13cbf8e402de..3bd503ac8f4b2 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector_1.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector_1.cs
@@ -248,7 +248,7 @@ public readonly unsafe T this[int index]
 if ((uint)index >= (uint)Count)
 {
- throw new IndexOutOfRangeException(SR.Format(SR.Arg_ArgumentOutOfRangeException, index));
+ ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index);
 }
 return GetElement(index);
diff --git a/src/tests/JIT/SIMD/VectorGet.cs b/src/tests/JIT/SIMD/VectorGet.cs
index e89862001ec00..d46e98c6533ac 100644
--- a/src/tests/JIT/SIMD/VectorGet.cs
+++ b/src/tests/JIT/SIMD/VectorGet.cs
@@ -128,7 +128,7 @@ public static int VectorGetIndexerOutOfRange<T>(T value, int index)
 case 32: check = A[32]; break;
 }
 }
- catch (IndexOutOfRangeException)
+ catch (ArgumentOutOfRangeException)
 {
 caught = true;
 }
@@ -138,7 +138,7 @@ public static int VectorGetIndexerOutOfRange<T>(T value, int index)
 if (!caught)
 {
- Console.WriteLine("Failed to throw IndexOutOfRangeException for index == Count of " + Vector<T>.Count);
+ Console.WriteLine("Failed to throw ArgumentOutOfRangeException for index == Count of " + Vector<T>.Count);
 returnVal = Fail;
 }
@@ -148,7 +148,7 @@ public static int VectorGetIndexerOutOfRange<T>(T value, int index)
 {
 check = A[-1];
 }
- catch (IndexOutOfRangeException)
+ catch (ArgumentOutOfRangeException)
 {
 caught = true;
 }
@@ -158,7 +158,7 @@ public static int VectorGetIndexerOutOfRange<T>(T value, int index)
 if (!caught)
 {
- Console.WriteLine("Failed to throw IndexOutOfRangeException for index == -1");
+ Console.WriteLine("Failed to throw ArgumentOutOfRangeException for index == -1");
 returnVal = Fail;
 }
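
The variable-index path removed above has no single-instruction SSE2 equivalent, which is why genSIMDIntrinsicGetItem spilled the vector to the reserved SIMD temp slot and performed an indexed scalar load. A minimal standalone sketch of that strategy with SSE2 intrinsics (the helper name and the local array standing in for lvaSIMDInitTempVarNum are assumptions for illustration, not JIT code):

```cpp
// Variable-index extraction: store the vector, then load element [index].
// Assumes the index was already range-checked, as the JIT code does.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int32_t GetElementVarI32(__m128i v, int index)
{
    alignas(16) int32_t temp[4];        // stand-in for the SIMD init temp slot
    _mm_store_si128((__m128i*)temp, v); // store the vector to the temp location
    return temp[index];                 // indexed load of the desired element
}

int main()
{
    __m128i v = _mm_setr_epi32(10, 20, 30, 40);
    printf("%d\n", GetElementVarI32(v, 2)); // prints 30
}
```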
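For a constant index and a floating-point base type, the removed code emitted movaps plus psrldq to shift the desired lane down to position 0, masking byteShiftCnt to the register length under minOpts when the constant was out of range. A sketch of the same sequence, assuming SSE2 and a compile-time index in place of the contained GT_CNS_INT:

```cpp
// Constant-index float extraction: psrldq by index*4 bytes, then read lane 0.
#include <immintrin.h>
#include <cstdio>

template <int Index>
float GetElementF32(__m128 v)
{
    constexpr int byteShiftCnt = Index * 4; // index * sizeof(float)
    __m128i shifted = _mm_srli_si128(_mm_castps_si128(v), byteShiftCnt); // psrldq
    return _mm_cvtss_f32(_mm_castsi128_ps(shifted)); // scalar read of lane 0
}

int main()
{
    __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    printf("%f\n", GetElementF32<2>(v)); // prints 3.000000
}
```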
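When byteShiftCnt was 16 or more (a Vector&lt;T&gt; lane in the upper ymm half), the removed code first pulled the high 128 bits down with vextractf128 and then reused the 128-bit sequence. A sketch under the same assumption (AVX2 available; compile with -mavx2 -std=c++17; helper name illustrative):

```cpp
// 256-bit extraction: vextractf128 for lanes 4..7, then the 128-bit path.
#include <immintrin.h>
#include <cstdio>

template <int Index>
float GetElementF32x8(__m256 v)
{
    __m128 half;
    if constexpr (Index >= 4)
        half = _mm256_extractf128_ps(v, 1); // vextractf128: bring high half down
    else
        half = _mm256_castps256_ps128(v);   // low half needs no instruction
    __m128i shifted = _mm_srli_si128(_mm_castps_si128(half), (Index & 3) * 4);
    return _mm_cvtss_f32(_mm_castsi128_ps(shifted));
}

int main()
{
    __m256 v = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
    printf("%f\n", GetElementF32x8<6>(v)); // prints 7.000000
}
```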
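The small-integer path leaned on the fact that pextrw always extracts a zero-extended 16-bit word: a byte element at an odd index needs an extra logical right shift by 8, and signed base types then need an explicit sign extension, which is what the removed ZeroOrSignExtnReqd logic decided. A hedged sketch of that logic (SSE2, C++17, compile-time index, illustrative helper name):

```cpp
// Byte extraction via pextrw: pull the containing word, shift if odd, sign-extend.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

template <int Index>
int8_t GetElementI8(__m128i v)
{
    int word = _mm_extract_epi16(v, Index / 2); // pextrw: zero-extended 16-bit word
    if constexpr ((Index % 2) == 1)
        word >>= 8;                             // odd byte lives in the high half
    return (int8_t)word;                        // sign-extend, as for TYP_BYTE
}

int main()
{
    __m128i v = _mm_setr_epi8(0, -1, 2, -3, 4, -5, 6, -7,
                              8, -9, 10, -11, 12, -13, 14, -15);
    printf("%d\n", GetElementI8<3>(v)); // prints -3
}
```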
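The removed genSIMDIntrinsicSetItem likewise had two float strategies: a single insertps when SSE4.1 was available, or, on bare SSE2, routing the float bits through an integer register and planting them as two 16-bit words with pinsrw, since every float is two such words. Both can be sketched as below (helper names and the memcpy bit-cast are assumptions; the insertps immediate mirrors INSERTPS_SOURCE_SELECT(0) | INSERTPS_TARGET_SELECT(index)):

```cpp
// Two ways to set float lane [Index], mirroring the SSE4.1 and SSE2 paths.
#include <immintrin.h>
#include <cstdio>
#include <cstring>

template <int Index>
__m128 SetElementSse41(__m128 v, float value) // needs -msse4.1 if instantiated
{
    // insertps: source select = 0, target select = Index (imm8 bits 5:4).
    return _mm_insert_ps(v, _mm_set_ss(value), Index << 4);
}

template <int Index>
__m128 SetElementSse2(__m128 v, float value)
{
    int bits;
    memcpy(&bits, &value, sizeof(bits)); // move the float bits to an int
    __m128i iv = _mm_castps_si128(v);
    iv = _mm_insert_epi16(iv, bits & 0xFFFF, 2 * Index);             // low word
    iv = _mm_insert_epi16(iv, (bits >> 16) & 0xFFFF, 2 * Index + 1); // high word
    return _mm_castsi128_ps(iv);
}

int main()
{
    alignas(16) float out[4];
    _mm_store_ps(out, SetElementSse2<2>(_mm_setr_ps(1, 2, 3, 4), 9.5f));
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 9.5 4
}
```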