diff --git a/src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp b/src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp index 1591f347cbe90..24bc4415e586e 100644 --- a/src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp +++ b/src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp @@ -1578,6 +1578,18 @@ bool MyICJI::runWithErrorTrap(void (*function)(void*), void* param) return RunWithErrorTrap(function, param); } +// Ideally we'd just use the copies of this in standardmacros.h; +// however, superpmi is missing various other dependencies as well. +static size_t ALIGN_UP_SPMI(size_t val, size_t alignment) +{ + return (val + (alignment - 1)) & ~(alignment - 1); +} + +static void* ALIGN_UP_SPMI(void* val, size_t alignment) +{ + return (void*)ALIGN_UP_SPMI((size_t)val, alignment); +} + // get a block of memory for the code, readonly data, and read-write data void MyICJI::allocMem(ULONG hotCodeSize, /* IN */ ULONG coldCodeSize, /* IN */ @@ -1590,13 +1602,46 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */ ) { jitInstance->mc->cr->AddCall("allocMem"); - // TODO-Cleanup: investigate if we need to check roDataBlock as well. Could hot block size be ever 0? + + // TODO-Cleanup: Could the hot block size ever be 0? *hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize); + if (coldCodeSize > 0) *coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize); else *coldCodeBlock = nullptr; - *roDataBlock = jitInstance->mc->cr->allocateMemory(roDataSize); + + if (roDataSize > 0) + { + size_t roDataAlignment = sizeof(void*); + size_t roDataAlignedSize = static_cast<size_t>(roDataSize); + + if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN) != 0) + { + roDataAlignment = 32; + } + else if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN) != 0) + { + roDataAlignment = 16; + } + else if (roDataSize >= 8) + { + roDataAlignment = 8; + } + + // We need to round the roDataSize up to the alignment size and then + // overallocate by at most alignment - sizeof(void*) to ensure that + // we can offset roDataBlock to be an aligned address and that the + // allocation contains at least the originally requested size after alignment. + + roDataAlignedSize = ALIGN_UP_SPMI(roDataAlignedSize, roDataAlignment); + roDataAlignedSize = roDataAlignedSize + (roDataAlignment - sizeof(void*)); + *roDataBlock = jitInstance->mc->cr->allocateMemory(roDataAlignedSize); + *roDataBlock = ALIGN_UP_SPMI(*roDataBlock, roDataAlignment); + } + else + *roDataBlock = nullptr; + jitInstance->mc->cr->recAllocMem(hotCodeSize, coldCodeSize, roDataSize, xcptnsCount, flag, hotCodeBlock, coldCodeBlock, roDataBlock); } diff --git a/src/coreclr/src/jit/emit.cpp b/src/coreclr/src/jit/emit.cpp index f0fd875006c97..8b9c2c7e21fe8 100644 --- a/src/coreclr/src/jit/emit.cpp +++ b/src/coreclr/src/jit/emit.cpp @@ -5339,10 +5339,11 @@ UNATIVE_OFFSET emitter::emitDataGenBeg(UNATIVE_OFFSET size, bool align) { // Data can have any size but since alignment is deduced from the size there's no // way to have a larger data size (e.g. 128) and request 4/8/16 byte alignment. - // 32 bytes (and more) alignment requires VM support (see ICorJitInfo::allocMem). - assert(size <= 16); + // As such, we restrict data above 16 bytes to be a multiple of 16 and assume 16-byte + // alignment. Alignment greater than 16 requires VM support (see ICorJitInfo::allocMem).
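// [Editorial aside -- a minimal standalone sketch of the rounding math used by ALIGN_UP_SPMI above;
// the numbers are illustrative and not taken from the patch.] Requesting roDataSize = 40 with the
// 32-byte flag gives ALIGN_UP_SPMI(40, 32) == 64, so 64 + (32 - sizeof(void*)) bytes are allocated;
// rounding the returned pointer up to the next 32-byte boundary then consumes at most
// (32 - sizeof(void*)) bytes of slack (assuming the allocation is at least pointer-aligned),
// leaving at least the requested 40 usable bytes at an aligned address.
static size_t AlignUpSketch(size_t val, size_t alignment) // same formula as ALIGN_UP_SPMI
{
    return (val + (alignment - 1)) & ~(alignment - 1); // e.g. AlignUpSketch(40, 32) == 64
}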
+ assert((size <= 16) || ((size % 16) == 0)); - if (size == 16) + if (size >= 16) { emitConsDsc.align16 = true; } diff --git a/src/coreclr/src/jit/emitxarch.cpp b/src/coreclr/src/jit/emitxarch.cpp index 269324d9a9d4f..af753945e3d2f 100644 --- a/src/coreclr/src/jit/emitxarch.cpp +++ b/src/coreclr/src/jit/emitxarch.cpp @@ -10765,7 +10765,8 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // Check that the offset is properly aligned (i.e. the ddd in [ddd]) - assert((emitChkAlign == false) || (ins == INS_lea) || (((size_t)addr & (byteSize - 1)) == 0)); + assert((emitChkAlign == false) || (ins == INS_lea) || + ((byteSize < 16) && (((size_t)addr & (byteSize - 1)) == 0)) || (((size_t)addr & (16 - 1)) == 0)); } else { diff --git a/src/coreclr/src/jit/hwintrinsic.cpp b/src/coreclr/src/jit/hwintrinsic.cpp index 674b7f9946e0d..8f33c4851771b 100644 --- a/src/coreclr/src/jit/hwintrinsic.cpp +++ b/src/coreclr/src/jit/hwintrinsic.cpp @@ -292,16 +292,19 @@ CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, va // lookupId: Gets the NamedIntrinsic for a given method name and InstructionSet // // Arguments: +// comp -- The compiler +// sig -- The signature of the intrinsic // className -- The name of the class associated with the HWIntrinsic to lookup // methodName -- The name of the method associated with the HWIntrinsic to lookup // enclosingClassName -- The name of the enclosing class of X64 classes // // Return Value: // The NamedIntrinsic associated with methodName and isa -NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, - const char* className, - const char* methodName, - const char* enclosingClassName) +NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, + CORINFO_SIG_INFO* sig, + const char* className, + const char* methodName, + const char* enclosingClassName) { // TODO-Throughput: replace sequential search by binary search CORINFO_InstructionSet isa = lookupIsa(className, enclosingClassName); @@ -324,14 +327,23 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, for (int i = 0; i < (NI_HW_INTRINSIC_END - NI_HW_INTRINSIC_START - 1); i++) { + const HWIntrinsicInfo& intrinsicInfo = hwIntrinsicInfoArray[i]; + if (isa != hwIntrinsicInfoArray[i].isa) { continue; } - if (strcmp(methodName, hwIntrinsicInfoArray[i].name) == 0) + int numArgs = static_cast<int>(intrinsicInfo.numArgs); + + if ((numArgs != -1) && (sig->numArgs != static_cast<unsigned>(intrinsicInfo.numArgs))) + { + continue; + } + + if (strcmp(methodName, intrinsicInfo.name) == 0) { - return hwIntrinsicInfoArray[i].id; + return intrinsicInfo.id; } } diff --git a/src/coreclr/src/jit/hwintrinsic.h b/src/coreclr/src/jit/hwintrinsic.h index b3c00f45d9809..7e5fe7905d454 100644 --- a/src/coreclr/src/jit/hwintrinsic.h +++ b/src/coreclr/src/jit/hwintrinsic.h @@ -258,10 +258,11 @@ struct HWIntrinsicInfo static const HWIntrinsicInfo& lookup(NamedIntrinsic id); - static NamedIntrinsic lookupId(Compiler* comp, - const char* className, - const char* methodName, - const char* enclosingClassName); + static NamedIntrinsic lookupId(Compiler* comp, + CORINFO_SIG_INFO* sig, + const char* className, + const char* methodName, + const char* enclosingClassName); static CORINFO_InstructionSet lookupIsa(const char* className, const char* enclosingClassName); static unsigned lookupSimdSize(Compiler* comp, NamedIntrinsic id, CORINFO_SIG_INFO* sig); diff --git a/src/coreclr/src/jit/hwintrinsiclistxarch.h b/src/coreclr/src/jit/hwintrinsiclistxarch.h index 4be0cdadd809c..db2bacbbb6457 100644 ---
a/src/coreclr/src/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/src/jit/hwintrinsiclistxarch.h @@ -43,6 +43,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector2, HARDWARE_INTRINSIC(Vector128, AsVector3, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, AsVector4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) // The instruction generated for float/double depends on which ISAs are supported HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) @@ -76,6 +77,7 @@ HARDWARE_INTRINSIC(Vector256, AsVector256, HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) diff --git a/src/coreclr/src/jit/hwintrinsicxarch.cpp b/src/coreclr/src/jit/hwintrinsicxarch.cpp index 010cd5fa5063d..1ed4dd7716151 100644 --- a/src/coreclr/src/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/src/jit/hwintrinsicxarch.cpp @@ -521,6 +521,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, { GenTree* retNode = nullptr; GenTree* op1 = nullptr; + GenTree* op2 = nullptr; if (!featureSIMD) { @@ -747,6 +748,70 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector128_Create: + case NI_Vector256_Create: + { +#if defined(TARGET_X86) + if (varTypeIsLong(baseType)) + { + // TODO-XARCH-CQ: It may be beneficial to emit the movq + // instruction, which takes a 64-bit memory address and + // works on 32-bit x86 systems. + break; + } +#endif // TARGET_X86 + + // We shouldn't handle this as an intrinsic if the + // respective ISAs have been disabled by the user. + + if (intrinsic == NI_Vector256_Create) + { + if (!compExactlyDependsOn(InstructionSet_AVX)) + { + break; + } + } + else if (baseType == TYP_FLOAT) + { + if (!compExactlyDependsOn(InstructionSet_SSE)) + { + break; + } + } + else if (!compExactlyDependsOn(InstructionSet_SSE2)) + { + break; + } + + if (sig->numArgs == 1) + { + op1 = impPopStack().val; + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + } + else if (sig->numArgs == 2) + { + op2 = impPopStack().val; + op1 = impPopStack().val; + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize); + } + else + { + assert(sig->numArgs >= 3); + + GenTreeArgList* tmp = nullptr; + + for (unsigned i = 0; i < sig->numArgs; i++) + { + tmp = gtNewArgList(impPopStack().val); + tmp->gtOp2 = op1; + op1 = tmp; + } + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + } + break; + } + case NI_Vector128_CreateScalarUnsafe: { assert(sig->numArgs == 1); diff --git a/src/coreclr/src/jit/importer.cpp b/src/coreclr/src/jit/importer.cpp index 41c5e2540bb17..1be5bd797adf2 100644 --- a/src/coreclr/src/jit/importer.cpp +++ b/src/coreclr/src/jit/importer.cpp @@ -4504,7 +4504,10 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method) if ((namespaceName[0] == '\0') || (strcmp(namespaceName, platformNamespaceName) == 0)) { - result = HWIntrinsicInfo::lookupId(this, className, methodName, enclosingClassName); + CORINFO_SIG_INFO sig; + info.compCompHnd->getMethodSig(method, &sig); + + result = HWIntrinsicInfo::lookupId(this, &sig, className, methodName, enclosingClassName); } else if (strcmp(methodName, "get_IsSupported") == 0) { diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h index 9c79ede16869a..2fa26918e30f4 100644 --- a/src/coreclr/src/jit/lower.h +++ b/src/coreclr/src/jit/lower.h @@ -315,6 +315,7 @@ class Lowering final : public Phase #ifdef FEATURE_HW_INTRINSICS void LowerHWIntrinsic(GenTreeHWIntrinsic* node); void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition); + void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node); #endif // FEATURE_HW_INTRINSICS 
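// [Editorial aside -- a hedged sketch of the GT_LIST construction performed in the impBaseIntrinsic
// change above; the element names are hypothetical.] impPopStack() yields operands last-to-first,
// so prepending each popped value to the head of the list restores source order:
//   IL stack (top first): e2, e1, e0
//   i = 0: op1 = LIST(e2)
//   i = 1: op1 = LIST(e1, LIST(e2))
//   i = 2: op1 = LIST(e0, LIST(e1, LIST(e2)))   // i.e. e0, e1, e2 in source order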
diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp index ad0426bbb620d..1c36a650392d0 100644 --- a/src/coreclr/src/jit/lowerxarch.cpp +++ b/src/coreclr/src/jit/lowerxarch.cpp @@ -927,8 +927,26 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) { node->gtType = TYP_SIMD16; } - switch (node->gtHWIntrinsicId) + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + + switch (intrinsicId) { + case NI_Vector128_Create: + case NI_Vector256_Create: + { + // We don't directly support the Vector128.Create or Vector256.Create methods in codegen + // and instead lower them to other intrinsic nodes in LowerHWIntrinsicCreate, so we expect + // that the node is modified to either not be a HWIntrinsic node or that it is no longer + // the same intrinsic as when it came in. In the case of Vector256.Create, we may lower + // it into 2x Vector128.Create intrinsics which themselves are also lowered into other + // intrinsics that are not Vector*.Create. + + LowerHWIntrinsicCreate(node); + assert(!node->OperIsHWIntrinsic() || (node->gtHWIntrinsicId != intrinsicId)); + LowerNode(node); + return; + } + case NI_SSE2_CompareGreaterThan: { if (node->gtSIMDBaseType != TYP_DOUBLE) @@ -1081,6 +1099,1214 @@ void Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) ContainCheckHWIntrinsic(node); } + +union VectorConstant { + int8_t i8[32]; + uint8_t u8[32]; + int16_t i16[16]; + uint16_t u16[16]; + int32_t i32[8]; + uint32_t u32[8]; + int64_t i64[4]; + uint64_t u64[4]; + float f32[8]; + double f64[4]; +}; + +//---------------------------------------------------------------------------------------------- +// HandleArgForHWIntrinsicCreate: Processes an argument for the Lowering::LowerHWIntrinsicCreate method +// +// Arguments: +// arg - The argument to process +// argIdx - The index of the argument being processed +// vecCns - The vector constant being constructed +// baseType - The base type of the vector constant +// +// Returns: +// true if arg was a constant; otherwise, false +static bool HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, VectorConstant& vecCns, var_types baseType) +{ + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + if (arg->IsCnsIntOrI()) + { + vecCns.i8[argIdx] = static_cast<int8_t>(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + assert(vecCns.i8[argIdx] == 0); + } + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + if (arg->IsCnsIntOrI()) + { + vecCns.i16[argIdx] = static_cast<int16_t>(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + assert(vecCns.i16[argIdx] == 0); + } + break; + } + + case TYP_INT: + case TYP_UINT: + { + if (arg->IsCnsIntOrI()) + { + vecCns.i32[argIdx] = static_cast<int32_t>(arg->AsIntCon()->gtIconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + assert(vecCns.i32[argIdx] == 0); + } + break; + } + + case TYP_LONG: + case TYP_ULONG: + { + if (arg->OperIs(GT_CNS_LNG)) + { + vecCns.i64[argIdx] = static_cast<int64_t>(arg->AsLngCon()->gtLconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + assert(vecCns.i64[argIdx] == 0); + } + break; + } + + case TYP_FLOAT: + { + if (arg->IsCnsFltOrDbl()) + { + vecCns.f32[argIdx] = static_cast<float>(arg->AsDblCon()->gtDconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + // We check against the i32, rather than f32, to account for -0.0 + assert(vecCns.i32[argIdx] == 0); + } + break; + }
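// [Editorial aside, hedged] Why the integer view: 0.0f == -0.0f evaluates to true under IEEE 754
// comparison, but -0.0f has the bit pattern 0x80000000. Checking vecCns.i32 (and i64 below for
// double) instead of the floating-point members therefore catches a stale -0.0 that a
// floating-point compare against zero would miss.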
+ + case TYP_DOUBLE: + { + if (arg->IsCnsFltOrDbl()) + { + vecCns.f64[argIdx] = static_cast<double>(arg->AsDblCon()->gtDconVal); + return true; + } + else + { + // We expect the VectorConstant to have been already zeroed + // We check against the i64, rather than f64, to account for -0.0 + assert(vecCns.i64[argIdx] == 0); + } + break; + } + + default: + { + unreached(); + } + } + + return false; +} + +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicCreate: Lowers a Vector64, Vector128, or Vector256 Create call +// +// Arguments: +// node - The hardware intrinsic node. +// +void Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) +{ + NamedIntrinsic intrinsicId = node->gtHWIntrinsicId; + var_types simdType = node->gtType; + var_types baseType = node->gtSIMDBaseType; + unsigned simdSize = node->gtSIMDSize; + VectorConstant vecCns = {}; + + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(baseType)); + assert(simdSize != 0); + + GenTreeArgList* argList = nullptr; + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + + // Spare GenTrees to be used for the lowering logic below + // Defined upfront to avoid naming conflicts, etc... + GenTree* idx = nullptr; + GenTree* tmp1 = nullptr; + GenTree* tmp2 = nullptr; + GenTree* tmp3 = nullptr; + + assert(op1 != nullptr); + + unsigned argCnt = 0; + unsigned cnsArgCnt = 0; + + if (op1->OperIsList()) + { + assert(op2 == nullptr); + + for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) + { + if (HandleArgForHWIntrinsicCreate(argList->Current(), argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + } + } + else + { + if (HandleArgForHWIntrinsicCreate(op1, argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + + if (op2 != nullptr) + { + if (HandleArgForHWIntrinsicCreate(op2, argCnt, vecCns, baseType)) + { + cnsArgCnt += 1; + } + argCnt += 1; + } + else if (cnsArgCnt == 1) + { + // These intrinsics are meant to set the same value to every element + // so we'll just specially handle it here and copy it into the remaining + // indices. + + for (unsigned i = 1; i < simdSize / genTypeSize(baseType); i++) + { + HandleArgForHWIntrinsicCreate(op1, i, vecCns, baseType); + } + } + } + assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(baseType)))); + + if (argCnt == cnsArgCnt) + { + if (op1->OperIsList()) + { + for (argList = op1->AsArgList(); argList != nullptr; argList = argList->Rest()) + { + BlockRange().Remove(argList->Current()); + } + } + else + { + BlockRange().Remove(op1); + + if (op2 != nullptr) + { + BlockRange().Remove(op2); + } + } + + CORINFO_FIELD_HANDLE hnd = comp->GetEmitter()->emitAnyConst(&vecCns, simdSize, emitDataAlignment::Required); + GenTree* clsVarAddr = new (comp, GT_CLS_VAR_ADDR) GenTreeClsVar(GT_CLS_VAR_ADDR, TYP_I_IMPL, hnd, nullptr); + BlockRange().InsertBefore(node, clsVarAddr); + + node->ChangeOper(GT_IND); + node->gtOp1 = clsVarAddr; + + // TODO-XARCH-CQ: We should be able to modify at least the paths that use Insert to trivially support partial + // vector constants. With this, we can create a constant if say 50% of the inputs are also constant and just + // insert the non-constant values which should still allow some gains.
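// [Editorial aside -- a worked example, hedged] Vector128.Create(1, 2, 3, 4) with all-constant
// arguments reaches this path with vecCns.i32 = { 1, 2, 3, 4 }; the 16 bytes are emitted into the
// read-only data section via emitAnyConst and the Create node becomes an indirection off the
// constant's address:
//   clsVarAddr = CLS_VAR_ADDR 'rodata constant'
//   node       = IND simd16 (clsVarAddr)
// which codegen can presumably turn into a single aligned SIMD load (e.g. a RIP-relative movaps on x64).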
+ + return; + } + else if (argCnt == 1) + { + // We have the following (where simd is simd16 or simd32): + // /--* op1 T + // node = * HWINTRINSIC simd T Create + + if (intrinsicId == NI_Vector256_Create) + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd32 T BroadcastScalarToVector256 + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // return Avx2.BroadcastScalarToVector256(tmp1); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX2_BroadcastScalarToVector256; + return; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T Create + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp2 simd16 + // tmp3 = * HWINTRINSIC simd16 T ToVector256Unsafe + // idx = CNS_INT int 0 + // /--* tmp3 simd32 + // +--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd32 T InsertVector128 + + // This is roughly the following managed code: + // var tmp1 = Vector128.Create(op1); + // var tmp2 = tmp1; + // var tmp3 = tmp2.ToVector256Unsafe(); + // return Avx.InsertVector128(tmp3, tmp1, 0x01); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp3 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, tmp2, NI_Vector128_ToVector256Unsafe, baseType, 16); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(tmp3, idx); + + node->gtOp1 = comp->gtNewArgList(tmp3, tmp1, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX_InsertVector128; + return; + } + + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // ... + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + if ((baseType != TYP_DOUBLE) && comp->compOpportunisticallyDependsOn(InstructionSet_AVX2)) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd16 T BroadcastScalarToVector128 + + // This is roughly the following managed code: + // ... + // return Avx2.BroadcastScalarToVector128(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX2_BroadcastScalarToVector128; + return; + } + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSSE3)) + { + // We will be constructing the following parts: + // ... 
+ // tmp2 = HWINTRINSIC simd16 ubyte get_Zero + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 ubyte Shuffle + + // This is roughly the following managed code: + // ... + // var tmp2 = Vector128.Zero; + // return Ssse3.Shuffle(tmp1, tmp2); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, NI_Vector128_get_Zero, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(tmp1, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSSE3_Shuffle; + break; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 ubyte UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // tmp1 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + __fallthrough; + } + + case TYP_SHORT: + case TYP_USHORT: + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp1 = * HWINTRINSIC simd16 ushort UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // tmp1 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_USHORT, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + __fallthrough; + } + + case TYP_INT: + case TYP_UINT: + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 uint Shuffle + + // This is roughly the following managed code: + // ... + // return Sse2.Shuffle(tmp1, 0x00); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp1, idx); + + node->gtOp1 = tmp1; + node->gtOp2 = idx; + + node->gtHWIntrinsicId = NI_SSE2_Shuffle; + node->gtSIMDBaseType = TYP_UINT; + + break; + } + +#if defined(TARGET_AMD64) + case TYP_LONG: + case TYP_ULONG: + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 ulong UnpackLow + + // This is roughly the following managed code: + // ... 
+ // var tmp2 = tmp1; + // return Sse2.UnpackLow(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + break; + } +#endif // TARGET_AMD64 + + case TYP_FLOAT: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX)) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 float Permute + + // This is roughly the following managed code: + // ... + // return Avx.Permute(tmp1, 0x00); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp1, idx); + + node->gtOp1 = tmp1; + node->gtOp2 = idx; + + node->gtHWIntrinsicId = NI_AVX_Permute; + break; + } + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // idx = CNS_INT int 0 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // +--* idx int + // node = * HWINTRINSIC simd16 float Shuffle + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // return Sse.Shuffle(tmp1, tmp2, 0x00); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + idx = comp->gtNewIconNode(0x00, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, tmp2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE_Shuffle; + break; + } + + case TYP_DOUBLE: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE3)) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // node = * HWINTRINSIC simd16 double MoveAndDuplicate + + // This is roughly the following managed code: + // ... + // return Sse3.MoveAndDuplicate(tmp1); + + node->gtOp1 = tmp1; + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE3_MoveAndDuplicate; + break; + } + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 float MoveLowToHigh + + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // return Sse.MoveLowToHigh(tmp1, tmp2); + + node->gtOp1 = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->gtOp1, node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->gtOp1; + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + node->gtSIMDBaseType = TYP_FLOAT; + + break; + } + + default: + { + unreached(); + } + } + + return; + } + + // We have the following (where simd is simd16 or simd32): + // /--* op1 T + // +--* ... 
T + // +--* opN T + // node = * HWINTRINSIC simd T Create + + if (intrinsicId == NI_Vector256_Create) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + + // We will be constructing the following parts: + // /--* op1 T + // +--* ... T + // lo = * HWINTRINSIC simd16 T Create + // /--* ... T + // +--* opN T + // hi = * HWINTRINSIC simd16 T Create + // idx = CNS_INT int 1 + // /--* lo simd32 + // +--* hi simd16 + // +--* idx int + // node = * HWINTRINSIC simd32 T InsertVector128 + + // This is roughly the following managed code: + // ... + // var lo = Vector128.Create(op1, ...); + // var hi = Vector128.Create(..., opN); + // return Avx.InsertVector128(lo, hi, 0x01); + + // Each Vector128.Create call gets half the operands. That is: + // lo = Vector128.Create(op1, op2); + // hi = Vector128.Create(op3, op4); + // -or- + // lo = Vector128.Create(op1, ..., op3); + // hi = Vector128.Create(op4, ..., op7); + // -or- + // lo = Vector128.Create(op1, ..., op7); + // hi = Vector128.Create(op8, ..., op15); + // -or- + // lo = Vector128.Create(op1, ..., op15); + // hi = Vector128.Create(op16, ..., op31); + + unsigned halfArgCnt = argCnt / 2; + assert((halfArgCnt * 2) == argCnt); + + argList = op1->AsArgList(); + + for (unsigned i = 0; i < halfArgCnt; i++) + { + op2 = argList; + argList = argList->Rest(); + } + + op2->AsArgList()->gtOp2 = nullptr; + op2 = argList; + + // The above for loop splits the operand count into exactly half. + // Once it exits, op1 will point to op1 and op2 will point to the + // last operand that will be passed to the first Vector128.Create + // We will set its op2 to null, terminating the chain and then + // assign op2 to be argList, which is the first operand that will + // get passed to the second Vector128.Create + + GenTree* lo = nullptr; + GenTree* hi = nullptr; + + if (halfArgCnt == 2) + { + // The Vector256.Create calls that take 4 operands are special + // because the half argument count is 2, which means we can't + // actually use the GT_LIST anymore and need to pass them as + // explicit operands instead. 
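// [Editorial aside -- a worked example, hedged] For Vector256.Create(e0, ..., e7) of int,
// argCnt == 8 and halfArgCnt == 4, so the operand list is cut after e3 and the node lowers
// roughly as:
//   lo   = Vector128.Create(e0, e1, e2, e3);  // consumes op1's truncated list
//   hi   = Vector128.Create(e4, e5, e6, e7);  // consumes the tail list now held in op2
//   node = Avx.InsertVector128(lo, hi, 0x01); // hi becomes the upper 128 bits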
+ + argList = op1->AsArgList(); + + tmp1 = argList->Current(); + tmp2 = argList->Rest()->Current(); + + lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(tmp2, lo); + LowerNode(lo); + + argList = op2->AsArgList(); + + tmp1 = argList->Current(); + tmp2 = argList->Rest()->Current(); + + hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, tmp2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertAfter(tmp2, hi); + LowerNode(hi); + } + else + { + // The rest of the Vector256.Create calls take at least 8 operands + // and so the half count is at least 4 and we have to continue + // passing around GT_LIST nodes in op1 with a null op2 + assert(halfArgCnt >= 4); + + tmp1 = op2->AsArgList()->Current(); + + lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_Create, baseType, 16); + BlockRange().InsertBefore(tmp1, lo); + LowerNode(lo); + + hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_Create, baseType, 16); + BlockRange().InsertBefore(node, hi); + LowerNode(hi); + } + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(hi, idx); + + node->gtOp1 = comp->gtNewArgList(lo, hi, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_AVX_InsertVector128; + return; + } + + if (op1->OperIsList()) + { + argList = op1->AsArgList(); + op1 = argList->Current(); + argList = argList->Rest(); + } + + // We will be constructing the following parts: + // /--* op1 T + // tmp1 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // ... + + // This is roughly the following managed code: + // var tmp1 = Vector128.CreateScalarUnsafe(op1); + // ... + + tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op1, tmp1); + LowerNode(tmp1); + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + case TYP_INT: + case TYP_UINT: + { + unsigned N = 0; + GenTree* opN = nullptr; + NamedIntrinsic insIntrinsic = NI_Illegal; + + if ((baseType == TYP_SHORT) || (baseType == TYP_USHORT)) + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + insIntrinsic = NI_SSE2_Insert; + } + else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + insIntrinsic = NI_SSE41_Insert; + } + + if (insIntrinsic != NI_Illegal) + { + for (N = 1; N < argCnt - 1; N++) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T Insert + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Sse?.Insert(tmp1, opN, N); + // ... + + opN = argList->Current(); + + idx = comp->gtNewIconNode(N, TYP_INT); + BlockRange().InsertAfter(opN, idx); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, opN, idx, insIntrinsic, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + argList = argList->Rest(); + } + + assert(N == (argCnt - 1)); + + // We will be constructing the following parts: + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... + // tmp1 = Sse?.Insert(tmp1, opN, N); + // ... 
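// [Editorial aside -- a worked example, hedged] For Vector128.Create((short)e0, ..., (short)e7)
// the loop above produces an insert chain, with the final element folded into the Create node
// itself rather than a new tree being built:
//   tmp1 = Vector128.CreateScalarUnsafe(e0);
//   tmp1 = Sse2.Insert(tmp1, e1, 1);
//   ...
//   tmp1 = Sse2.Insert(tmp1, e6, 6);
//   node = Sse2.Insert(tmp1, e7, 7);  // the node is retyped in place below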
+ + opN = argList->Current(); + + idx = comp->gtNewIconNode(N, TYP_INT); + BlockRange().InsertAfter(opN, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, opN, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = insIntrinsic; + break; + } + + assert((baseType != TYP_SHORT) && (baseType != TYP_USHORT)); + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + GenTree* op[16]; + op[0] = tmp1; + + for (N = 1; N < argCnt; N++) + { + opN = argList->Current(); + + op[N] = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, op[N]); + LowerNode(op[N]); + + argList = argList->Rest(); + } + assert(argList == nullptr); + + if ((baseType == TYP_BYTE) || (baseType == TYP_UBYTE)) + { + for (N = 0; N < argCnt; N += 4) + { + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // tmp3 = * HWINTRINSIC simd16 T UnpackLow + // ... + + // This is roughly the following managed code: + // ... + // tmp1 = Sse2.UnpackLow(opN, opO); + // tmp2 = Sse2.UnpackLow(opP, opQ); + // tmp3 = Sse2.UnpackLow(tmp1, tmp2); + // ... + + unsigned O = N + 1; + unsigned P = N + 2; + unsigned Q = N + 3; + + tmp1 = + comp->gtNewSimdHWIntrinsicNode(simdType, op[N], op[O], NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(op[O], tmp1); + LowerNode(tmp1); + + tmp2 = + comp->gtNewSimdHWIntrinsicNode(simdType, op[P], op[Q], NI_SSE2_UnpackLow, TYP_UBYTE, simdSize); + BlockRange().InsertAfter(op[Q], tmp2); + LowerNode(tmp2); + + tmp3 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_SSE2_UnpackLow, TYP_USHORT, simdSize); + BlockRange().InsertAfter(tmp2, tmp3); + LowerNode(tmp3); + + // This caches the result in index 0 through 3, depending on which + // loop iteration this is and allows the rest of the logic to be + // shared with the TYP_INT and TYP_UINT path. + + op[N / 4] = tmp3; + } + } + + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T UnpackLow + + // This is roughly the following managed code: + // ... 
+ // tmp1 = Sse2.UnpackLow(opN, opO); + // tmp2 = Sse2.UnpackLow(opP, opQ); + // return Sse2.UnpackLow(tmp1, tmp2); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_SSE2_UnpackLow, TYP_UINT, simdSize); + BlockRange().InsertAfter(op[1], tmp1); + LowerNode(tmp1); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_SSE2_UnpackLow, TYP_UINT, simdSize); + BlockRange().InsertAfter(op[3], tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + node->gtSIMDBaseType = TYP_ULONG; + break; + } + +#if defined(TARGET_AMD64) + case TYP_LONG: + case TYP_ULONG: + { + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41_X64)) + { + // We will be constructing the following parts: + // ... + // idx = CNS_INT int 1 + // /--* tmp1 simd16 + // +--* op2 T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... + // return Sse41.X64.Insert(tmp1, op2, 0x01); + + idx = comp->gtNewIconNode(0x01, TYP_INT); + BlockRange().InsertAfter(op2, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, op2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE41_X64_Insert; + break; + } + + // We will be constructing the following parts: + // ... + // /--* op2 T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T UnpackLow + + // This is roughly the following managed code: + // ... + // var tmp2 = Vector128.CreateScalarUnsafe(op2); + // return Sse2.UnpackLow(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op2, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE2_UnpackLow; + break; + } +#endif // TARGET_AMD64 + + case TYP_FLOAT: + { + unsigned N = 0; + GenTree* opN = nullptr; + + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + for (N = 1; N < argCnt - 1; N++) + { + // We will be constructing the following parts: + // ... + // + // /--* opN T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // tmp1 = * HWINTRINSIC simd16 T Insert + // ... + + // This is roughly the following managed code: + // ... + // tmp2 = Vector128.CreateScalarUnsafe(opN); + // tmp1 = Sse41.Insert(tmp1, tmp2, N << 4); + // ... + + opN = argList->Current(); + + tmp2 = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, tmp2); + LowerNode(tmp2); + + idx = comp->gtNewIconNode(N << 4, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + tmp1 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, idx, NI_SSE41_Insert, baseType, simdSize); + BlockRange().InsertAfter(idx, tmp1); + LowerNode(tmp1); + + argList = argList->Rest(); + } + + // We will be constructing the following parts: + // ... + // + // /--* opN T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // idx = CNS_INT int N + // /--* tmp1 simd16 + // +--* opN T + // +--* idx int + // node = * HWINTRINSIC simd16 T Insert + + // This is roughly the following managed code: + // ... 
+ // tmp2 = Vector128.CreateScalarUnsafe(opN); + // return Sse41.Insert(tmp1, tmp2, N << 4); + + opN = argList->Current(); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, tmp2); + LowerNode(tmp2); + + idx = comp->gtNewIconNode((argCnt - 1) << 4, TYP_INT); + BlockRange().InsertAfter(tmp2, idx); + + node->gtOp1 = comp->gtNewArgList(tmp1, tmp2, idx); + node->gtOp2 = nullptr; + + node->gtHWIntrinsicId = NI_SSE41_Insert; + break; + } + + // We will be constructing the following parts: + // ... + // /--* opN T + // opN = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opO T + // opO = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opN simd16 + // +--* opO simd16 + // tmp1 = * HWINTRINSIC simd16 T UnpackLow + // /--* opP T + // opP = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opQ T + // opQ = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* opP simd16 + // +--* opQ simd16 + // tmp2 = * HWINTRINSIC simd16 T UnpackLow + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T MoveLowToHigh + + // This is roughly the following managed code: + // ... + // tmp1 = Sse.UnpackLow(opN, opO); + // tmp2 = Sse.UnpackLow(opP, opQ); + // return Sse.MoveLowToHigh(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE)); + + GenTree* op[4]; + op[0] = tmp1; + + for (N = 1; N < argCnt; N++) + { + opN = argList->Current(); + + op[N] = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, opN, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(opN, op[N]); + LowerNode(op[N]); + + argList = argList->Rest(); + } + assert(argList == nullptr); + + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op[0], op[1], NI_SSE_UnpackLow, baseType, simdSize); + BlockRange().InsertAfter(op[1], tmp1); + LowerNode(tmp1); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(simdType, op[2], op[3], NI_SSE_UnpackLow, baseType, simdSize); + BlockRange().InsertAfter(op[3], tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + break; + } + + case TYP_DOUBLE: + { + // We will be constructing the following parts: + // ... + // /--* op2 T + // tmp2 = * HWINTRINSIC simd16 T CreateScalarUnsafe + // /--* tmp1 simd16 + // +--* tmp2 simd16 + // node = * HWINTRINSIC simd16 T MoveLowToHigh + + // This is roughly the following managed code: + // ... 
+ // var tmp2 = Vector128.CreateScalarUnsafe(op2); + // return Sse.MoveLowToHigh(tmp1, tmp2); + + assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2)); + + tmp2 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_Vector128_CreateScalarUnsafe, baseType, 16); + BlockRange().InsertAfter(op2, tmp2); + LowerNode(tmp2); + + node->gtOp1 = tmp1; + node->gtOp2 = tmp2; + + node->gtHWIntrinsicId = NI_SSE_MoveLowToHigh; + node->gtSIMDBaseType = TYP_FLOAT; + + break; + } + + default: + { + unreached(); + } + } +} #endif // FEATURE_HW_INTRINSICS //---------------------------------------------------------------------------------------------- diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index 5d55d00f11b8a..db15743c6dcba 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -259,31 +259,12 @@ public static Vector<T> AsVector<T>(this Vector128<T> value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_set1_epi8</remarks> /// <returns>A new <see cref="Vector128{Byte}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128<byte> Create(byte value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128<byte> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - - if (Ssse3.IsSupported) - { - Vector128<byte> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Ssse3.Shuffle(result, Vector128<byte>.Zero); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - - if (Sse2.IsSupported) - { - // We first unpack as bytes to duplicate value into the lower 2 bytes, then we treat it as a ushort and unpack again to duplicate those - // bits into the lower 2 words, we can finally treat it as a uint and shuffle the lower dword to duplicate value across the entire result - - Vector128<byte> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result.AsUInt16(), result.AsUInt16()).AsByte(); // < v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Sse2.Shuffle(result.AsUInt32(), 0x00).AsByte(); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -318,23 +299,12 @@ static Vector128<byte> SoftwareFallback(byte value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128d _mm_set1_pd</remarks> /// <returns>A new <see cref="Vector128{Double}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128<double> Create(double value) { - if (Sse3.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128<double> result = CreateScalarUnsafe(value); // < v, ? >
- return Sse3.MoveAndDuplicate(result); // < v, v > - } - - if (Sse2.IsSupported) - { - // Treating the value as a set of singles and emitting MoveLowToHigh is more efficient than dealing with the elements directly as double - // However, we still need to check if Sse2 is supported since CreateScalarUnsafe needs it to for movsd, when value is not already in register - - Vector128<double> result = CreateScalarUnsafe(value); // < v, ? > - return Sse.MoveLowToHigh(result.AsSingle(), result.AsSingle()).AsDouble(); // < v, v > + return Create(value); } return SoftwareFallback(value); @@ -355,24 +325,12 @@ static Vector128<double> SoftwareFallback(double value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_set1_epi16</remarks> /// <returns>A new <see cref="Vector128{Int16}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128<short> Create(short value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128<short> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v, v, v, v, v > - } - - if (Sse2.IsSupported) - { - // We first unpack as ushort to duplicate value into the lower 2 words, then we can treat it as a uint and shuffle the lower dword to - // duplicate value across the entire result - - Vector128<short> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ? > - return Sse2.Shuffle(result.AsInt32(), 0x00).AsInt16(); // < v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -399,20 +357,12 @@ static Vector128<short> SoftwareFallback(short value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_set1_epi32</remarks> /// <returns>A new <see cref="Vector128{Int32}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128<int> Create(int value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128<int> result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v > - } - - if (Sse2.IsSupported) - { - Vector128<int> result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Sse2.Shuffle(result, 0x00); // < v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -435,22 +385,12 @@ static Vector128<int> SoftwareFallback(int value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_set1_epi64x</remarks> /// <returns>A new <see cref="Vector128{Int64}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128<long> Create(long value) { - if (Sse2.X64.IsSupported) + if (Sse2.X64.IsSupported || AdvSimd.IsSupported) { - if (Avx2.IsSupported) - { - Vector128<long> result = CreateScalarUnsafe(value); // < v, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v > - } - else - { - Vector128<long> result = CreateScalarUnsafe(value); // < v, ? > - return Sse2.UnpackLow(result, result); // < v, v > - } + return Create(value); } return SoftwareFallback(value); @@ -471,32 +411,13 @@ static Vector128<long> SoftwareFallback(long value)
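// [Editorial note, hedged] The recursive-looking `return Create(value);` calls above are
// intentional: each Create overload is now marked [Intrinsic], so when the guarding IsSupported
// check passes the JIT expands the call through the new NI_Vector128_Create/NI_Vector256_Create
// import and lowering paths rather than emitting a real (recursive) call; the managed body only
// runs as the software fallback when the intrinsic expansion does not apply.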
/// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_set1_epi8</remarks> /// <returns>A new <see cref="Vector128{SByte}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CLSCompliant(false)] [Intrinsic] + [CLSCompliant(false)] public static unsafe Vector128<sbyte> Create(sbyte value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128<sbyte> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - - if (Ssse3.IsSupported) - { - Vector128<sbyte> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Ssse3.Shuffle(result, Vector128<sbyte>.Zero); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > - } - - if (Sse2.IsSupported) - { - // We first unpack as bytes to duplicate value into the lower 2 bytes, then we treat it as a ushort and unpack again to duplicate those - // bits into the lower 2 words, we can finally treat it as a uint and shuffle the lower dword to duplicate value across the entire result - - Vector128<sbyte> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result.AsInt16(), result.AsInt16()).AsSByte(); // < v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? > - return Sse2.Shuffle(result.AsInt32(), 0x00).AsSByte(); // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -531,26 +452,12 @@ static Vector128<sbyte> SoftwareFallback(sbyte value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128 _mm_set1_ps</remarks> /// <returns>A new <see cref="Vector128{Single}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] [Intrinsic] public static unsafe Vector128<float> Create(float value) { - if (Avx2.IsSupported) + if (Sse.IsSupported || AdvSimd.IsSupported) { - Vector128<float> result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v > - } - - if (Avx.IsSupported) - { - Vector128<float> result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx.Permute(result, 0x00); // < v, v, v, v > - } - - if (Sse.IsSupported) - { - Vector128<float> result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Sse.Shuffle(result, result, 0x00); // < v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -573,25 +480,13 @@ static Vector128<float> SoftwareFallback(float value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_set1_epi16</remarks> /// <returns>A new <see cref="Vector128{UInt16}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CLSCompliant(false)] [Intrinsic] + [CLSCompliant(false)] public static unsafe Vector128<ushort> Create(ushort value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128<ushort> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v, v, v, v, v > - } - - if (Sse2.IsSupported) - { - // We first unpack as ushort to duplicate value into the lower 2 words, then we can treat it as a uint and shuffle the lower dword to - // duplicate value across the entire result - - Vector128<ushort> result = CreateScalarUnsafe(value); // < v, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.UnpackLow(result, result); // < v, v, ?, ?, ?, ?, ?, ? >
- return Sse2.Shuffle(result.AsUInt32(), 0x00).AsUInt16(); // < v, v, v, v, v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -618,21 +513,13 @@ static Vector128<ushort> SoftwareFallback(ushort value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_set1_epi32</remarks> /// <returns>A new <see cref="Vector128{UInt32}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CLSCompliant(false)] [Intrinsic] + [CLSCompliant(false)] public static unsafe Vector128<uint> Create(uint value) { - if (Avx2.IsSupported) + if (Sse2.IsSupported || AdvSimd.IsSupported) { - Vector128<uint> result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v, v, v > - } - - if (Sse2.IsSupported) - { - Vector128<uint> result = CreateScalarUnsafe(value); // < v, ?, ?, ? > - return Sse2.Shuffle(result, 0x00); // < v, v, v, v > + return Create(value); } return SoftwareFallback(value); @@ -655,23 +542,13 @@ static Vector128<uint> SoftwareFallback(uint value) /// <param name="value">The value that all elements will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_set1_epi64x</remarks> /// <returns>A new <see cref="Vector128{UInt64}" /> with all elements initialized to <paramref name="value" />.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] - [CLSCompliant(false)] [Intrinsic] + [CLSCompliant(false)] public static unsafe Vector128<ulong> Create(ulong value) { - if (Sse2.X64.IsSupported) + if (Sse2.X64.IsSupported || AdvSimd.IsSupported) { - if (Avx2.IsSupported) - { - Vector128<ulong> result = CreateScalarUnsafe(value); // < v, ? > - return Avx2.BroadcastScalarToVector128(result); // < v, v > - } - else - { - Vector128<ulong> result = CreateScalarUnsafe(value); // < v, ? > - return Sse2.UnpackLow(result, result); // < v, v > - } + return Create(value); } return SoftwareFallback(value); @@ -707,62 +584,15 @@ static Vector128<ulong> SoftwareFallback(ulong value) /// <param name="e15">The value that element 15 will be initialized to.</param> /// <remarks>On x86, this method corresponds to __m128i _mm_setr_epi8</remarks> /// <returns>A new <see cref="Vector128{Byte}" /> with each element initialized to corresponding specified value.</returns> - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128<byte> Create(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7, byte e8, byte e9, byte e10, byte e11, byte e12, byte e13, byte e14, byte e15) { - if (Sse41.IsSupported) - { - Vector128<byte> result = CreateScalarUnsafe(e0); // < 0, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e1, 1); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e2, 2); // < 0, 1, 2, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e3, 3); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e8, 8); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, ??, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e9, 9); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ??, ??, ??, ??, ??, ?? > - result = Sse41.Insert(result, e10, 10); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ??, ??, ??, ??, ?? >
> - result = Sse41.Insert(result, e11, 11); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ??, ??, ??, ?? > - result = Sse41.Insert(result, e12, 12); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ??, ??, ?? > - result = Sse41.Insert(result, e13, 13); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ??, ?? > - result = Sse41.Insert(result, e14, 14); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ?? > - return Sse41.Insert(result, e15, 15); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > - } - +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - // We deal with the elements in order, unpacking the ordered pairs of bytes into vectors. We then treat those vectors as ushort and - // unpack them again, then again treating those results as uint, and a final time treating them as ulong. This efficiently gets all - // bytes ordered into the result. - - Vector128 lo16, hi16; - Vector128 lo32, hi32; - Vector128 lo64, hi64; - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsUInt16(); // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsUInt16(); // < 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - lo32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e4), CreateScalarUnsafe(e5)).AsUInt16(); // < 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e6), CreateScalarUnsafe(e7)).AsUInt16(); // < 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo64 = Sse2.UnpackLow(lo32, hi32).AsUInt64(); // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e8), CreateScalarUnsafe(e9)).AsUInt16(); // < 8, 9, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e10), CreateScalarUnsafe(e11)).AsUInt16(); // < 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - lo32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 8, 9, 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e12), CreateScalarUnsafe(e13)).AsUInt16(); // < 12, 13, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e14), CreateScalarUnsafe(e15)).AsUInt16(); // < 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - hi32 = Sse2.UnpackLow(lo16, hi16).AsUInt32(); // < 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? > - - hi64 = Sse2.UnpackLow(lo32, hi32).AsUInt64(); // < 8, 9, 10, 11, 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ?? 
> - - return Sse2.UnpackLow(lo64, hi64).AsByte(); // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 > + return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -782,6 +612,7 @@ public static unsafe Vector128 Create(byte e0, byte e1, byte e2, byte e3, result = AdvSimd.Insert(result, 14, e14); return AdvSimd.Insert(result, 15, e15); } +#endif return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); @@ -816,22 +647,21 @@ static Vector128 SoftwareFallback(byte e0, byte e1, byte e2, byte e3, byte /// The value that element 1 will be initialized to. /// On x86, this method corresponds to __m128d _mm_setr_pd /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128 Create(double e0, double e1) { +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - // Treating the value as a set of singles and emitting MoveLowToHigh is more efficient than dealing with the elements directly as double - // However, we still need to check if Sse2 is supported since CreateScalarUnsafe needs it to for movsd, when value is not already in register - - return Sse.MoveLowToHigh(CreateScalarUnsafe(e0).AsSingle(), CreateScalarUnsafe(e1).AsSingle()).AsDouble(); + return Create(e0, e1); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); return AdvSimd.Insert(result, 1, e1); } +#endif return SoftwareFallback(e0, e1); @@ -858,21 +688,15 @@ static Vector128 SoftwareFallback(double e0, double e1) /// The value that element 7 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi16 /// A new with each element initialized to corresponding specified value. - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [Intrinsic] public static unsafe Vector128 Create(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7) { +#if !TARGET_ARM && !TARGET_ARM64 if (Sse2.IsSupported) { - Vector128 result = CreateScalarUnsafe(e0); // < 0, ?, ?, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e1, 1); // < 0, 1, ?, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e2, 2); // < 0, 1, 2, ?, ?, ?, ?, ? > - result = Sse2.Insert(result, e3, 3); // < 0, 1, 2, 3, ?, ?, ?, ? > - result = Sse2.Insert(result, e4, 4); // < 0, 1, 2, 3, 4, ?, ?, ? > - result = Sse2.Insert(result, e5, 5); // < 0, 1, 2, 3, 4, 5, ?, ? > - result = Sse2.Insert(result, e6, 6); // < 0, 1, 2, 3, 4, 5, 6, ? > - return Sse2.Insert(result, e7, 7); // < 0, 1, 2, 3, 4, 5, 6, 7 > + return Create(e0, e1, e2, e3, e4, e5, e6, e7); } - +#else if (AdvSimd.IsSupported) { Vector128 result = CreateScalarUnsafe(e0); @@ -884,6 +708,7 @@ public static unsafe Vector128 Create(short e0, short e1, short e2, short result = AdvSimd.Insert(result, 6, e6); return AdvSimd.Insert(result, 7, e7); } +#endif return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7); @@ -912,28 +737,15 @@ static Vector128 SoftwareFallback(short e0, short e1, short e2, short e3, /// The value that element 3 will be initialized to. /// On x86, this method corresponds to __m128i _mm_setr_epi32 /// A new with each element initialized to corresponding specified value. 
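Every Create overload rewritten above follows the same shape: the IsSupported check stays in managed code, but the supported path is now a plain self-call. Because the method carries [Intrinsic], the JIT recognizes the recursive call inside a method it is already expanding and emits the ideal hardware sequence in its place; the recursion never executes at runtime and only keeps the IL a complete, verifiable description of the fallback. A minimal sketch of the shape (hypothetical Broadcast helper, not the BCL source; since [Intrinsic] is internal to System.Private.CoreLib, the guarded branch here spells out one possible expansion by hand):

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    public static class VectorPatterns
    {
        public static unsafe Vector128<int> Broadcast(int value)
        {
            if (Sse2.IsSupported)
            {
                // In CoreLib this branch is literally `return Broadcast(value);`
                // and the JIT substitutes the expansion below (or better, e.g.
                // vpbroadcastd when AVX2 is available).
                Vector128<int> result = Vector128.CreateScalarUnsafe(value);
                return Sse2.Shuffle(result, 0x00);
            }

            return SoftwareFallback(value);

            static unsafe Vector128<int> SoftwareFallback(int value)
            {
                // Scalar path for ISAs without vector support; fills each
                // lane through memory.
                int* pResult = stackalloc int[4] { value, value, value, value };
                return *(Vector128<int>*)pResult;
            }
        }
    }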
@@ -912,28 +737,15 @@ static Vector128<short> SoftwareFallback(short e0, short e1, short e2, short e3,
         /// <param name="e3">The value that element 3 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m128i _mm_setr_epi32</remarks>
         /// <returns>A new <see cref="Vector128{Int32}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector128<int> Create(int e0, int e1, int e2, int e3)
         {
-            if (Sse41.IsSupported)
-            {
-                Vector128<int> result = CreateScalarUnsafe(e0);  // < 0, ?, ?, ? >
-                result = Sse41.Insert(result, e1, 1);  // < 0, 1, ?, ? >
-                result = Sse41.Insert(result, e2, 2);  // < 0, 1, 2, ? >
-                return Sse41.Insert(result, e3, 3);  // < 0, 1, 2, 3 >
-            }
-
+#if !TARGET_ARM && !TARGET_ARM64
             if (Sse2.IsSupported)
             {
-                // We deal with the elements in order, unpacking the ordered pairs of int into vectors. We then treat those vectors as ulong and
-                // unpack them again. This efficiently gets all ints ordered into the result.
-
-                Vector128<long> lo64, hi64;
-                lo64 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsInt64();  // < 0, 1, ?, ? >
-                hi64 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsInt64();  // < 2, 3, ?, ? >
-                return Sse2.UnpackLow(lo64, hi64).AsInt32();  // < 0, 1, 2, 3 >
+                return Create(e0, e1, e2, e3);
             }
-
+#else
             if (AdvSimd.IsSupported)
             {
                 Vector128<int> result = CreateScalarUnsafe(e0);
@@ -941,6 +753,7 @@ public static unsafe Vector128<int> Create(int e0, int e1, int e2, int e3)
                 result = AdvSimd.Insert(result, 2, e2);
                 return AdvSimd.Insert(result, 3, e3);
             }
+#endif
 
             return SoftwareFallback(e0, e1, e2, e3);
@@ -963,25 +776,21 @@ static Vector128<int> SoftwareFallback(int e0, int e1, int e2, int e3)
         /// <param name="e1">The value that element 1 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m128i _mm_setr_epi64x</remarks>
         /// <returns>A new <see cref="Vector128{Int64}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector128<long> Create(long e0, long e1)
         {
-            if (Sse41.X64.IsSupported)
-            {
-                Vector128<long> result = CreateScalarUnsafe(e0);  // < 0, ? >
-                return Sse41.X64.Insert(result, e1, 1);  // < 0, 1 >
-            }
-
+#if !TARGET_ARM && !TARGET_ARM64
             if (Sse2.X64.IsSupported)
             {
-                return Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1));  // < 0, 1 >
+                return Create(e0, e1);
             }
-
+#else
             if (AdvSimd.IsSupported)
             {
                 Vector128<long> result = CreateScalarUnsafe(e0);
                 return AdvSimd.Insert(result, 1, e1);
             }
+#endif
 
             return SoftwareFallback(e0, e1);
@@ -1016,63 +825,16 @@ static Vector128<long> SoftwareFallback(long e0, long e1)
         /// <param name="e15">The value that element 15 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m128i _mm_setr_epi8</remarks>
         /// <returns>A new <see cref="Vector128{SByte}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector128<sbyte> Create(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7, sbyte e8, sbyte e9, sbyte e10, sbyte e11, sbyte e12, sbyte e13, sbyte e14, sbyte e15)
         {
-            if (Sse41.IsSupported)
-            {
-                Vector128<sbyte> result = CreateScalarUnsafe(e0);  // < 0, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e1, 1);  // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e2, 2);  // < 0, 1, 2, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e3, 3);  // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e4, 4);  // < 0, 1, 2, 3, 4, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e5, 5);  // < 0, 1, 2, 3, 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e6, 6);  // < 0, 1, 2, 3, 4, 5, 6, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e7, 7);  // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e8, 8);  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, ??, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e9, 9);  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ??, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e10, 10);  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ??, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e11, 11);  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ??, ??, ??, ?? >
-                result = Sse41.Insert(result, e12, 12);  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ??, ??, ?? >
-                result = Sse41.Insert(result, e13, 13);  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ??, ?? >
-                result = Sse41.Insert(result, e14, 14);  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ?? >
-                return Sse41.Insert(result, e15, 15);  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 >
-            }
-
+#if !TARGET_ARM && !TARGET_ARM64
             if (Sse2.IsSupported)
             {
-                // We deal with the elements in order, unpacking the ordered pairs of bytes into vectors. We then treat those vectors as ushort and
-                // unpack them again, then again treating those results as uint, and a final time treating them as ulong. This efficiently gets all
-                // bytes ordered into the result.
-
-                Vector128<short> lo16, hi16;
-                Vector128<int> lo32, hi32;
-                Vector128<long> lo64, hi64;
-
-                lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsInt16();  // < 0, 1, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsInt16();  // < 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                lo32 = Sse2.UnpackLow(lo16, hi16).AsInt32();  // < 0, 1, 2, 3, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-
-                lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e4), CreateScalarUnsafe(e5)).AsInt16();  // < 4, 5, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e6), CreateScalarUnsafe(e7)).AsInt16();  // < 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                hi32 = Sse2.UnpackLow(lo16, hi16).AsInt32();  // < 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-
-                lo64 = Sse2.UnpackLow(lo32, hi32).AsInt64();  // < 0, 1, 2, 3, 4, 5, 6, 7, ??, ??, ??, ??, ??, ??, ??, ?? >
-
-                lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e8), CreateScalarUnsafe(e9)).AsInt16();  // < 8, 9, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e10), CreateScalarUnsafe(e11)).AsInt16();  // < 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                lo32 = Sse2.UnpackLow(lo16, hi16).AsInt32();  // < 8, 9, 10, 11, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-
-                lo16 = Sse2.UnpackLow(CreateScalarUnsafe(e12), CreateScalarUnsafe(e13)).AsInt16();  // < 12, 13, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                hi16 = Sse2.UnpackLow(CreateScalarUnsafe(e14), CreateScalarUnsafe(e15)).AsInt16();  // < 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-                hi32 = Sse2.UnpackLow(lo16, hi16).AsInt32();  // < 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ??, ?? >
-
-                hi64 = Sse2.UnpackLow(lo32, hi32).AsInt64();  // < 8, 9, 10, 11, 12, 13, 14, 15, ??, ??, ??, ??, ??, ??, ??, ?? >
-
-                return Sse2.UnpackLow(lo64, hi64).AsSByte();  // < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 >
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
             }
-
+#else
             if (AdvSimd.IsSupported)
             {
                 Vector128<sbyte> result = CreateScalarUnsafe(e0);
@@ -1092,6 +854,7 @@ public static unsafe Vector128<sbyte> Create(sbyte e0, sbyte e1, sbyte e2, sbyte
                 result = AdvSimd.Insert(result, 14, e14);
                 return AdvSimd.Insert(result, 15, e15);
             }
+#endif
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
@@ -1128,25 +891,15 @@ static Vector128<sbyte> SoftwareFallback(sbyte e0, sbyte e1, sbyte e2, sbyte e3,
         /// <param name="e3">The value that element 3 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m128 _mm_setr_ps</remarks>
         /// <returns>A new <see cref="Vector128{Single}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector128<float> Create(float e0, float e1, float e2, float e3)
         {
-            if (Sse41.IsSupported)
-            {
-                Vector128<float> result = CreateScalarUnsafe(e0);  // < 0, ?, ?, ? >
-                result = Sse41.Insert(result, CreateScalarUnsafe(e1), 0x10);  // < 0, 1, ?, ? >
-                result = Sse41.Insert(result, CreateScalarUnsafe(e2), 0x20);  // < 0, 1, 2, ? >
-                return Sse41.Insert(result, CreateScalarUnsafe(e3), 0x30);  // < 0, 1, 2, 3 >
-            }
-
+#if !TARGET_ARM && !TARGET_ARM64
             if (Sse.IsSupported)
             {
-                Vector128<float> lo64, hi64;
-                lo64 = Sse.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1));  // < 0, 1, ?, ? >
-                hi64 = Sse.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3));  // < 2, 3, ?, ? >
-                return Sse.MoveLowToHigh(lo64, hi64);  // < 0, 1, 2, 3 >
+                return Create(e0, e1, e2, e3);
             }
-
+#else
             if (AdvSimd.IsSupported)
             {
                 Vector128<float> result = CreateScalarUnsafe(e0);
@@ -1154,6 +907,7 @@ public static unsafe Vector128<float> Create(float e0, float e1, float e2, float
                 result = AdvSimd.Insert(result, 2, e2);
                 return AdvSimd.Insert(result, 3, e3);
             }
+#endif
 
             return SoftwareFallback(e0, e1, e2, e3);
@@ -1182,22 +936,16 @@ static Vector128<float> SoftwareFallback(float e0, float e1, float e2, float e3)
         /// <param name="e7">The value that element 7 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m128i _mm_setr_epi16</remarks>
         /// <returns>A new <see cref="Vector128{UInt16}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector128<ushort> Create(ushort e0, ushort e1, ushort e2, ushort e3, ushort e4, ushort e5, ushort e6, ushort e7)
         {
+#if !TARGET_ARM && !TARGET_ARM64
             if (Sse2.IsSupported)
             {
-                Vector128<ushort> result = CreateScalarUnsafe(e0);  // < 0, ?, ?, ?, ?, ?, ?, ? >
-                result = Sse2.Insert(result, e1, 1);  // < 0, 1, ?, ?, ?, ?, ?, ? >
-                result = Sse2.Insert(result, e2, 2);  // < 0, 1, 2, ?, ?, ?, ?, ? >
-                result = Sse2.Insert(result, e3, 3);  // < 0, 1, 2, 3, ?, ?, ?, ? >
-                result = Sse2.Insert(result, e4, 4);  // < 0, 1, 2, 3, 4, ?, ?, ? >
-                result = Sse2.Insert(result, e5, 5);  // < 0, 1, 2, 3, 4, 5, ?, ? >
-                result = Sse2.Insert(result, e6, 6);  // < 0, 1, 2, 3, 4, 5, 6, ? >
-                return Sse2.Insert(result, e7, 7);  // < 0, 1, 2, 3, 4, 5, 6, 7 >
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }
-
+#else
             if (AdvSimd.IsSupported)
             {
                 Vector128<ushort> result = CreateScalarUnsafe(e0);
@@ -1209,6 +957,7 @@ public static unsafe Vector128<ushort> Create(ushort e0, ushort e1, ushort e2, u
                 result = AdvSimd.Insert(result, 6, e6);
                 return AdvSimd.Insert(result, 7, e7);
             }
+#endif
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
@@ -1237,29 +986,16 @@ static Vector128<ushort> SoftwareFallback(ushort e0, ushort e1, ushort e2, ushor
         /// <param name="e3">The value that element 3 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m128i _mm_setr_epi32</remarks>
         /// <returns>A new <see cref="Vector128{UInt32}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector128<uint> Create(uint e0, uint e1, uint e2, uint e3)
         {
-            if (Sse41.IsSupported)
-            {
-                Vector128<uint> result = CreateScalarUnsafe(e0);  // < 0, ?, ?, ? >
-                result = Sse41.Insert(result, e1, 1);  // < 0, 1, ?, ? >
-                result = Sse41.Insert(result, e2, 2);  // < 0, 1, 2, ? >
-                return Sse41.Insert(result, e3, 3);  // < 0, 1, 2, 3 >
-            }
-
+#if !TARGET_ARM && !TARGET_ARM64
             if (Sse2.IsSupported)
             {
-                // We deal with the elements in order, unpacking the ordered pairs of int into vectors. We then treat those vectors as ulong and
-                // unpack them again. This efficiently gets all ints ordered into the result.
-
-                Vector128<ulong> lo64, hi64;
-                lo64 = Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1)).AsUInt64();  // < 0, 1, ?, ? >
-                hi64 = Sse2.UnpackLow(CreateScalarUnsafe(e2), CreateScalarUnsafe(e3)).AsUInt64();  // < 2, 3, ?, ? >
-                return Sse2.UnpackLow(lo64, hi64).AsUInt32();  // < 0, 1, 2, 3 >
+                return Create(e0, e1, e2, e3);
             }
-
+#else
             if (AdvSimd.IsSupported)
             {
                 Vector128<uint> result = CreateScalarUnsafe(e0);
@@ -1267,6 +1003,7 @@ public static unsafe Vector128<uint> Create(uint e0, uint e1, uint e2, uint e3)
                 result = AdvSimd.Insert(result, 2, e2);
                 return AdvSimd.Insert(result, 3, e3);
             }
+#endif
 
             return SoftwareFallback(e0, e1, e2, e3);
@@ -1289,26 +1026,22 @@ static Vector128<uint> SoftwareFallback(uint e0, uint e1, uint e2, uint e3)
         /// <param name="e1">The value that element 1 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m128i _mm_setr_epi64x</remarks>
         /// <returns>A new <see cref="Vector128{UInt64}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector128<ulong> Create(ulong e0, ulong e1)
         {
-            if (Sse41.X64.IsSupported)
-            {
-                Vector128<ulong> result = CreateScalarUnsafe(e0);  // < 0, ? >
-                return Sse41.X64.Insert(result, e1, 1);  // < 0, 1 >
-            }
-
+#if !TARGET_ARM && !TARGET_ARM64
             if (Sse2.X64.IsSupported)
             {
-                return Sse2.UnpackLow(CreateScalarUnsafe(e0), CreateScalarUnsafe(e1));  // < 0, 1 >
+                return Create(e0, e1);
             }
-
+#else
             if (AdvSimd.IsSupported)
             {
                 Vector128<ulong> result = CreateScalarUnsafe(e0);
                 return AdvSimd.Insert(result, 1, e1);
             }
+#endif
 
             return SoftwareFallback(e0, e1);
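That completes Vector128.cs; the Vector256 half below leans on it, since once the 128-bit Create methods are first-class intrinsics the 256-bit ones can be expanded the same way instead of stitching together managed helpers. Callers are unaffected apart from codegen quality; for instance (the expected lowering in the comments is the intent of the change, not something the diff itself asserts):

    // All sixteen lanes set to 0x80: expected to become a single broadcast
    // (vpbroadcastb under AVX2, dup under AdvSimd) instead of an inlined
    // chain of unpacks and shuffles.
    Vector128<byte> signMask = Vector128.Create((byte)0x80);

    // Per-element form: when every argument is a constant this can fold to
    // one 16-byte load from suitably aligned read-only data, which is what
    // the emitter and allocMem alignment changes earlier in this diff support.
    Vector128<int> iota = Vector128.Create(0, 1, 2, 3);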
diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
index 2a123ab52099c..ae4e95608b033 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs
@@ -208,19 +208,12 @@ public static Vector<T> AsVector<T>(this Vector256<T> value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_set1_epi8</remarks>
         /// <returns>A new <see cref="Vector256{Byte}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<byte> Create(byte value)
         {
-            if (Avx2.IsSupported)
-            {
-                Vector128<byte> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? >
-                return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v >
-            }
-
             if (Avx.IsSupported)
             {
-                Vector128<byte> result = Vector128.Create(value);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? >
-                return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v >
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -271,19 +264,12 @@ static Vector256<byte> SoftwareFallback(byte value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256d _mm256_set1_pd</remarks>
         /// <returns>A new <see cref="Vector256{Double}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<double> Create(double value)
         {
-            if (Avx2.IsSupported)
-            {
-                Vector128<double> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ? >
-                return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v >
-            }
-
             if (Avx.IsSupported)
             {
-                Vector128<double> result = Vector128.Create(value);  // < v, v, ?, ? >
-                return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v >
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -306,19 +292,12 @@ static Vector256<double> SoftwareFallback(double value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_set1_epi16</remarks>
         /// <returns>A new <see cref="Vector256{Int16}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<short> Create(short value)
         {
-            if (Avx2.IsSupported)
-            {
-                Vector128<short> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? >
-                return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v >
-            }
-
             if (Avx.IsSupported)
             {
-                Vector128<short> result = Vector128.Create(value);  // < v, v, v, v, v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ? >
-                return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v >
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -353,19 +332,12 @@ static Vector256<short> SoftwareFallback(short value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_set1_epi32</remarks>
         /// <returns>A new <see cref="Vector256{Int32}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<int> Create(int value)
        {
-            if (Avx2.IsSupported)
-            {
-                Vector128<int> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ?, ?, ?, ?, ? >
-                return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v, v, v, v, v >
-            }
-
             if (Avx.IsSupported)
             {
-                Vector128<int> result = Vector128.Create(value);  // < v, v, v, v, ?, ?, ?, ? >
-                return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v, v, v, v, v >
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -392,21 +364,12 @@ static Vector256<int> SoftwareFallback(int value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_set1_epi64x</remarks>
         /// <returns>A new <see cref="Vector256{Int64}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<long> Create(long value)
         {
-            if (Sse2.X64.IsSupported)
+            if (Sse2.X64.IsSupported && Avx.IsSupported)
             {
-                if (Avx2.IsSupported)
-                {
-                    Vector128<long> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ? >
-                    return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v >
-                }
-                else if (Avx.IsSupported)
-                {
-                    Vector128<long> result = Vector128.Create(value);  // < v, v, ?, ? >
-                    return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v >
-                }
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -429,20 +392,13 @@ static Vector256<long> SoftwareFallback(long value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_set1_epi8</remarks>
         /// <returns>A new <see cref="Vector256{SByte}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<sbyte> Create(sbyte value)
         {
-            if (Avx2.IsSupported)
-            {
-                Vector128<sbyte> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? >
-                return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v >
-            }
-
             if (Avx.IsSupported)
             {
-                Vector128<sbyte> result = Vector128.Create(value);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? >
-                return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v >
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -493,19 +449,12 @@ static Vector256<sbyte> SoftwareFallback(sbyte value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256 _mm256_set1_ps</remarks>
         /// <returns>A new <see cref="Vector256{Single}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<float> Create(float value)
         {
-            if (Avx2.IsSupported)
-            {
-                Vector128<float> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ?, ?, ?, ?, ? >
-                return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v, v, v, v, v >
-            }
-
             if (Avx.IsSupported)
             {
-                Vector128<float> result = Vector128.Create(value);  // < v, v, v, v, ?, ?, ?, ? >
-                return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v, v, v, v, v >
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -532,20 +481,13 @@ static Vector256<float> SoftwareFallback(float value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_set1_epi16</remarks>
         /// <returns>A new <see cref="Vector256{UInt16}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<ushort> Create(ushort value)
         {
-            if (Avx2.IsSupported)
-            {
-                Vector128<ushort> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? >
-                return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v >
-            }
-
             if (Avx.IsSupported)
             {
-                Vector128<ushort> result = Vector128.Create(value);  // < v, v, v, v, v, v, v, v, ?, ?, ?, ?, ?, ?, ?, ? >
-                return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v >
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -580,20 +522,13 @@ static Vector256<ushort> SoftwareFallback(ushort value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_set1_epi32</remarks>
         /// <returns>A new <see cref="Vector256{UInt32}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<uint> Create(uint value)
         {
-            if (Avx2.IsSupported)
-            {
-                Vector128<uint> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ?, ?, ?, ?, ? >
-                return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v, v, v, v, v >
-            }
-
             if (Avx.IsSupported)
             {
-                Vector128<uint> result = Vector128.Create(value);  // < v, v, v, v, ?, ?, ?, ? >
-                return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v, v, v, v, v >
+                return Create(value);
             }
 
             return SoftwareFallback(value);
@@ -620,22 +555,13 @@ static Vector256<uint> SoftwareFallback(uint value)
         /// <param name="value">The value that all elements will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_set1_epi64x</remarks>
         /// <returns>A new <see cref="Vector256{UInt64}" /> with all elements initialized to <paramref name="value" />.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<ulong> Create(ulong value)
         {
-            if (Sse2.X64.IsSupported)
+            if (Sse2.X64.IsSupported && Avx.IsSupported)
             {
-                if (Avx2.IsSupported)
-                {
-                    Vector128<ulong> result = Vector128.CreateScalarUnsafe(value);  // < v, ?, ?, ? >
-                    return Avx2.BroadcastScalarToVector256(result);  // < v, v, v, v >
-                }
-                else if (Avx.IsSupported)
-                {
-                    Vector128<ulong> result = Vector128.Create(value);  // < v, v, ?, ? >
-                    return Avx.InsertVector128(result.ToVector256Unsafe(), result, 1);  // < v, v, v, v >
-                }
+                return Create(value);
             }
 
             return SoftwareFallback(value);
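Note the compound guard that the 64-bit element overloads gain: Sse2.X64.IsSupported covers moving a long/ulong into a vector register (only possible in a 64-bit process), while Avx.IsSupported covers producing the 256-bit result, and the intrinsic path is taken only when both hold. Callers never need to repeat the checks; in a 32-bit process the software fallback computes the identical value:

    // Valid on any platform and bitness; only the chosen code path differs.
    Vector256<long> allBitsSet = Vector256.Create(-1L);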
@@ -689,14 +615,12 @@ static Vector256<ulong> SoftwareFallback(ulong value)
         /// <param name="e31">The value that element 31 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_setr_epi8</remarks>
         /// <returns>A new <see cref="Vector256{Byte}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<byte> Create(byte e0, byte e1, byte e2, byte e3, byte e4, byte e5, byte e6, byte e7, byte e8, byte e9, byte e10, byte e11, byte e12, byte e13, byte e14, byte e15, byte e16, byte e17, byte e18, byte e19, byte e20, byte e21, byte e22, byte e23, byte e24, byte e25, byte e26, byte e27, byte e28, byte e29, byte e30, byte e31)
         {
             if (Avx.IsSupported)
             {
-                Vector128<byte> lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
-                Vector128<byte> hi128 = Vector128.Create(e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
             }
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
@@ -750,14 +674,12 @@ static Vector256<byte> SoftwareFallback(byte e0, byte e1, byte e2, byte e3, byte
         /// <param name="e3">The value that element 3 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256d _mm256_setr_pd</remarks>
         /// <returns>A new <see cref="Vector256{Double}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<double> Create(double e0, double e1, double e2, double e3)
         {
             if (Avx.IsSupported)
             {
-                Vector128<double> lo128 = Vector128.Create(e0, e1);
-                Vector128<double> hi128 = Vector128.Create(e2, e3);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3);
             }
 
             return SoftwareFallback(e0, e1, e2, e3);
@@ -795,14 +717,12 @@ static Vector256<double> SoftwareFallback(double e0, double e1, double e2, doubl
         /// <param name="e15">The value that element 15 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_setr_epi16</remarks>
         /// <returns>A new <see cref="Vector256{Int16}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<short> Create(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7, short e8, short e9, short e10, short e11, short e12, short e13, short e14, short e15)
         {
             if (Avx.IsSupported)
             {
-                Vector128<short> lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7);
-                Vector128<short> hi128 = Vector128.Create(e8, e9, e10, e11, e12, e13, e14, e15);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
             }
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
@@ -844,14 +764,12 @@ static Vector256<short> SoftwareFallback(short e0, short e1, short e2, short e3,
         /// <param name="e7">The value that element 7 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_setr_epi32</remarks>
         /// <returns>A new <see cref="Vector256{Int32}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<int> Create(int e0, int e1, int e2, int e3, int e4, int e5, int e6, int e7)
         {
             if (Avx.IsSupported)
             {
-                Vector128<int> lo128 = Vector128.Create(e0, e1, e2, e3);
-                Vector128<int> hi128 = Vector128.Create(e4, e5, e6, e7);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
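The deleted bodies above all built the result from two Vector128 halves via Create(lo128, hi128); that overload remains public and is still the natural idiom when two 128-bit values already exist. Under Avx it amounts to a single insert (a sketch, assuming Avx.IsSupported has already been checked):

    // Join two 128-bit halves; `lower` fills elements 0..3, `upper` 4..7.
    static Vector256<int> FromHalves(Vector128<int> lower, Vector128<int> upper) =>
        Avx.InsertVector128(lower.ToVector256Unsafe(), upper, 1);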
@@ -881,14 +799,12 @@ static Vector256<int> SoftwareFallback(int e0, int e1, int e2, int e3, int e4, i
         /// <param name="e3">The value that element 3 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_setr_epi64x</remarks>
         /// <returns>A new <see cref="Vector256{Int64}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<long> Create(long e0, long e1, long e2, long e3)
         {
             if (Sse2.X64.IsSupported && Avx.IsSupported)
             {
-                Vector128<long> lo128 = Vector128.Create(e0, e1);
-                Vector128<long> hi128 = Vector128.Create(e2, e3);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3);
             }
 
             return SoftwareFallback(e0, e1, e2, e3);
@@ -942,15 +858,13 @@ static Vector256<long> SoftwareFallback(long e0, long e1, long e2, long e3)
         /// <param name="e31">The value that element 31 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_setr_epi8</remarks>
         /// <returns>A new <see cref="Vector256{SByte}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<sbyte> Create(sbyte e0, sbyte e1, sbyte e2, sbyte e3, sbyte e4, sbyte e5, sbyte e6, sbyte e7, sbyte e8, sbyte e9, sbyte e10, sbyte e11, sbyte e12, sbyte e13, sbyte e14, sbyte e15, sbyte e16, sbyte e17, sbyte e18, sbyte e19, sbyte e20, sbyte e21, sbyte e22, sbyte e23, sbyte e24, sbyte e25, sbyte e26, sbyte e27, sbyte e28, sbyte e29, sbyte e30, sbyte e31)
         {
             if (Avx.IsSupported)
             {
-                Vector128<sbyte> lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
-                Vector128<sbyte> hi128 = Vector128.Create(e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
             }
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19, e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31);
@@ -1008,14 +922,12 @@ static Vector256<sbyte> SoftwareFallback(sbyte e0, sbyte e1, sbyte e2, sbyte e3,
         /// <param name="e7">The value that element 7 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256 _mm256_setr_ps</remarks>
         /// <returns>A new <see cref="Vector256{Single}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         public static unsafe Vector256<float> Create(float e0, float e1, float e2, float e3, float e4, float e5, float e6, float e7)
         {
             if (Avx.IsSupported)
             {
-                Vector128<float> lo128 = Vector128.Create(e0, e1, e2, e3);
-                Vector128<float> hi128 = Vector128.Create(e4, e5, e6, e7);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
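As the _mm256_setr_* names in the remarks indicate, the per-element factories take their arguments in element order, lowest element first (setr is the reversed-argument form of _mm256_set_*). A quick sanity check:

    Vector256<float> v = Vector256.Create(0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f);
    Debug.Assert(v.GetElement(0) == 0f);  // first argument is the lowest element
    Debug.Assert(v.GetElement(7) == 7f);  // last argument is the highest element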
@@ -1057,15 +969,13 @@ static Vector256<float> SoftwareFallback(float e0, float e1, float e2, float e3,
         /// <param name="e15">The value that element 15 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_setr_epi16</remarks>
         /// <returns>A new <see cref="Vector256{UInt16}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<ushort> Create(ushort e0, ushort e1, ushort e2, ushort e3, ushort e4, ushort e5, ushort e6, ushort e7, ushort e8, ushort e9, ushort e10, ushort e11, ushort e12, ushort e13, ushort e14, ushort e15)
         {
             if (Avx.IsSupported)
             {
-                Vector128<ushort> lo128 = Vector128.Create(e0, e1, e2, e3, e4, e5, e6, e7);
-                Vector128<ushort> hi128 = Vector128.Create(e8, e9, e10, e11, e12, e13, e14, e15);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
             }
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15);
@@ -1107,15 +1017,13 @@ static Vector256<ushort> SoftwareFallback(ushort e0, ushort e1, ushort e2, ushor
         /// <param name="e7">The value that element 7 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_setr_epi32</remarks>
         /// <returns>A new <see cref="Vector256{UInt32}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<uint> Create(uint e0, uint e1, uint e2, uint e3, uint e4, uint e5, uint e6, uint e7)
         {
             if (Avx.IsSupported)
             {
-                Vector128<uint> lo128 = Vector128.Create(e0, e1, e2, e3);
-                Vector128<uint> hi128 = Vector128.Create(e4, e5, e6, e7);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3, e4, e5, e6, e7);
             }
 
             return SoftwareFallback(e0, e1, e2, e3, e4, e5, e6, e7);
@@ -1145,15 +1053,13 @@ static Vector256<uint> SoftwareFallback(uint e0, uint e1, uint e2, uint e3, uint
         /// <param name="e3">The value that element 3 will be initialized to.</param>
         /// <remarks>On x86, this method corresponds to __m256i _mm256_setr_epi64x</remarks>
         /// <returns>A new <see cref="Vector256{UInt64}" /> with each element initialized to corresponding specified value.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [Intrinsic]
         [CLSCompliant(false)]
         public static unsafe Vector256<ulong> Create(ulong e0, ulong e1, ulong e2, ulong e3)
         {
             if (Sse2.X64.IsSupported && Avx.IsSupported)
             {
-                Vector128<ulong> lo128 = Vector128.Create(e0, e1);
-                Vector128<ulong> hi128 = Vector128.Create(e2, e3);
-                return Create(lo128, hi128);
+                return Create(e0, e1, e2, e3);
             }
 
             return SoftwareFallback(e0, e1, e2, e3);
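With both files rewritten, every Create overload shares the guarded self-call plus SoftwareFallback shape, which also makes the change straightforward to smoke-test: whichever path the JIT picks, the produced lanes must match the argument list. For example:

    ulong[] expected = { 1, 2, 3, 4 };
    Vector256<ulong> v = Vector256.Create(1UL, 2UL, 3UL, 4UL);

    for (int i = 0; i < Vector256<ulong>.Count; i++)
    {
        // GetElement reads the lane back through memory, so this observes
        // whatever the intrinsic expansion (or the fallback) really built.
        Debug.Assert(v.GetElement(i) == expected[i]);
    }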