Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Vector128 and Vector256.Create methods #35857

Merged
merged 14 commits into from
May 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 47 additions & 2 deletions src/coreclr/src/ToolBox/superpmi/superpmi/icorjitinfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1578,6 +1578,18 @@ bool MyICJI::runWithErrorTrap(void (*function)(void*), void* param)
return RunWithErrorTrap(function, param);
}

// Ideally we'd just use the copies of this in standardmacros.h
// however, superpmi is missing various other dependencies as well
// Round `val` up to the next multiple of `alignment`.
// `alignment` must be a power of two; the mask arithmetic below relies on it.
// Mirrors ALIGN_UP from standardmacros.h, duplicated here because superpmi
// cannot pull in that header's other dependencies.
static size_t ALIGN_UP_SPMI(size_t val, size_t alignment)
{
    const size_t mask = alignment - 1;
    return (val + mask) & ~mask;
}

// Pointer-flavored overload: rounds the address `val` up to `alignment`
// (a power of two) by round-tripping through the integral overload.
static void* ALIGN_UP_SPMI(void* val, size_t alignment)
{
    const size_t raw = reinterpret_cast<size_t>(val);
    return reinterpret_cast<void*>(ALIGN_UP_SPMI(raw, alignment));
}

// get a block of memory for the code, readonly data, and read-write data
void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
ULONG coldCodeSize, /* IN */
Expand All @@ -1590,13 +1602,46 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */
)
{
jitInstance->mc->cr->AddCall("allocMem");
// TODO-Cleanup: investigate if we need to check roDataBlock as well. Could hot block size be ever 0?

// TODO-Cleanup: Could hot block size be ever 0?
*hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize);

if (coldCodeSize > 0)
*coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize);
else
*coldCodeBlock = nullptr;
*roDataBlock = jitInstance->mc->cr->allocateMemory(roDataSize);

if (roDataSize > 0)
{
size_t roDataAlignment = sizeof(void*);
size_t roDataAlignedSize = static_cast<size_t>(roDataSize);

if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN) != 0)
{
roDataAlignment = 32;
}
else if ((flag & CORJIT_ALLOCMEM_FLG_RODATA_16BYTE_ALIGN) != 0)
{
roDataAlignment = 16;
}
else if (roDataSize >= 8)
{
roDataAlignment = 8;
}

// We need to round the roDataSize up to the alignment size and then
// overallocate by at most alignment - sizeof(void*) to ensure that
// we can offset roDataBlock to be an aligned address and that the
// allocation contains at least the originally requested size after

roDataAlignedSize = ALIGN_UP_SPMI(roDataAlignedSize, roDataAlignment);
roDataAlignedSize = roDataAlignedSize + (roDataAlignment - sizeof(void*));
*roDataBlock = jitInstance->mc->cr->allocateMemory(roDataAlignedSize);
*roDataBlock = ALIGN_UP_SPMI(*roDataBlock, roDataAlignment);
}
else
*roDataBlock = nullptr;

jitInstance->mc->cr->recAllocMem(hotCodeSize, coldCodeSize, roDataSize, xcptnsCount, flag, hotCodeBlock,
coldCodeBlock, roDataBlock);
}
Expand Down
7 changes: 4 additions & 3 deletions src/coreclr/src/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5339,10 +5339,11 @@ UNATIVE_OFFSET emitter::emitDataGenBeg(UNATIVE_OFFSET size, bool align)
{
// Data can have any size but since alignment is deduced from the size there's no
// way to have a larger data size (e.g. 128) and request 4/8/16 byte alignment.
// 32 bytes (and more) alignment requires VM support (see ICorJitInfo::allocMem).
assert(size <= 16);
// As such, we restrict data above 16 bytes to be a multiple of 16 and assume 16-byte
// alignment. Alignment greater than 16 requires VM support (see ICorJitInfo::allocMem).
assert((size <= 16) || ((size % 16) == 0));

if (size == 16)
if (size >= 16)
{
emitConsDsc.align16 = true;
}
Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/src/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10765,7 +10765,8 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
}

// Check that the offset is properly aligned (i.e. the ddd in [ddd])
assert((emitChkAlign == false) || (ins == INS_lea) || (((size_t)addr & (byteSize - 1)) == 0));
assert((emitChkAlign == false) || (ins == INS_lea) ||
((byteSize < 16) && (((size_t)addr & (byteSize - 1)) == 0)) || (((size_t)addr & (16 - 1)) == 0));
}
else
{
Expand Down
24 changes: 18 additions & 6 deletions src/coreclr/src/jit/hwintrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -292,16 +292,19 @@ CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, va
// lookupId: Gets the NamedIntrinsic for a given method name and InstructionSet
//
// Arguments:
// comp -- The compiler
// sig -- The signature of the intrinsic
// className -- The name of the class associated with the HWIntrinsic to lookup
// methodName -- The name of the method associated with the HWIntrinsic to lookup
// enclosingClassName -- The name of the enclosing class of X64 classes
//
// Return Value:
// The NamedIntrinsic associated with methodName and isa
NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
const char* className,
const char* methodName,
const char* enclosingClassName)
NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,
CORINFO_SIG_INFO* sig,
const char* className,
const char* methodName,
const char* enclosingClassName)
{
// TODO-Throughput: replace sequential search by binary search
CORINFO_InstructionSet isa = lookupIsa(className, enclosingClassName);
Expand All @@ -324,14 +327,23 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp,

for (int i = 0; i < (NI_HW_INTRINSIC_END - NI_HW_INTRINSIC_START - 1); i++)
{
const HWIntrinsicInfo& intrinsicInfo = hwIntrinsicInfoArray[i];

if (isa != hwIntrinsicInfoArray[i].isa)
{
continue;
}

if (strcmp(methodName, hwIntrinsicInfoArray[i].name) == 0)
int numArgs = static_cast<unsigned>(intrinsicInfo.numArgs);

if ((numArgs != -1) && (sig->numArgs != static_cast<unsigned>(intrinsicInfo.numArgs)))
{
continue;
}

if (strcmp(methodName, intrinsicInfo.name) == 0)
{
return hwIntrinsicInfoArray[i].id;
return intrinsicInfo.id;
}
}

Expand Down
9 changes: 5 additions & 4 deletions src/coreclr/src/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,10 +258,11 @@ struct HWIntrinsicInfo

static const HWIntrinsicInfo& lookup(NamedIntrinsic id);

static NamedIntrinsic lookupId(Compiler* comp,
const char* className,
const char* methodName,
const char* enclosingClassName);
static NamedIntrinsic lookupId(Compiler* comp,
CORINFO_SIG_INFO* sig,
const char* className,
const char* methodName,
const char* enclosingClassName);
static CORINFO_InstructionSet lookupIsa(const char* className, const char* enclosingClassName);

static unsigned lookupSimdSize(Compiler* comp, NamedIntrinsic id, CORINFO_SIG_INFO* sig);
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/src/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector2,
HARDWARE_INTRINSIC(Vector128, AsVector3, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, AsVector4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
// The instruction generated for float/double depends on which ISAs are supported
HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
Expand Down Expand Up @@ -76,6 +77,7 @@ HARDWARE_INTRINSIC(Vector256, AsVector256,
HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
Expand Down
65 changes: 65 additions & 0 deletions src/coreclr/src/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
GenTree* retNode = nullptr;
GenTree* op1 = nullptr;
GenTree* op2 = nullptr;

if (!featureSIMD)
{
Expand Down Expand Up @@ -747,6 +748,70 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector128_Create:
case NI_Vector256_Create:
{
#if defined(TARGET_X86)
if (varTypeIsLong(baseType))
{
// TODO-XARCH-CQ: It may be beneficial to emit the movq
// instruction, which takes a 64-bit memory address and
// works on 32-bit x86 systems.
break;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you chose not to do this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not something we are handling today anywhere else and we have an identical TODO in the other places.

I believe this would just require us creating a GT_IND node since x86 requires it be a long* or ulong*, but I'm not positive if that is the ideal/correct way to handle it

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noting, this is also not something we are handling in the managed implementation today.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I gathered that, but was just curious, given the comment.

Copy link
Member Author

@tannergooding tannergooding May 6, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not super familiar with the differences between the different addressing forms in the JIT and when each is appropriate to use.

Do we have a helper function that will create a T* or ref T from an arbitrary T (In this case a ulong* from a ulong) when the given operand on the stack could be a constant, local, field, byref, or other indirection?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC you really want a GT_ADDR, to get the address of an operand. In the JIT, addresses are not strongly typed; they are always TYP_BYREF, so it would just be something like:

GenTree* addr = gtNewOperNode(GT_ADDR, TYP_BYREF, op);

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though I'd note that I'm not really pressing for you to do this now - I was just curious why you added the comment rather than doing it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll attempt this in a follow up PR, hopefully it is that straightforward 😄

}
#endif // TARGET_X86

// We shouldn't handle this as an intrinsic if the
// respective ISAs have been disabled by the user.

if (intrinsic == NI_Vector256_Create)
{
if (!compExactlyDependsOn(InstructionSet_AVX))
{
break;
}
}
else if (baseType == TYP_FLOAT)
{
if (!compExactlyDependsOn(InstructionSet_SSE))
{
break;
}
}
else if (!compExactlyDependsOn(InstructionSet_SSE2))
{
break;
}

if (sig->numArgs == 1)
{
op1 = impPopStack().val;
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
}
else if (sig->numArgs == 2)
{
op2 = impPopStack().val;
op1 = impPopStack().val;
retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize);
}
else
{
assert(sig->numArgs >= 3);

GenTreeArgList* tmp = nullptr;

for (unsigned i = 0; i < sig->numArgs; i++)
{
tmp = gtNewArgList(impPopStack().val);
tmp->gtOp2 = op1;
op1 = tmp;
}

retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
}
break;
}

case NI_Vector128_CreateScalarUnsafe:
{
assert(sig->numArgs == 1);
Expand Down
5 changes: 4 additions & 1 deletion src/coreclr/src/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4504,7 +4504,10 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method)

if ((namespaceName[0] == '\0') || (strcmp(namespaceName, platformNamespaceName) == 0))
{
result = HWIntrinsicInfo::lookupId(this, className, methodName, enclosingClassName);
CORINFO_SIG_INFO sig;
info.compCompHnd->getMethodSig(method, &sig);

result = HWIntrinsicInfo::lookupId(this, &sig, className, methodName, enclosingClassName);
}
else if (strcmp(methodName, "get_IsSupported") == 0)
{
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/src/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ class Lowering final : public Phase
#ifdef FEATURE_HW_INTRINSICS
void LowerHWIntrinsic(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
#endif // FEATURE_HW_INTRINSICS

Expand Down
Loading