Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize Vector128 and Vector256.Create methods #35857

Merged
merged 14 commits into from
May 8, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/coreclr/src/jit/emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5339,10 +5339,11 @@ UNATIVE_OFFSET emitter::emitDataGenBeg(UNATIVE_OFFSET size, bool align)
{
// Data can have any size but since alignment is deduced from the size there's no
// way to have a larger data size (e.g. 128) and request 4/8/16 byte alignment.
// 32 bytes (and more) alignment requires VM support (see ICorJitInfo::allocMem).
assert(size <= 16);
// As such, we restrict data above 16 bytes to be a multiple of 16 and assume 16-byte
// alignment. Alignment greater than 16 requires VM support (see ICorJitInfo::allocMem).
assert((size <= 16) || ((size % 16) == 0));

if (size == 16)
if (size >= 16)
{
emitConsDsc.align16 = true;
}
Expand Down
3 changes: 2 additions & 1 deletion src/coreclr/src/jit/emitxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10765,7 +10765,8 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
}

// Check that the offset is properly aligned (i.e. the ddd in [ddd])
assert((emitChkAlign == false) || (ins == INS_lea) || (((size_t)addr & (byteSize - 1)) == 0));
assert((emitChkAlign == false) || (ins == INS_lea) ||
((byteSize < 16) && (((size_t)addr & (byteSize - 1)) == 0)) || (((size_t)addr & (16 - 1)) == 0));
}
else
{
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/src/jit/hwintrinsiclistxarch.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ HARDWARE_INTRINSIC(Vector128, AsVector2,
HARDWARE_INTRINSIC(Vector128, AsVector3, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, AsVector4, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, AsVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
// The instruction generated for float/double depends on which ISAs are supported
HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
Expand Down Expand Up @@ -76,6 +77,7 @@ HARDWARE_INTRINSIC(Vector256, AsVector256,
HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmppd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
Expand Down
65 changes: 65 additions & 0 deletions src/coreclr/src/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
GenTree* retNode = nullptr;
GenTree* op1 = nullptr;
GenTree* op2 = nullptr;

if (!featureSIMD)
{
Expand Down Expand Up @@ -747,6 +748,70 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_Vector128_Create:
case NI_Vector256_Create:
{
#if defined(TARGET_X86)
if (varTypeIsLong(baseType))
{
// TODO-XARCH-CQ: It may be beneficial to emit the movq
// instruction, which takes a 64-bit memory address and
// works on 32-bit x86 systems.
break;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you chose not to do this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not something we are handling today anywhere else and we have an identical TODO in the other places.

I believe this would just require us creating a GT_IND node since x86 requires it be a long* or ulong*, but I'm not positive if that is the ideal/correct way to handle it

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noting, this is also not something we are handling in the managed implementation today.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, I gathered that, but was just curious, given the comment.

Copy link
Member Author

@tannergooding tannergooding May 6, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not super familiar with the differences between the different addressing forms in the JIT and when each is appropriate to use.

Do we have a helper function that will create a T* or ref T from an arbitrary T (In this case a ulong* from a ulong) when the given operand on the stack could be a constant, local, field, byref, or other indirection?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC you really want a GT_ADDR, to get the address of an operand. I the JIT addresses are not strongly typed, they are always TYP_BYREF, so it would just be something like:

GenTree* addr = gtNewOperNode(GT_ADDR, TYP_BYREF, op);

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Though I'd note that I'm not really pressing for you to do this now - I was just curious why you added the comment rather than doing it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll attempt this in a follow up PR, hopefully it is that straightforward 😄

}
#endif // TARGET_X86

// We shouldn't handle this as an intrinsic if the
// respective ISAs have been disabled by the user.

if (intrinsic == NI_Vector256_Create)
{
if (!compExactlyDependsOn(InstructionSet_AVX))
{
break;
}
}
else if (baseType == TYP_FLOAT)
{
if (!compExactlyDependsOn(InstructionSet_SSE))
{
break;
}
}
else if (!compExactlyDependsOn(InstructionSet_SSE2))
{
break;
}

if (sig->numArgs == 1)
{
op1 = impPopStack().val;
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
}
else if (sig->numArgs == 2)
{
op2 = impPopStack().val;
op1 = impPopStack().val;
retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize);
}
else
{
assert(sig->numArgs >= 3);

GenTreeArgList* tmp = nullptr;

for (unsigned i = 0; i < sig->numArgs; i++)
{
tmp = gtNewArgList(impPopStack().val);
tmp->gtOp2 = op1;
op1 = tmp;
}

retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
}
break;
}

case NI_Vector128_CreateScalarUnsafe:
{
assert(sig->numArgs == 1);
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/src/jit/lower.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,7 @@ class Lowering final : public Phase
#ifdef FEATURE_HW_INTRINSICS
void LowerHWIntrinsic(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition);
void LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node);
void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
#endif // FEATURE_HW_INTRINSICS

Expand Down
Loading