Optimize bswap+mov to movbe on xarch #66965

Merged: 11 commits, merged on May 8, 2022
Changes from 7 commits
1 change: 1 addition & 0 deletions src/coreclr/inc/clrconfigvalues.h
@@ -757,6 +757,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableBMI2, W("EnableBMI2"), 1
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableFMA, W("EnableFMA"), 1, "Allows FMA+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableLZCNT, W("EnableLZCNT"), 1, "Allows LZCNT+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnablePCLMULQDQ, W("EnablePCLMULQDQ"), 1, "Allows PCLMULQDQ+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableMOVBE, W("EnableMOVBE"), 1, "Allows MOVBE+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnablePOPCNT, W("EnablePOPCNT"), 1, "Allows POPCNT+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableSSE, W("EnableSSE"), 1, "Allows SSE+ hardware intrinsics to be disabled")
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableSSE2, W("EnableSSE2"), 1, "Allows SSE2+ hardware intrinsics to be disabled")
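Like the neighbouring Enable* switches, this is an EXTERNAL_ config value, so MOVBE-based codegen can presumably be disabled the same way the other ISAs are, e.g. (assumed usage, not shown in this PR):

    DOTNET_EnableMOVBE=0    (or COMPlus_EnableMOVBE=0 on older runtimes)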
90 changes: 56 additions & 34 deletions src/coreclr/inc/corinfoinstructionset.h
@@ -58,23 +58,25 @@ enum CORINFO_InstructionSet
InstructionSet_Vector128=17,
InstructionSet_Vector256=18,
InstructionSet_AVXVNNI=19,
InstructionSet_X86Base_X64=20,
InstructionSet_SSE_X64=21,
InstructionSet_SSE2_X64=22,
InstructionSet_SSE3_X64=23,
InstructionSet_SSSE3_X64=24,
InstructionSet_SSE41_X64=25,
InstructionSet_SSE42_X64=26,
InstructionSet_AVX_X64=27,
InstructionSet_AVX2_X64=28,
InstructionSet_AES_X64=29,
InstructionSet_BMI1_X64=30,
InstructionSet_BMI2_X64=31,
InstructionSet_FMA_X64=32,
InstructionSet_LZCNT_X64=33,
InstructionSet_PCLMULQDQ_X64=34,
InstructionSet_POPCNT_X64=35,
InstructionSet_AVXVNNI_X64=36,
InstructionSet_MOVBE=20,
InstructionSet_X86Base_X64=21,
InstructionSet_SSE_X64=22,
InstructionSet_SSE2_X64=23,
InstructionSet_SSE3_X64=24,
InstructionSet_SSSE3_X64=25,
InstructionSet_SSE41_X64=26,
InstructionSet_SSE42_X64=27,
InstructionSet_AVX_X64=28,
InstructionSet_AVX2_X64=29,
InstructionSet_AES_X64=30,
InstructionSet_BMI1_X64=31,
InstructionSet_BMI2_X64=32,
InstructionSet_FMA_X64=33,
InstructionSet_LZCNT_X64=34,
InstructionSet_PCLMULQDQ_X64=35,
InstructionSet_POPCNT_X64=36,
InstructionSet_AVXVNNI_X64=37,
InstructionSet_MOVBE_X64=38,
#endif // TARGET_AMD64
#ifdef TARGET_X86
InstructionSet_X86Base=1,
@@ -96,23 +98,25 @@ enum CORINFO_InstructionSet
InstructionSet_Vector128=17,
InstructionSet_Vector256=18,
InstructionSet_AVXVNNI=19,
InstructionSet_X86Base_X64=20,
InstructionSet_SSE_X64=21,
InstructionSet_SSE2_X64=22,
InstructionSet_SSE3_X64=23,
InstructionSet_SSSE3_X64=24,
InstructionSet_SSE41_X64=25,
InstructionSet_SSE42_X64=26,
InstructionSet_AVX_X64=27,
InstructionSet_AVX2_X64=28,
InstructionSet_AES_X64=29,
InstructionSet_BMI1_X64=30,
InstructionSet_BMI2_X64=31,
InstructionSet_FMA_X64=32,
InstructionSet_LZCNT_X64=33,
InstructionSet_PCLMULQDQ_X64=34,
InstructionSet_POPCNT_X64=35,
InstructionSet_AVXVNNI_X64=36,
InstructionSet_MOVBE=20,
InstructionSet_X86Base_X64=21,
InstructionSet_SSE_X64=22,
InstructionSet_SSE2_X64=23,
InstructionSet_SSE3_X64=24,
InstructionSet_SSSE3_X64=25,
InstructionSet_SSE41_X64=26,
InstructionSet_SSE42_X64=27,
InstructionSet_AVX_X64=28,
InstructionSet_AVX2_X64=29,
InstructionSet_AES_X64=30,
InstructionSet_BMI1_X64=31,
InstructionSet_BMI2_X64=32,
InstructionSet_FMA_X64=33,
InstructionSet_LZCNT_X64=34,
InstructionSet_PCLMULQDQ_X64=35,
InstructionSet_POPCNT_X64=36,
InstructionSet_AVXVNNI_X64=37,
InstructionSet_MOVBE_X64=38,
#endif // TARGET_X86

};
@@ -212,6 +216,8 @@ struct CORINFO_InstructionSetFlags
AddInstructionSet(InstructionSet_POPCNT_X64);
if (HasInstructionSet(InstructionSet_AVXVNNI))
AddInstructionSet(InstructionSet_AVXVNNI_X64);
if (HasInstructionSet(InstructionSet_MOVBE))
AddInstructionSet(InstructionSet_MOVBE_X64);
#endif // TARGET_AMD64
#ifdef TARGET_X86
#endif // TARGET_X86
@@ -357,6 +363,10 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI_X64) && !resultflags.HasInstructionSet(InstructionSet_AVXVNNI))
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI_X64);
if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_MOVBE_X64))
resultflags.RemoveInstructionSet(InstructionSet_MOVBE);
if (resultflags.HasInstructionSet(InstructionSet_MOVBE_X64) && !resultflags.HasInstructionSet(InstructionSet_MOVBE))
resultflags.RemoveInstructionSet(InstructionSet_MOVBE_X64);
if (resultflags.HasInstructionSet(InstructionSet_SSE) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
resultflags.RemoveInstructionSet(InstructionSet_SSE);
if (resultflags.HasInstructionSet(InstructionSet_SSE2) && !resultflags.HasInstructionSet(InstructionSet_SSE))
@@ -393,6 +403,8 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_Vector256);
if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2))
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
resultflags.RemoveInstructionSet(InstructionSet_MOVBE);
#endif // TARGET_AMD64
#ifdef TARGET_X86
if (resultflags.HasInstructionSet(InstructionSet_SSE) && !resultflags.HasInstructionSet(InstructionSet_X86Base))
@@ -431,6 +443,8 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins
resultflags.RemoveInstructionSet(InstructionSet_Vector256);
if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2))
resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI);
if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_SSE42))
resultflags.RemoveInstructionSet(InstructionSet_MOVBE);
#endif // TARGET_X86

} while (!oldflags.Equals(resultflags));
@@ -563,6 +577,10 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "AVXVNNI";
case InstructionSet_AVXVNNI_X64 :
return "AVXVNNI_X64";
case InstructionSet_MOVBE :
return "MOVBE";
case InstructionSet_MOVBE_X64 :
return "MOVBE_X64";
#endif // TARGET_AMD64
#ifdef TARGET_X86
case InstructionSet_X86Base :
Expand Down Expand Up @@ -603,6 +621,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
return "Vector256";
case InstructionSet_AVXVNNI :
return "AVXVNNI";
case InstructionSet_MOVBE :
return "MOVBE";
#endif // TARGET_X86

default:
@@ -652,6 +672,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_Pclmulqdq: return InstructionSet_PCLMULQDQ;
case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_POPCNT;
case READYTORUN_INSTRUCTION_AvxVnni: return InstructionSet_AVXVNNI;
case READYTORUN_INSTRUCTION_Movbe: return InstructionSet_MOVBE;
#endif // TARGET_AMD64
#ifdef TARGET_X86
case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base;
@@ -671,6 +692,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
case READYTORUN_INSTRUCTION_Pclmulqdq: return InstructionSet_PCLMULQDQ;
case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_POPCNT;
case READYTORUN_INSTRUCTION_AvxVnni: return InstructionSet_AVXVNNI;
case READYTORUN_INSTRUCTION_Movbe: return InstructionSet_MOVBE;
#endif // TARGET_X86

default:
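To illustrate the dependency rules added above: MOVBE is treated as requiring SSE4.2, and the fixpoint loop strips any flag whose prerequisite is missing. A minimal sketch of the effect (hypothetical flag values, assuming this header is included):

    CORINFO_InstructionSetFlags flags;
    flags.AddInstructionSet(InstructionSet_MOVBE); // claim MOVBE without its SSE42 prerequisite

    CORINFO_InstructionSetFlags valid = EnsureInstructionSetFlagsAreValid(flags);

    // The new "MOVBE && !SSE42" rule removes MOVBE again during the fixpoint loop,
    // so an inconsistent combination never reaches the JIT.
    assert(!valid.HasInstructionSet(InstructionSet_MOVBE));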
10 changes: 5 additions & 5 deletions src/coreclr/inc/jiteeversionguid.h
@@ -43,11 +43,11 @@ typedef const GUID *LPCGUID;
#define GUID_DEFINED
#endif // !GUID_DEFINED

constexpr GUID JITEEVersionIdentifier = { /* b0719856-6fe6-407c-bf40-7a57e22b2382 */
0xb0719856,
0x6fe6,
0x407c,
{0xbf, 0x40, 0x7a, 0x57, 0xe2, 0x2b, 0x23, 0x82}
constexpr GUID JITEEVersionIdentifier = { /* dfda4767-9618-4d6a-8105-a86034dc52eb */
0xdfda4767,
0x9618,
0x4d6a,
{0x81, 0x05, 0xa8, 0x60, 0x34, 0xdc, 0x52, 0xeb}
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////
1 change: 1 addition & 0 deletions src/coreclr/inc/readytoruninstructionset.h
@@ -35,6 +35,7 @@ enum ReadyToRunInstructionSet
READYTORUN_INSTRUCTION_Rdm=24,
READYTORUN_INSTRUCTION_AvxVnni=25,
READYTORUN_INSTRUCTION_Rcpc=26,
READYTORUN_INSTRUCTION_Movbe=27,

};

2 changes: 1 addition & 1 deletion src/coreclr/jit/codegenlinear.cpp
@@ -1653,7 +1653,7 @@ void CodeGen::genConsumeRegs(GenTree* tree)
}
#endif // FEATURE_HW_INTRINSICS
#endif // TARGET_XARCH
else if (tree->OperIs(GT_BITCAST, GT_NEG, GT_CAST, GT_LSH))
else if (tree->OperIs(GT_BITCAST, GT_NEG, GT_CAST, GT_LSH, GT_BSWAP, GT_BSWAP16))
{
genConsumeRegs(tree->gtGetOp1());
}
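GT_BSWAP/GT_BSWAP16 join this list because a byte-swap node can now be contained under its user (e.g. a STOREIND), so codegen has to look through it and consume the swap's own source operand. Roughly (an illustrative tree shape, not taken from this diff):

    STOREIND(addr, BSWAP(value))   // BSWAP contained under the store:
                                   // genConsumeRegs walks into gtGetOp1() and
                                   // consumes "value" directly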
42 changes: 24 additions & 18 deletions src/coreclr/jit/codegenxarch.cpp
@@ -553,36 +553,39 @@ void CodeGen::genCodeForNegNot(GenTree* tree)
//
void CodeGen::genCodeForBswap(GenTree* tree)
{
// TODO: If we're swapping immediately after a read from memory or immediately before
// a write to memory, use the MOVBE instruction instead of the BSWAP instruction if
// the platform supports it.

assert(tree->OperIs(GT_BSWAP, GT_BSWAP16));

regNumber targetReg = tree->GetRegNum();
var_types targetType = tree->TypeGet();

GenTree* operand = tree->gtGetOp1();
assert(operand->isUsedFromReg());
regNumber operandReg = genConsumeReg(operand);

inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true);
genConsumeRegs(operand);

if (tree->OperIs(GT_BSWAP))
if (operand->isUsedFromReg())
{
// 32-bit and 64-bit byte swaps use "bswap reg"
inst_RV(INS_bswap, targetReg, targetType);
}
else
{
// 16-bit byte swaps use "ror reg.16, 8"
inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE);
inst_Mov(targetType, targetReg, operand->GetRegNum(), /* canSkip */ true);

if (!genCanOmitNormalizationForBswap16(tree))
if (tree->OperIs(GT_BSWAP))
{
// 32-bit and 64-bit byte swaps use "bswap reg"
inst_RV(INS_bswap, targetReg, targetType);
}
else
{
GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false);
// 16-bit byte swaps use "ror reg.16, 8"
inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE);

if (!genCanOmitNormalizationForBswap16(tree))
{
GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false);
}
}
}
else
{
GetEmitter()->emitInsBinary(INS_movbe, emitTypeSize(operand), tree, operand);
}

genProduceReg(tree);
}
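With the restructured genCodeForBswap, a swap whose operand is a contained memory load collapses into a single instruction. An illustrative before/after (registers and addressing are assumed, not taken from this PR):

    ; GT_BSWAP over a contained indirection
    ; before:
    mov   eax, dword ptr [rcx]
    bswap eax
    ; after:
    movbe eax, dword ptr [rcx]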
@@ -5142,7 +5145,10 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree)
}
else
{
GetEmitter()->emitInsStoreInd(ins_Store(data->TypeGet()), emitTypeSize(tree), tree);
GetEmitter()->emitInsStoreInd(data->OperIs(GT_BSWAP, GT_BSWAP16) && data->isContained()
? INS_movbe
: ins_Store(data->TypeGet()),
emitTypeSize(tree), tree);
}
}
}
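The store side is the mirror image: when the data operand of a STOREIND is a contained bswap, emitInsStoreInd is asked to emit movbe instead of the plain mov, so the swap happens as part of the store. Illustrative codegen (assumed registers):

    ; STOREIND(addr, BSWAP(value))
    ; before:
    bswap eax
    mov   dword ptr [rcx], eax
    ; after:
    movbe dword ptr [rcx], eax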
42 changes: 42 additions & 0 deletions src/coreclr/jit/emitxarch.cpp
@@ -3339,6 +3339,13 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m
GenTree* addr = mem->Addr();
GenTree* data = mem->Data();

if (data->OperIs(GT_BSWAP, GT_BSWAP16) && data->isContained())
{
assert(ins == INS_movbe);

data = data->gtGetOp1();
}

if (addr->OperGet() == GT_CLS_VAR_ADDR)
{
if (data->isContainedIntOrIImmed())
@@ -10422,6 +10429,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// Is this a 'big' opcode?
else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
{
assert(ins == INS_movbe);

dst += emitOutputByte(dst, 0x66);
}

// Output the REX prefix
dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);

@@ -11206,6 +11220,13 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// Is this a 'big' opcode?
else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
{
assert(ins == INS_movbe);

dst += emitOutputByte(dst, 0x66);
}

// Output the REX prefix
dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);

@@ -11676,6 +11697,13 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc)
// Is this a 'big' opcode?
else if (code & 0xFF000000)
{
if (size == EA_2BYTE)
{
assert(ins == INS_movbe);

dst += emitOutputByte(dst, 0x66);
}

// Output the REX prefix
dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code);
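The same tweak appears in all three addressing-mode emitters (emitOutputAM, emitOutputSV, emitOutputCV): the 16-bit movbe forms take the 0x66 operand-size prefix, and for these 'big' 0F 38 opcodes that legacy prefix must be written before any REX prefix. Illustrative encodings (bytes per the Intel SDM, not part of this diff):

    movbe word ptr [rcx], ax       ; 66 0F 38 F1 01
    movbe r9w, word ptr [rcx]      ; 66 44 0F 38 F0 09   (0x66 precedes REX.R)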

@@ -16282,6 +16310,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
break;
}

case INS_movbe:
if (memAccessKind == PERFSCORE_MEMORY_READ)
{
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C;
}
else
{
assert(memAccessKind == PERFSCORE_MEMORY_WRITE);
result.insThroughput = PERFSCORE_THROUGHPUT_2X;
result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C;
}
break;

default:
// unhandled instruction insFmt combination
perfScoreUnhandledInstruction(id, &result);
4 changes: 4 additions & 0 deletions src/coreclr/jit/instrsxarch.h
@@ -167,6 +167,7 @@ INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868,
#define SSEDBL(c) PACK3(0xf2, 0x0f, c)
#define PCKDBL(c) PACK3(0x66, 0x0f, c)
#define PCKFLT(c) PACK2(0x0f,c)
#define PCKMVB(c) PACK3(0x0F, 0x38, c)

// These macros encode extra byte that is implicit in the macro.
#define PACK4(byte1,byte2,byte3,byte4) (((byte1) << 16) | ((byte2) << 24) | (byte3) | ((byte4) << 8))
@@ -618,6 +619,9 @@ INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE,
// LZCNT
INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF )

// MOVBE
INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_FLAGS_None )

// POPCNT
INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF )

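The new PCKMVB macro packs the 0F 38 escape bytes in front of the primary opcode byte, in the same style as the other PACK3-based macros above it, and the INST3 entry supplies both directions of the instruction (following what appears to be the usual mr/rm column layout): 0F 38 F1 is the register-to-memory (store) form and 0F 38 F0 the memory-to-register (load) form. For example (illustrative ModRM bytes):

    movbe eax, dword ptr [rcx]     ; 0F 38 F0 01   (load,  PCKMVB(0xF0))
    movbe dword ptr [rcx], eax     ; 0F 38 F1 01   (store, PCKMVB(0xF1))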
7 changes: 7 additions & 0 deletions src/coreclr/jit/lower.cpp
@@ -418,6 +418,13 @@ GenTree* Lowering::LowerNode(GenTree* node)
}
break;

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
case GT_BSWAP:
case GT_BSWAP16:
LowerBswapOp(node->AsOp());
break;
#endif // FEATURE_HW_INTRINSICS && TARGET_XARCH

default:
break;
}
1 change: 1 addition & 0 deletions src/coreclr/jit/lower.h
@@ -350,6 +350,7 @@ class Lowering final : public Phase
GenTree* TryLowerAndOpToResetLowestSetBit(GenTreeOp* andNode);
GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode);
GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode);
void LowerBswapOp(GenTreeOp* node);
#elif defined(TARGET_ARM64)
bool IsValidConstForMovImm(GenTreeHWIntrinsic* node);
void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node);
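The body of LowerBswapOp is not part of this portion of the diff (it lives in the xarch lowering code); a minimal sketch of what such a hook has to do, assuming the usual containment helpers, might look like this (hypothetical code, not the PR's actual implementation):

    // Hypothetical sketch only.
    void Lowering::LowerBswapOp(GenTreeOp* node)
    {
        assert(node->OperIs(GT_BSWAP, GT_BSWAP16));

        if (!comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE))
        {
            return; // no movbe available: keep the bswap/ror expansion
        }

        GenTree* op1 = node->gtGetOp1();
        if (IsContainableMemoryOp(op1) && IsSafeToContainMem(node, op1))
        {
            MakeSrcContained(node, op1); // fold the load into a movbe
        }
        // A bswap feeding a STOREIND is contained when the store itself is
        // handled, which is what the genCodeForStoreInd change consumes.
    }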