Enable AVX-512 for string/span Equals/StartsWith #84885

Merged 4 commits on Apr 17, 2023

Changes from all commits
141 changes: 41 additions & 100 deletions src/coreclr/jit/importervectorization.cpp
@@ -6,8 +6,8 @@
#pragma hdrstop
#endif

// For now the max possible size is Vector256<ushort>.Count * 2
#define MaxPossibleUnrollSize 32
// For now the max possible size is Vector512<ushort>.Count * 2
#define MaxPossibleUnrollSize 64
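
A note not in the diff itself, just spelling out where the new cap comes from: a Vector512<ushort> holds 512 / 16 = 32 chars, and the expansion uses at most two (possibly overlapping) vectors, so 64 chars is the ceiling. A throwaway compile-time check makes the arithmetic explicit:

```cpp
// Editor's illustration (not part of the change): the cap is two Vector512<ushort> worth of chars.
static_assert((512 / (8 * sizeof(char16_t))) * 2 == 64, "MaxPossibleUnrollSize = 2 * Vector512<ushort>.Count");
```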

//------------------------------------------------------------------------
// importer_vectorization.cpp
@@ -71,7 +71,7 @@ static bool ConvertToLowerCase(WCHAR* input, WCHAR* mask, int length)
#if defined(FEATURE_HW_INTRINSICS)
//------------------------------------------------------------------------
// impExpandHalfConstEqualsSIMD: Attempts to unroll and vectorize
// Equals against a constant WCHAR data for Length in [8..32] range
// Equals against a constant WCHAR data for Length in [8..64] range
// using SIMD instructions. C# equivalent of what this function emits:
//
// bool IsTestString(ReadOnlySpan<char> span)
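
The rest of this comment (collapsed in the diff view) shows the C# equivalent the JIT aims for. As a rough, editor-added illustration of the emitted shape, here is a hand-written C++/SSE sketch for a hypothetical 12-char constant: the helper name and the constant string are made up, and _mm_testz_si128 needs SSE4.1 (the JIT picks vptest/AVX/AVX-512 forms as available).

```cpp
#include <immintrin.h>
#include <cstddef>

// Hedged sketch of the expansion shape for a 12-char constant: byteLen = 24 and simdSize = 16,
// so the first and last 16 bytes are loaded (overlapping by 8 bytes), each XORed with the
// matching constant chunk, the differences ORed together and tested against zero.
static bool IsTestString12(const char16_t* span, size_t length)
{
    static const char16_t cns[] = u"TestString12";
    if (length != 12)
        return false;

    __m128i v1   = _mm_loadu_si128((const __m128i*)span);                          // chars [0..7]
    __m128i v2   = _mm_loadu_si128((const __m128i*)((const char*)span + 8));       // chars [4..11]
    __m128i c1   = _mm_loadu_si128((const __m128i*)cns);
    __m128i c2   = _mm_loadu_si128((const __m128i*)((const char*)cns + 8));
    __m128i diff = _mm_or_si128(_mm_xor_si128(v1, c1), _mm_xor_si128(v2, c2));
    return _mm_testz_si128(diff, diff) != 0;                                       // all-zero diff => equal
}
```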
@@ -108,121 +108,44 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
{
assert(len >= 8 && len <= MaxPossibleUnrollSize);

if (!IsBaselineSimdIsaSupported())
const int byteLen = len * sizeof(WCHAR);
const int simdSize = (int)roundDownSIMDSize(byteLen);
if (byteLen > (simdSize * 2))
{
// We need baseline SIMD support at least
// Data is too big to be processed via two SIMD loads
// or baseline has no SIMD support
return nullptr;
}
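
A hedged, editor-added model of the new size check; roundDownSIMDSize is assumed here to mean "the widest available vector size that fits in byteLen", which is not necessarily its exact implementation:

```cpp
// Model of the bail-out above: the expansion only proceeds when two (possibly overlapping)
// loads of simdSize bytes can cover the whole constant.
static int PickSimdSize(int byteLen, int widestAvailable /* 16, 32 or 64; 0 if no SIMD */)
{
    for (int size = widestAvailable; size >= 16; size /= 2)
    {
        if (byteLen >= size)
            return size;
    }
    return 0; // baseline SIMD not available
}

static bool CanUnroll(int len /* chars */, int widestAvailable)
{
    int byteLen  = len * 2; // sizeof(WCHAR)
    int simdSize = PickSimdSize(byteLen, widestAvailable);
    return (simdSize != 0) && (byteLen <= simdSize * 2);
}
```

For example, 40 chars (80 bytes) unrolls with AVX-512 (two 64-byte loads) but bails out when only 32-byte vectors are available.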

CorInfoType baseType = CORINFO_TYPE_NATIVEUINT;

int simdSize;
var_types simdType;

NamedIntrinsic niEquals;

GenTreeVecCon* cnsVec1 = nullptr;
GenTreeVecCon* cnsVec2 = nullptr;
GenTree* toLowerVec1 = nullptr;
GenTree* toLowerVec2 = nullptr;

// Optimization: don't use two vectors for Length == 8 or 16
bool useSingleVector = false;
assert((byteLen >= simdSize) && (simdSize >= 16));

WCHAR cnsValue[MaxPossibleUnrollSize] = {};
WCHAR toLowerMask[MaxPossibleUnrollSize] = {};

memcpy((UINT8*)cnsValue, (UINT8*)cns, len * sizeof(WCHAR));
memcpy(cnsValue, cns, byteLen);

if ((cmpMode == OrdinalIgnoreCase) && !ConvertToLowerCase(cnsValue, toLowerMask, len))
{
// value contains non-ASCII chars, we can't proceed further
return nullptr;
}
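
For context, a conceptual, editor-added model of what ConvertToLowerCase is expected to do with the constant and the mask; the real helper may differ in details, but the OR-0x20 trick it enables is the one applied further down:

```cpp
// Build a per-char mask so that (data | mask) can be compared against the lowercased constant:
// ASCII letters get 0x20 (tolerating either case), everything else gets 0 (exact match).
// Any non-ASCII char in the constant makes the trick unsafe, so the caller gives up.
static bool BuildAsciiToLowerMask(char16_t* cns, char16_t* mask, int length)
{
    for (int i = 0; i < length; i++)
    {
        char16_t c = cns[i];
        if (c > 0x7F)
            return false;                                    // non-ASCII constant: bail out
        bool isLetter = ((c | 0x20) >= u'a') && ((c | 0x20) <= u'z');
        mask[i] = isLetter ? (char16_t)0x20 : (char16_t)0;
        cns[i]  = (char16_t)(c | mask[i]);                   // keep the constant itself lowercased
    }
    return true;
}
```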

#if defined(TARGET_XARCH)
if (compOpportunisticallyDependsOn(InstructionSet_Vector256) && len >= 16)
{
// Handle [16..32] inputs via two Vector256
assert(len >= 16 && len <= 32);

simdSize = 32;
simdType = TYP_SIMD32;

niEquals = NI_Vector256_op_Equality;

// Special case: use a single vector for Length == 16
useSingleVector = len == 16;

cnsVec1 = gtNewVconNode(simdType, cnsValue);
cnsVec2 = gtNewVconNode(simdType, cnsValue + len - 16);

if (cmpMode == OrdinalIgnoreCase)
{
toLowerVec1 = gtNewVconNode(simdType, toLowerMask);
toLowerVec2 = gtNewVconNode(simdType, toLowerMask + len - 16);
}
}
else
#endif // TARGET_XARCH
if (len <= 16)
{
// Handle [8..16] inputs via two Vector128
assert(len >= 8 && len <= 16);

simdSize = 16;
simdType = TYP_SIMD16;
const var_types simdType = getSIMDTypeForSize(simdSize);
const CorInfoType baseType = CORINFO_TYPE_NATIVEUINT;

niEquals = NI_Vector128_op_Equality;
GenTreeVecCon* cnsVec1 = gtNewVconNode(simdType, cnsValue);
GenTreeVecCon* cnsVec2 = gtNewVconNode(simdType, (BYTE*)cnsValue + byteLen - simdSize);

// Special case: use a single vector for Length == 8
useSingleVector = len == 8;

cnsVec1 = gtNewVconNode(simdType, cnsValue);
cnsVec2 = gtNewVconNode(simdType, cnsValue + len - 8);

if (cmpMode == OrdinalIgnoreCase)
{
toLowerVec1 = gtNewVconNode(simdType, toLowerMask);
toLowerVec2 = gtNewVconNode(simdType, toLowerMask + len - 8);
}
}
else
{
JITDUMP("impExpandHalfConstEqualsSIMD: No V256 support and data is too big for V128\n");
// NOTE: We might consider using four V128 for ARM64
return nullptr;
}

GenTree* zero = gtNewZeroConNode(simdType);

GenTree* offset1 = gtNewIconNode(dataOffset, TYP_I_IMPL);
GenTree* offset2 = gtNewIconNode(dataOffset + len * sizeof(USHORT) - simdSize, TYP_I_IMPL);
GenTree* dataPtr1 = gtNewOperNode(GT_ADD, TYP_BYREF, data, offset1);
GenTree* dataPtr2 = gtNewOperNode(GT_ADD, TYP_BYREF, gtClone(data), offset2);

GenTree* vec1 = gtNewIndir(simdType, dataPtr1);
GenTree* vec2 = gtNewIndir(simdType, dataPtr2);

// TODO-Unroll-CQ: Spill vec1 and vec2 for better pipelining, currently we end up emitting:
//
// vmovdqu xmm0, xmmword ptr [rcx+12]
// vpxor xmm0, xmm0, xmmword ptr[reloc @RWD00]
// vmovdqu xmm1, xmmword ptr [rcx+20]
// vpxor xmm1, xmm1, xmmword ptr[reloc @RWD16]
//
// While we should re-order them to be:
//
// vmovdqu xmm0, xmmword ptr [rcx+12]
// vmovdqu xmm1, xmmword ptr [rcx+20]
// vpxor xmm0, xmm0, xmmword ptr[reloc @RWD00]
// vpxor xmm1, xmm1, xmmword ptr[reloc @RWD16]
//
GenTree* offset1 = gtNewIconNode(dataOffset, TYP_I_IMPL);
GenTree* offset2 = gtNewIconNode(dataOffset + byteLen - simdSize, TYP_I_IMPL);
GenTree* vec1 = gtNewIndir(simdType, gtNewOperNode(GT_ADD, TYP_BYREF, data, offset1));
GenTree* vec2 = gtNewIndir(simdType, gtNewOperNode(GT_ADD, TYP_BYREF, gtClone(data), offset2));

if (cmpMode == OrdinalIgnoreCase)
{
// Apply ASCII-only ToLowerCase mask (bitwise OR 0x20 for all a-Z chars)
assert((toLowerVec1 != nullptr) && (toLowerVec2 != nullptr));
GenTreeVecCon* toLowerVec1 = gtNewVconNode(simdType, toLowerMask);
GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize);

vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize);
vec2 = gtNewSimdBinOpNode(GT_OR, simdType, vec2, toLowerVec2, baseType, simdSize);
}
@@ -231,7 +154,25 @@
GenTree* xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize);
GenTree* xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize);
GenTree* orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize);
return gtNewSimdHWIntrinsicNode(TYP_BOOL, useSingleVector ? xor1 : orr, zero, niEquals, baseType, simdSize);

// Optimization: use a single load when byteLen equals simdSize.
// For code simplicity we always create nodes for two vectors case.
const bool useSingleVector = simdSize == byteLen;
return gtNewSimdCmpOpAllNode(GT_EQ, TYP_BOOL, useSingleVector ? xor1 : orr, gtNewZeroConNode(simdType), baseType,
simdSize);

// Codegen example for byteLen=40 and OrdinalIgnoreCase mode with AVX:
//
// vmovups ymm0, ymmword ptr [rcx+0CH]
// vpor ymm0, ymm0, ymmword ptr [reloc @RWD00]
// vpxor ymm0, ymm0, ymmword ptr [reloc @RWD32]
// vmovups ymm1, ymmword ptr [rcx+28H]
// vpor ymm1, ymm1, ymmword ptr [reloc @RWD64]
// vpxor ymm1, ymm1, ymmword ptr [reloc @RWD96]
// vpor ymm0, ymm0, ymm1
// vptest ymm0, ymm0
// sete al
// movzx rax, al
}
#endif // defined(FEATURE_HW_INTRINSICS)

@@ -491,7 +432,7 @@ GenTree* Compiler::impExpandHalfConstEquals(GenTreeLclVar* data,
indirCmp = impExpandHalfConstEqualsSWAR(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode);
}
#if defined(FEATURE_HW_INTRINSICS)
else if (len <= 32)
else if (IsBaselineSimdIsaSupported())
{
indirCmp = impExpandHalfConstEqualsSIMD(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode);
}
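
For completeness, a hedged, editor-added sketch of the SWAR idea behind impExpandHalfConstEqualsSWAR, which covers the short lengths that never reach the SIMD path. It is a model of the technique, not the JIT's actual emission: the same XOR/OR/compare-to-zero shape as the SIMD expansion, but with overlapping general-purpose loads, shown here for a hypothetical 6-char constant.

```cpp
#include <cstdint>
#include <cstring>

// Two overlapping 8-byte loads cover all 12 bytes of a 6-char comparison;
// a single "is the combined difference zero?" test replaces per-chunk branches.
static bool Equals6Chars(const char16_t* data, const char16_t* cns)
{
    uint64_t d1, d2, c1, c2;
    memcpy(&d1, data, 8);                    // chars [0..3]
    memcpy(&d2, (const char*)data + 4, 8);   // chars [2..5], overlapping the first load
    memcpy(&c1, cns, 8);
    memcpy(&c2, (const char*)cns + 4, 8);
    return ((d1 ^ c1) | (d2 ^ c2)) == 0;
}
```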