Skip to content

Commit

Permalink
Enable AVX-512 for block unrollings (both copying and zeroing) (#85389)
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorBo authored Apr 27, 2023
1 parent 92ba066 commit 953d290
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 17 deletions.
34 changes: 24 additions & 10 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3172,9 +3172,12 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
{
regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);

unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
? YMM_REGSIZE_BYTES
: XMM_REGSIZE_BYTES;
unsigned regSize = compiler->roundDownSIMDSize(size);
if (size < ZMM_RECOMMENDED_THRESHOLD)
{
// Involve ZMM only for large data due to possible downclocking.
regSize = min(regSize, YMM_REGSIZE_BYTES);
}

bool zeroing = false;
if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
Expand All @@ -3186,6 +3189,9 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
}
else
{
// TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk.
regSize = min(regSize, YMM_REGSIZE_BYTES);

emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
#ifdef TARGET_X86
Expand Down Expand Up @@ -3232,6 +3238,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
dstOffset += regSize;
bytesWritten += regSize;

if (!zeroing)
{
assert(regSize <= YMM_REGSIZE_BYTES);
}

if (!zeroing && regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES)
{
regSize = XMM_REGSIZE_BYTES;
Expand All @@ -3250,8 +3261,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
}
else
{
// if remainder is <=16 then switch to XMM
regSize = size <= XMM_REGSIZE_BYTES ? XMM_REGSIZE_BYTES : regSize;
// Get optimal register size to cover the whole remainder (with overlapping)
regSize = compiler->roundUpSIMDSize(size);

// Rewind dstOffset so we can fit a vector for the whole remainder
dstOffset -= (regSize - size);
Expand Down Expand Up @@ -3469,9 +3480,12 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
instruction simdMov = simdUnalignedMovIns();

// Get the largest SIMD register available if the size is large enough
unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
? YMM_REGSIZE_BYTES
: XMM_REGSIZE_BYTES;
unsigned regSize = compiler->roundDownSIMDSize(size);
if (size < ZMM_RECOMMENDED_THRESHOLD)
{
// Involve ZMM only for large data due to possible downclocking.
regSize = min(regSize, YMM_REGSIZE_BYTES);
}

auto emitSimdMovs = [&]() {
if (srcLclNum != BAD_VAR_NUM)
Expand Down Expand Up @@ -3516,8 +3530,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
}
else
{
// if remainder is <=16 then switch to XMM
regSize = size <= XMM_REGSIZE_BYTES ? XMM_REGSIZE_BYTES : regSize;
// Get optimal register size to cover the whole remainder (with overlapping)
regSize = compiler->roundUpSIMDSize(size);

// Rewind the offsets so we can fit a vector for the whole remainder
srcOffset -= (regSize - size);
Expand Down
8 changes: 1 addition & 7 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -8896,12 +8896,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
{
maxRegSize = maxSIMDStructBytes();
#if defined(TARGET_XARCH)
if (type != UnrollKind::Memmove)
{
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial.
// Enabled for Memmove only for now.
maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
}
threshold = maxRegSize;
#elif defined(TARGET_ARM64)
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
Expand Down Expand Up @@ -8933,7 +8927,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
//
// | arch | memset | memcpy |
// |-------------|--------|--------|
// | x86 avx512 | 512 | 256 | (TODO-XARCH-AVX512: ignored for now)
// | x86 avx512 | 512 | 256 |
// | x86 avx | 256 | 128 |
// | x86 sse | 128 | 64 |
// | arm64 | 256 | 128 | ldp/stp (2x128bit)
Expand Down

0 comments on commit 953d290

Please sign in to comment.