Skip to content

Commit

Permalink
Merge pull request #2023 from lioncash/vpopcnt
Browse files Browse the repository at this point in the history
IR: Handle 256-bit VPopcount
  • Loading branch information
Sonicadvance1 authored Sep 27, 2022
2 parents 3e287a3 + 59aa324 commit 4e441e5
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 35 deletions.
13 changes: 8 additions & 5 deletions External/FEXCore/Source/Interface/Core/Interpreter/VectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -434,21 +434,24 @@ DEF_OP(VAbs) {
}

// Interpreter handler for the VPopcount IR op: builds the per-element
// population count of the source vector in a local buffer and copies OpSize
// bytes of it to GDP.
//
// NOTE(review): this listing is a flattened diff (the hunk header reports
// 8 additions & 5 deletions). Removed and added lines appear back to back with
// no +/- markers -- e.g. the two `Op` declarations, the two `Tmp` declarations
// and the two `switch` headers -- so the text is not compilable as shown. In
// each such pair the first line appears to be the removed one.
DEF_OP(VPopcount) {
auto Op = IROp->C<IR::IROp_VPopcount>();
const auto Op = IROp->C<IR::IROp_VPopcount>();
const uint8_t OpSize = IROp->Size;

void *Src = GetSrc<void*>(Data->SSAData, Op->Vector);
// Scratch buffer for the result. The replacement line sizes it from
// Core::CPUState::XMM_AVX_REG_SIZE instead of the literal 16; presumably that
// constant is 32 so a 256-bit result fits -- TODO confirm against CPUState.
uint8_t Tmp[16];
uint8_t Tmp[Core::CPUState::XMM_AVX_REG_SIZE];

// Number of elements processed = operation size / element size (both in bytes).
const uint8_t Elements = OpSize / Op->Header.ElementSize;
const uint8_t ElementSize = Op->Header.ElementSize;
const uint8_t Elements = OpSize / ElementSize;

// Generic lambda so the same callable works for every unsigned element type
// passed to the macro below.
const auto Func = [](auto a) { return std::popcount(a); };
switch (Op->Header.ElementSize) {
switch (ElementSize) {
// NOTE(review): DO_VECTOR_1SRC_OP is defined outside this view; presumably each
// use expands to a `case <size>:` that applies Func to every element of Src
// (typed as the given integer type) and stores into Tmp -- verify in the macro.
DO_VECTOR_1SRC_OP(1, uint8_t, Func)
DO_VECTOR_1SRC_OP(2, uint16_t, Func)
DO_VECTOR_1SRC_OP(4, uint32_t, Func)
DO_VECTOR_1SRC_OP(8, uint64_t, Func)
default: LOGMAN_MSG_A_FMT("Unknown Element Size: {}", Op->Header.ElementSize); break;
default:
LOGMAN_MSG_A_FMT("Unknown Element Size: {}", ElementSize);
break;
}
// Publish only the OpSize bytes that belong to this operation.
memcpy(GDP, Tmp, OpSize);
}
Expand Down
67 changes: 50 additions & 17 deletions External/FEXCore/Source/Interface/Core/JIT/Arm64/VectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -751,25 +751,58 @@ DEF_OP(VAbs) {
}

// Arm64 JIT emitter for the VPopcount IR op.
//
// In the added lines the emitter has two paths:
//  * HostSupportsSVE and the op is not 8 bytes wide: a predicated `cnt` on the
//    Z-register views, for element sizes 1, 2, 4 and 8.
//  * otherwise: `cnt` on the V8B view (8-byte op) or the V16B view, which only
//    handles element size 1; any other size hits the "Unknown Element Size"
//    message.
//
// NOTE(review): this listing is a flattened diff (the hunk header reports
// 50 additions & 17 deletions). The removed implementation -- from `auto Op`
// down to the second `switch (Op->Header.ElementSize) {` -- is immediately
// followed by the added one with no +/- markers, and a few removed lines are
// interleaved further down, so the text is not compilable as shown.
DEF_OP(VPopcount) {
auto Op = IROp->C<IR::IROp_VPopcount>();
const uint8_t OpSize = IROp->Size;
if (OpSize == 8) {
// Scalar
switch (Op->Header.ElementSize) {
case 1: {
cnt(GetDst(Node).V8B(), GetSrc(Op->Vector.ID()).V8B());
break;
}
default: LOGMAN_MSG_A_FMT("Unknown Element Size: {}", Op->Header.ElementSize); break;
}
}
else {
// Vector
switch (Op->Header.ElementSize) {
const auto Op = IROp->C<IR::IROp_VPopcount>();
const auto OpSize = IROp->Size;
// An 8-byte operation is treated as the "scalar" case and never takes the SVE path.
const bool IsScalar = OpSize == 8;

const auto ElementSize = Op->Header.ElementSize;

// Destination and source registers are looked up once and reused by every path.
const auto Dst = GetDst(Node);
const auto Src = GetSrc(Op->Vector.ID());

if (HostSupportsSVE && !IsScalar) {
// Select the predicate by operation width: PRED_TMP_16B for 16-byte ops,
// PRED_TMP_32B for everything else that reaches here.
// NOTE(review): with .Merging(), lanes outside the predicate presumably keep
// whatever Dst held before -- confirm that callers do not rely on them being
// zeroed.
const auto Pred = OpSize == 16 ? PRED_TMP_16B.Merging()
: PRED_TMP_32B.Merging();

switch (ElementSize) {
case 1:
cnt(GetDst(Node).V16B(), GetSrc(Op->Vector.ID()).V16B());
cnt(Dst.Z().VnB(), Pred, Src.Z().VnB());
break;
default: LOGMAN_MSG_A_FMT("Unknown Element Size: {}", Op->Header.ElementSize); break;
case 2:
cnt(Dst.Z().VnH(), Pred, Src.Z().VnH());
break;
case 4:
cnt(Dst.Z().VnS(), Pred, Src.Z().VnS());
break;
case 8:
cnt(Dst.Z().VnD(), Pred, Src.Z().VnD());
break;
default:
LOGMAN_MSG_A_FMT("Unknown Element Size: {}", ElementSize);
break;
}
} else {
// Non-SVE path: only byte-sized elements are handled below.
if (IsScalar) {
// Scalar
switch (ElementSize) {
case 1: {
cnt(Dst.V8B(), Src.V8B());
break;
}
default:
LOGMAN_MSG_A_FMT("Unknown Element Size: {}", ElementSize);
break;
}
} else {
// Vector
switch (ElementSize) {
case 1:
cnt(Dst.V16B(), Src.V16B());
break;
default:
LOGMAN_MSG_A_FMT("Unknown Element Size: {}", ElementSize);
break;
}
}
}
}
Expand Down
49 changes: 36 additions & 13 deletions External/FEXCore/Source/Interface/Core/JIT/x86_64/VectorOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -517,30 +517,53 @@ DEF_OP(VAbs) {
}
}

// This only supports 8bit popcount on 8byte to 32byte registers
// x86-64 JIT emitter for the VPopcount IR op. Only element size 1 is handled:
// each byte is extracted into eax, counted with the scalar `popcnt`, and
// inserted into the result register. A 32-byte operation is processed as two
// 128-bit halves.
//
// NOTE(review): this listing is a flattened diff (the hunk header reports
// 36 additions & 13 deletions). Removed and added lines appear back to back
// with no +/- markers -- e.g. the two `Op`, `Src`/`Dest` and `Elements`
// declarations, both `switch` headers, the old inline loop body ahead of the
// new lambda, and the trailing `movaps(Dest, xmm15);` -- so the text is not
// compilable as shown.
DEF_OP(VPopcount) {
auto Op = IROp->C<IR::IROp_VPopcount>();
const auto Op = IROp->C<IR::IROp_VPopcount>();
const uint8_t OpSize = IROp->Size;
// This only supports 8bit popcount on 8byte to 16byte registers
const bool Is256Bit = OpSize == 32;

const auto Src = GetSrc(Op->Vector.ID());
const auto Dest = GetDst(Node);

const uint8_t ElementSize = Op->Header.ElementSize;
const uint8_t Elements = OpSize / ElementSize;

auto Src = GetSrc(Op->Vector.ID());
auto Dest = GetDst(Node);
// Clear xmm15 before it is used as the accumulation register, so bytes that
// are never inserted read as zero.
// NOTE(review): assumes xmm15 (and xmm14 below) are reserved as JIT scratch
// registers and never hold live guest values -- TODO confirm.
vpxor(xmm15, xmm15, xmm15);
const uint8_t Elements = OpSize / Op->Header.ElementSize;

// This is disgustingly bad on x86-64 but we only need it for compatibility
switch (Op->Header.ElementSize) {
// NOTE: If, in the distant future, we ever use AVX-512, consider
// using vpopcnt{b, d, q, w} to shorten all of this down to one
// instruction.
switch (ElementSize) {
case 1: {
for (size_t i = 0; i < Elements; ++i) {
pextrb(eax, Src, i);
popcnt(eax, eax);
pinsrb(xmm15, eax, i);
// For a 256-bit op each helper call covers one 128-bit half, so the per-call
// element count is halved.
const uint8_t NumElements = Is256Bit ? Elements / 2
: Elements;

// Emits, for every byte index below NumElements: extract byte i of `src` into
// eax, popcnt it, and insert the count at byte i of `dst`. `src` and `dst`
// may be the same register, because byte i is read before it is overwritten.
const auto VectorPopcount = [this, NumElements](const Xbyak::Xmm& src, const Xbyak::Xmm& dst) {
for (size_t i = 0; i < NumElements; ++i) {
pextrb(eax, src, i);
popcnt(eax, eax);
pinsrb(dst, eax, i);
}
};

// Bottom 128 bits
VectorPopcount(Src, xmm15);
movaps(Dest, xmm15);

// Now do the top 128 bits, if necessary
if (Is256Bit) {
// Pull lane 1 of Src into xmm14, count it in place, then write it back as
// lane 1 of Dest.
vextracti128(xmm14, ToYMM(Src), 1);
VectorPopcount(xmm14, xmm14);
vinserti128(ToYMM(Dest), ToYMM(Dest), xmm14, 1);
}
break;
}
default: LOGMAN_MSG_A_FMT("Unknown Element Size: {}", Op->Header.ElementSize); break;
default:
LOGMAN_MSG_A_FMT("Unknown Element Size: {}", ElementSize);
break;
}

movaps(Dest, xmm15);
}

DEF_OP(VFAdd) {
Expand Down

0 comments on commit 4e441e5

Please sign in to comment.