Merge pull request #2250 from Sonicadvance1/optimize_spilling_filling
Arm64: Optimizing spilling and filling
lioncash authored Dec 16, 2022
2 parents 65e8bf9 + 1beb791 commit 9a8852f
Showing 6 changed files with 121 additions and 95 deletions.
161 changes: 93 additions & 68 deletions External/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
@@ -202,7 +202,6 @@ void Arm64Emitter::PopCalleeSavedRegisters() {
}
}


void Arm64Emitter::SpillStaticRegs(bool FPRs, uint32_t GPRSpillMask, uint32_t FPRSpillMask) {
if (StaticRegisterAllocation()) {
for (size_t i = 0; i < SRA64.size(); i+=2) {
@@ -231,19 +230,34 @@ void Arm64Emitter::SpillStaticRegs(bool FPRs, uint32_t GPRSpillMask, uint32_t FP
}
}
} else {
for (size_t i = 0; i < SRAFPR.size(); i += 2) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];

if (((1U << Reg1.GetCode()) & FPRSpillMask) &&
((1U << Reg2.GetCode()) & FPRSpillMask)) {
stp(Reg1.Q(), Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg1.GetCode()) & FPRSpillMask)) {
str(Reg1.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
if (GPRSpillMask && FPRSpillMask == ~0U) {
// Optimize the common case where we can spill four registers per instruction
auto TmpReg = SRA64[__builtin_ffs(GPRSpillMask)];
// Load the sse offset in to the temporary register
add(TmpReg, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[0][0]));
for (size_t i = 0; i < SRAFPR.size(); i += 4) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];
const auto Reg3 = SRAFPR[i + 2];
const auto Reg4 = SRAFPR[i + 3];
st1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
}
else if (((1U << Reg2.GetCode()) & FPRSpillMask)) {
str(Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i+1][0])));
}
else {
for (size_t i = 0; i < SRAFPR.size(); i += 2) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];

if (((1U << Reg1.GetCode()) & FPRSpillMask) &&
((1U << Reg2.GetCode()) & FPRSpillMask)) {
stp(Reg1.Q(), Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg1.GetCode()) & FPRSpillMask)) {
str(Reg1.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg2.GetCode()) & FPRSpillMask)) {
str(Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i+1][0])));
}
}
}
}
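For reference, the new fast path replaces per-pair `stp` stores with the four-register form of `st1`, walking the SSE area of `CpuStateFrame` through a borrowed GPR with post-index writeback; with the 16 SRA FPRs and a full spill mask that is one `add` plus four `st1` instructions instead of eight `stp`s. A minimal sketch of the pattern, assuming the vixl::aarch64 Assembler API already used in this file; the register list v16..v31 and the base register are illustrative stand-ins for SRAFPR and the borrowed temporary, not the committed code:

```cpp
// Hedged sketch: spill a block of Q registers four at a time with
// ST1 (multiple structures) and post-index writeback.
#include <cstddef>
#include "aarch64/assembler-aarch64.h"  // vixl include path assumed

using namespace vixl::aarch64;

void SpillFprsFourAtATime(Assembler& masm, const Register& Base) {
  // Stand-in for SRAFPR; each group of four is architecturally consecutive,
  // which the multi-register ST1 form requires.
  const VRegister Fprs[] = {v16, v17, v18, v19, v20, v21, v22, v23,
                            v24, v25, v26, v27, v28, v29, v30, v31};
  for (size_t i = 0; i < 16; i += 4) {
    // One instruction stores 4 x 128 bits and advances Base by 64 bytes.
    masm.st1(Fprs[i].V2D(), Fprs[i + 1].V2D(), Fprs[i + 2].V2D(), Fprs[i + 3].V2D(),
             MemOperand(Base, 64, PostIndex));
  }
}
```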
@@ -253,21 +267,6 @@ void Arm64Emitter::SpillStaticRegs(bool FPRs, uint32_t GPRSpillMask, uint32_t FP

void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRFillMask) {
if (StaticRegisterAllocation()) {
for (size_t i = 0; i < SRA64.size(); i+=2) {
auto Reg1 = SRA64[i];
auto Reg2 = SRA64[i+1];
if (((1U << Reg1.GetCode()) & GPRFillMask) &&
((1U << Reg2.GetCode()) & GPRFillMask)) {
ldp(Reg1, Reg2, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i])));
}
else if (((1U << Reg1.GetCode()) & GPRFillMask)) {
ldr(Reg1, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i])));
}
else if (((1U << Reg2.GetCode()) & GPRFillMask)) {
ldr(Reg2, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i+1])));
}
}

if (FPRs) {
if (EmitterCTX->HostFeatures.SupportsAVX) {
// Set up predicate registers.
@@ -286,29 +285,60 @@
}
}
} else {
for (size_t i = 0; i < SRAFPR.size(); i += 2) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];

if (((1U << Reg1.GetCode()) & FPRFillMask) &&
((1U << Reg2.GetCode()) & FPRFillMask)) {
ldp(Reg1.Q(), Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg1.GetCode()) & FPRFillMask)) {
ldr(Reg1.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
if (GPRFillMask && FPRFillMask == ~0U) {
// Optimize the common case where we can fill four registers per instruction.
// Use one of the filling static registers before we fill it.
auto TmpReg = SRA64[__builtin_ffs(GPRFillMask)];
// Load the sse offset in to the temporary register
add(TmpReg, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[0][0]));
for (size_t i = 0; i < SRAFPR.size(); i += 4) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];
const auto Reg3 = SRAFPR[i + 2];
const auto Reg4 = SRAFPR[i + 3];
ld1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
}
else if (((1U << Reg2.GetCode()) & FPRFillMask)) {
ldr(Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i+1][0])));
}
else {
for (size_t i = 0; i < SRAFPR.size(); i += 2) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];

if (((1U << Reg1.GetCode()) & FPRFillMask) &&
((1U << Reg2.GetCode()) & FPRFillMask)) {
ldp(Reg1.Q(), Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg1.GetCode()) & FPRFillMask)) {
ldr(Reg1.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg2.GetCode()) & FPRFillMask)) {
ldr(Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i+1][0])));
}
}
}
}
}

for (size_t i = 0; i < SRA64.size(); i+=2) {
auto Reg1 = SRA64[i];
auto Reg2 = SRA64[i+1];
if (((1U << Reg1.GetCode()) & GPRFillMask) &&
((1U << Reg2.GetCode()) & GPRFillMask)) {
ldp(Reg1, Reg2, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i])));
}
else if (((1U << Reg1.GetCode()) & GPRFillMask)) {
ldr(Reg1, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i])));
}
else if (((1U << Reg2.GetCode()) & GPRFillMask)) {
ldr(Reg2, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i+1])));
}
}
}
}
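The fill side mirrors this with `ld1`, and adds one subtlety: the scratch GPR is borrowed from the statically allocated GPR set before that set has been refilled. That is only safe because the GPR fill loop now runs at the end of the function, after the FPR fill, so the borrowed register receives its real guest value afterwards. A hedged outline of that ordering, not the committed code; the register choices and the XmmBase/GregBase offsets are illustrative rather than FEX's actual CpuStateFrame layout:

```cpp
// Hedged outline: fill FPRs first through a borrowed scratch GPR, then fill
// the GPRs (restoring the borrowed one) last. Assumes the vixl::aarch64
// Assembler API used elsewhere in this file.
#include <cstddef>
#include "aarch64/assembler-aarch64.h"  // vixl include path assumed

using namespace vixl::aarch64;

void FillStaticRegsOutline(Assembler& masm, const Register& State,
                           int64_t XmmBase, int64_t GregBase) {
  const Register Tmp = x4;  // borrowed from the statically allocated GPR set
  const VRegister Fprs[] = {v16, v17, v18, v19, v20, v21, v22, v23,
                            v24, v25, v26, v27, v28, v29, v30, v31};
  const Register Gprs[] = {x4, x5, x6, x7, x8, x9, x10, x11};

  // 1) FPRs first: Tmp is freely clobbered as a moving base pointer.
  masm.add(Tmp, State, XmmBase);
  for (size_t i = 0; i < 16; i += 4) {
    masm.ld1(Fprs[i].V2D(), Fprs[i + 1].V2D(), Fprs[i + 2].V2D(), Fprs[i + 3].V2D(),
             MemOperand(Tmp, 64, PostIndex));
  }

  // 2) GPRs last: the borrowed Tmp (x4 here) is overwritten with its real
  //    guest value, so the earlier clobber never escapes.
  for (size_t i = 0; i < 8; i += 2) {
    masm.ldp(Gprs[i], Gprs[i + 1], MemOperand(State, GregBase + int64_t(i) * 8));
  }
}
```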

void Arm64Emitter::PushDynamicRegsAndLR() {
void Arm64Emitter::PushDynamicRegsAndLR(aarch64::Register TmpReg) {
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = (RA64.size() + 1) * Core::CPUState::GPR_REG_SIZE;
const auto GPRSize = 1 * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
@@ -323,26 +353,24 @@ void Arm64Emitter::PushDynamicRegsAndLR() {
st1b(RA.Z().VnB(), PRED_TMP_32B, SVEMemOperand(sp, TMP4));
i += 4;
}
str(lr, MemOperand(sp, i * 8));
} else {
for (const auto& RA : RAFPR) {
str(RA.Q(), MemOperand(sp, i * 8));
i += 2;
// rsp capable move
add(TmpReg, aarch64::sp, 0);
for (size_t i = 0; i < RAFPR.size(); i += 4) {
const auto Reg1 = RAFPR[i];
const auto Reg2 = RAFPR[i + 1];
const auto Reg3 = RAFPR[i + 2];
const auto Reg4 = RAFPR[i + 3];
st1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
}
str(aarch64::lr, MemOperand(TmpReg, 0));
}

#if 0 // All GPRs should be caller saved
for (const auto& RA : RA64) {
str(RA, MemOperand(sp, i * 8));
i++;
}
#endif

str(lr, MemOperand(sp, i * 8));
}
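PushDynamicRegsAndLR now takes a caller-provided scratch register for a related reason: the non-SVE path stores the dynamic FPRs with post-indexed `st1`, and the writeback has to advance a copy of the stack pointer rather than SP itself, which stays parked at the base of the freshly reserved frame. Only a single GPR slot (for LR) is reserved now, instead of space for every RA GPR plus LR. A minimal sketch of the non-SVE push path, with an illustrative register set and frame size, not the committed code:

```cpp
// Hedged sketch: reserve the frame, copy SP into the caller-provided scratch
// register, stream the FPRs out with post-indexed ST1, and drop LR into the
// final slot. Assumes the vixl::aarch64 Assembler API.
#include <cstddef>
#include "aarch64/assembler-aarch64.h"  // vixl include path assumed

using namespace vixl::aarch64;

void PushDynamicRegsSketch(Assembler& masm, const Register& TmpReg) {
  const VRegister Fprs[] = {v16, v17, v18, v19, v20, v21, v22, v23,
                            v24, v25, v26, v27};  // illustrative dynamic FPR set
  const int64_t FrameSize = 12 * 16 + 16;         // 12 Q registers + an aligned LR slot

  masm.sub(sp, sp, FrameSize);  // SP stays here; it is not post-indexed below
  masm.add(TmpReg, sp, 0);      // SP-capable move of the base into the scratch register
  for (size_t i = 0; i < 12; i += 4) {
    masm.st1(Fprs[i].V2D(), Fprs[i + 1].V2D(), Fprs[i + 2].V2D(), Fprs[i + 3].V2D(),
             MemOperand(TmpReg, 64, PostIndex));  // writeback advances TmpReg only
  }
  masm.str(lr, MemOperand(TmpReg, 0));            // LR lands just above the FPR block
}
```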

void Arm64Emitter::PopDynamicRegsAndLR() {
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = (RA64.size() + 1) * Core::CPUState::GPR_REG_SIZE;
const auto GPRSize = 1 * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
@@ -355,23 +383,20 @@ void Arm64Emitter::PopDynamicRegsAndLR() {
ld1b(RA.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(sp, TMP4));
i += 4;
}
ldr(lr, MemOperand(sp, i * 8));
add(sp, sp, SPOffset);
} else {
for (const auto& RA : RAFPR) {
ldr(RA.Q(), MemOperand(sp, i * 8));
i += 2;

for (size_t i = 0; i < RAFPR.size(); i += 4) {
const auto Reg1 = RAFPR[i];
const auto Reg2 = RAFPR[i + 1];
const auto Reg3 = RAFPR[i + 2];
const auto Reg4 = RAFPR[i + 3];
ld1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(aarch64::sp, 64, PostIndex));
}
}

#if 0 // All GPRs should be caller saved
for (const auto& RA : RA64) {
ldr(RA, MemOperand(sp, i * 8));
i++;
ldr(aarch64::lr, MemOperand(aarch64::sp, 16, PostIndex));
}
#endif

ldr(lr, MemOperand(sp, i * 8));

add(sp, sp, SPOffset);
}
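The pop path is the mirror image, but there SP itself can be post-indexed: each four-register `ld1` bumps SP by 64 bytes and the final post-indexed LR load consumes the last slot, so the frame is unwound by the loads themselves in the non-SVE path. A matching sketch under the same assumptions as the push sketch above:

```cpp
// Hedged sketch: restore the FPRs and LR while letting the post-index
// writeback unwind the stack frame in place.
#include <cstddef>
#include "aarch64/assembler-aarch64.h"  // vixl include path assumed

using namespace vixl::aarch64;

void PopDynamicRegsSketch(Assembler& masm) {
  const VRegister Fprs[] = {v16, v17, v18, v19, v20, v21, v22, v23,
                            v24, v25, v26, v27};  // must match the push order
  for (size_t i = 0; i < 12; i += 4) {
    masm.ld1(Fprs[i].V2D(), Fprs[i + 1].V2D(), Fprs[i + 2].V2D(), Fprs[i + 3].V2D(),
             MemOperand(sp, 64, PostIndex));      // SP advances as registers reload
  }
  masm.ldr(lr, MemOperand(sp, 16, PostIndex));    // reload LR and release its slot
}
```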

void Arm64Emitter::Align16B() {
External/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
@@ -66,9 +66,10 @@ const std::array<aarch64::VRegister, 12> RAFPR = {
#define TMP4 x3

// Vector temporaries
#define VTMP1 v1
#define VTMP2 v2
#define VTMP3 v3
#define VTMP1 v0
#define VTMP2 v1
#define VTMP3 v2
#define VTMP4 v3

// Predicate register temporaries (used when AVX support is enabled)
// PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1.
@@ -99,7 +100,7 @@ class Arm64Emitter : public vixl::aarch64::Assembler {
// We can't guarantee only the lower 64bits are used so flush everything
static constexpr uint32_t CALLER_FPR_MASK = ~0U;

void PushDynamicRegsAndLR();
void PushDynamicRegsAndLR(aarch64::Register TmpReg);
void PopDynamicRegsAndLR();

void PushCalleeSavedRegisters();
External/FEXCore/Source/Interface/Core/Dispatcher/Arm64Dispatcher.cpp
@@ -441,7 +441,7 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
{
LUDIVHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(x3);
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUDIV));
@@ -463,7 +463,7 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
{
LDIVHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(x3);
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LDIV));
@@ -485,7 +485,7 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
{
LUREMHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(x3);
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUREM));
@@ -507,7 +507,7 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
{
LREMHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(x3);
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LREM));
@@ -176,7 +176,7 @@ DEF_OP(Syscall) {
// X2: Pointer to SyscallArguments

FEXCore::IR::SyscallFlags Flags = Op->Flags;
PushDynamicRegsAndLR();
PushDynamicRegsAndLR(TMP1);

if ((Flags & FEXCore::IR::SyscallFlags::NOSYNCSTATEONENTRY) != FEXCore::IR::SyscallFlags::NOSYNCSTATEONENTRY) {
SpillStaticRegs();
@@ -378,7 +378,7 @@ DEF_OP(Thunk) {

SpillStaticRegs(); // spill to ctx before ra64 spill

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(TMP1);

mov(x0, GetReg<RA_64>(Op->ArgPtr.ID()));

@@ -448,7 +448,7 @@ DEF_OP(ThreadRemoveCodeEntry) {
// X0: Thread
// X1: RIP

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(TMP1);

mov(x0, STATE);
LoadConstant(x1, Entry);
@@ -469,7 +469,7 @@ DEF_OP(ThreadRemoveCodeEntry) {
DEF_OP(CPUID) {
auto Op = IROp->C<IR::IROp_CPUID>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(TMP1);
SpillStaticRegs();

// x0 = CPUID Handler