Merge pull request #2250 from Sonicadvance1/optimize_spilling_filling
Arm64: Optimizing spilling and filling
lioncash authored Dec 16, 2022
2 parents 65e8bf9 + 1beb791 commit 9a8852f
Showing 6 changed files with 121 additions and 95 deletions.
161 changes: 93 additions & 68 deletions External/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.cpp
@@ -202,7 +202,6 @@ void Arm64Emitter::PopCalleeSavedRegisters() {
}
}


void Arm64Emitter::SpillStaticRegs(bool FPRs, uint32_t GPRSpillMask, uint32_t FPRSpillMask) {
if (StaticRegisterAllocation()) {
for (size_t i = 0; i < SRA64.size(); i+=2) {
@@ -231,19 +230,34 @@ void Arm64Emitter::SpillStaticRegs(bool FPRs, uint32_t GPRSpillMask, uint32_t FP
}
}
} else {
for (size_t i = 0; i < SRAFPR.size(); i += 2) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];

if (((1U << Reg1.GetCode()) & FPRSpillMask) &&
((1U << Reg2.GetCode()) & FPRSpillMask)) {
stp(Reg1.Q(), Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg1.GetCode()) & FPRSpillMask)) {
str(Reg1.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
if (GPRSpillMask && FPRSpillMask == ~0U) {
// Optimize the common case where we can spill four registers per instruction
auto TmpReg = SRA64[__builtin_ffs(GPRSpillMask)];
// Load the sse offset in to the temporary register
add(TmpReg, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[0][0]));
for (size_t i = 0; i < SRAFPR.size(); i += 4) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];
const auto Reg3 = SRAFPR[i + 2];
const auto Reg4 = SRAFPR[i + 3];
st1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
}
else if (((1U << Reg2.GetCode()) & FPRSpillMask)) {
str(Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i+1][0])));
}
else {
for (size_t i = 0; i < SRAFPR.size(); i += 2) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];

if (((1U << Reg1.GetCode()) & FPRSpillMask) &&
((1U << Reg2.GetCode()) & FPRSpillMask)) {
stp(Reg1.Q(), Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg1.GetCode()) & FPRSpillMask)) {
str(Reg1.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg2.GetCode()) & FPRSpillMask)) {
str(Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i+1][0])));
}
}
}
}
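For reference, the new fast path replaces per-pair `stp` stores with the four-register form of `st1`, walking the SSE area of `CpuStateFrame` through a borrowed GPR with post-index writeback; with the 16 SRA FPRs and a full spill mask that is one `add` plus four `st1` instructions instead of eight `stp`s. A minimal sketch of the pattern, assuming the vixl::aarch64 Assembler API already used in this file; the register list v16..v31 and the base register are illustrative stand-ins for SRAFPR and the borrowed temporary, not the committed code:

```cpp
// Hedged sketch: spill a block of Q registers four at a time with
// ST1 (multiple structures) and post-index writeback.
#include <cstddef>
#include "aarch64/assembler-aarch64.h"  // vixl include path assumed

using namespace vixl::aarch64;

void SpillFprsFourAtATime(Assembler& masm, const Register& Base) {
  // Stand-in for SRAFPR; each group of four is architecturally consecutive,
  // which the multi-register ST1 form requires.
  const VRegister Fprs[] = {v16, v17, v18, v19, v20, v21, v22, v23,
                            v24, v25, v26, v27, v28, v29, v30, v31};
  for (size_t i = 0; i < 16; i += 4) {
    // One instruction stores 4 x 128 bits and advances Base by 64 bytes.
    masm.st1(Fprs[i].V2D(), Fprs[i + 1].V2D(), Fprs[i + 2].V2D(), Fprs[i + 3].V2D(),
             MemOperand(Base, 64, PostIndex));
  }
}
```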
@@ -253,21 +267,6 @@ void Arm64Emitter::SpillStaticRegs(bool FPRs, uint32_t GPRSpillMask, uint32_t FP

void Arm64Emitter::FillStaticRegs(bool FPRs, uint32_t GPRFillMask, uint32_t FPRFillMask) {
if (StaticRegisterAllocation()) {
for (size_t i = 0; i < SRA64.size(); i+=2) {
auto Reg1 = SRA64[i];
auto Reg2 = SRA64[i+1];
if (((1U << Reg1.GetCode()) & GPRFillMask) &&
((1U << Reg2.GetCode()) & GPRFillMask)) {
ldp(Reg1, Reg2, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i])));
}
else if (((1U << Reg1.GetCode()) & GPRFillMask)) {
ldr(Reg1, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i])));
}
else if (((1U << Reg2.GetCode()) & GPRFillMask)) {
ldr(Reg2, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i+1])));
}
}

if (FPRs) {
if (EmitterCTX->HostFeatures.SupportsAVX) {
// Set up predicate registers.
@@ -286,29 +285,60 @@
}
}
} else {
for (size_t i = 0; i < SRAFPR.size(); i += 2) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];

if (((1U << Reg1.GetCode()) & FPRFillMask) &&
((1U << Reg2.GetCode()) & FPRFillMask)) {
ldp(Reg1.Q(), Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg1.GetCode()) & FPRFillMask)) {
ldr(Reg1.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
if (GPRFillMask && FPRFillMask == ~0U) {
// Optimize the common case where we can fill four registers per instruction.
// Use one of the filling static registers before we fill it.
auto TmpReg = SRA64[__builtin_ffs(GPRFillMask)];
// Load the sse offset in to the temporary register
add(TmpReg, STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[0][0]));
for (size_t i = 0; i < SRAFPR.size(); i += 4) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];
const auto Reg3 = SRAFPR[i + 2];
const auto Reg4 = SRAFPR[i + 3];
ld1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
}
else if (((1U << Reg2.GetCode()) & FPRFillMask)) {
ldr(Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i+1][0])));
}
else {
for (size_t i = 0; i < SRAFPR.size(); i += 2) {
const auto Reg1 = SRAFPR[i];
const auto Reg2 = SRAFPR[i + 1];

if (((1U << Reg1.GetCode()) & FPRFillMask) &&
((1U << Reg2.GetCode()) & FPRFillMask)) {
ldp(Reg1.Q(), Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg1.GetCode()) & FPRFillMask)) {
ldr(Reg1.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i][0])));
}
else if (((1U << Reg2.GetCode()) & FPRFillMask)) {
ldr(Reg2.Q(), MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.xmm.sse.data[i+1][0])));
}
}
}
}
}

for (size_t i = 0; i < SRA64.size(); i+=2) {
auto Reg1 = SRA64[i];
auto Reg2 = SRA64[i+1];
if (((1U << Reg1.GetCode()) & GPRFillMask) &&
((1U << Reg2.GetCode()) & GPRFillMask)) {
ldp(Reg1, Reg2, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i])));
}
else if (((1U << Reg1.GetCode()) & GPRFillMask)) {
ldr(Reg1, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i])));
}
else if (((1U << Reg2.GetCode()) & GPRFillMask)) {
ldr(Reg2, MemOperand(STATE, offsetof(FEXCore::Core::CpuStateFrame, State.gregs[i+1])));
}
}
}
}
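The fill side mirrors this with `ld1`, and adds one subtlety: the scratch GPR is borrowed from the statically allocated GPR set before that set has been refilled. That is only safe because the GPR fill loop now runs at the end of the function, after the FPR fill, so the borrowed register receives its real guest value afterwards. A hedged outline of that ordering, not the committed code; the register choices and the XmmBase/GregBase offsets are illustrative rather than FEX's actual CpuStateFrame layout:

```cpp
// Hedged outline: fill FPRs first through a borrowed scratch GPR, then fill
// the GPRs (restoring the borrowed one) last. Assumes the vixl::aarch64
// Assembler API used elsewhere in this file.
#include <cstddef>
#include "aarch64/assembler-aarch64.h"  // vixl include path assumed

using namespace vixl::aarch64;

void FillStaticRegsOutline(Assembler& masm, const Register& State,
                           int64_t XmmBase, int64_t GregBase) {
  const Register Tmp = x4;  // borrowed from the statically allocated GPR set
  const VRegister Fprs[] = {v16, v17, v18, v19, v20, v21, v22, v23,
                            v24, v25, v26, v27, v28, v29, v30, v31};
  const Register Gprs[] = {x4, x5, x6, x7, x8, x9, x10, x11};

  // 1) FPRs first: Tmp is freely clobbered as a moving base pointer.
  masm.add(Tmp, State, XmmBase);
  for (size_t i = 0; i < 16; i += 4) {
    masm.ld1(Fprs[i].V2D(), Fprs[i + 1].V2D(), Fprs[i + 2].V2D(), Fprs[i + 3].V2D(),
             MemOperand(Tmp, 64, PostIndex));
  }

  // 2) GPRs last: the borrowed Tmp (x4 here) is overwritten with its real
  //    guest value, so the earlier clobber never escapes.
  for (size_t i = 0; i < 8; i += 2) {
    masm.ldp(Gprs[i], Gprs[i + 1], MemOperand(State, GregBase + int64_t(i) * 8));
  }
}
```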

void Arm64Emitter::PushDynamicRegsAndLR() {
void Arm64Emitter::PushDynamicRegsAndLR(aarch64::Register TmpReg) {
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = (RA64.size() + 1) * Core::CPUState::GPR_REG_SIZE;
const auto GPRSize = 1 * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
@@ -323,26 +353,24 @@ void Arm64Emitter::PushDynamicRegsAndLR() {
st1b(RA.Z().VnB(), PRED_TMP_32B, SVEMemOperand(sp, TMP4));
i += 4;
}
str(lr, MemOperand(sp, i * 8));
} else {
for (const auto& RA : RAFPR) {
str(RA.Q(), MemOperand(sp, i * 8));
i += 2;
// rsp capable move
add(TmpReg, aarch64::sp, 0);
for (size_t i = 0; i < RAFPR.size(); i += 4) {
const auto Reg1 = RAFPR[i];
const auto Reg2 = RAFPR[i + 1];
const auto Reg3 = RAFPR[i + 2];
const auto Reg4 = RAFPR[i + 3];
st1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(TmpReg, 64, PostIndex));
}
str(aarch64::lr, MemOperand(TmpReg, 0));
}

#if 0 // All GPRs should be caller saved
for (const auto& RA : RA64) {
str(RA, MemOperand(sp, i * 8));
i++;
}
#endif

str(lr, MemOperand(sp, i * 8));
}
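PushDynamicRegsAndLR now takes a caller-provided scratch register for a related reason: the non-SVE path stores the dynamic FPRs with post-indexed `st1`, and the writeback has to advance a copy of the stack pointer rather than SP itself, which stays parked at the base of the freshly reserved frame. Only a single GPR slot (for LR) is reserved now, instead of space for every RA GPR plus LR. A minimal sketch of the non-SVE push path, with an illustrative register set and frame size, not the committed code:

```cpp
// Hedged sketch: reserve the frame, copy SP into the caller-provided scratch
// register, stream the FPRs out with post-indexed ST1, and drop LR into the
// final slot. Assumes the vixl::aarch64 Assembler API.
#include <cstddef>
#include "aarch64/assembler-aarch64.h"  // vixl include path assumed

using namespace vixl::aarch64;

void PushDynamicRegsSketch(Assembler& masm, const Register& TmpReg) {
  const VRegister Fprs[] = {v16, v17, v18, v19, v20, v21, v22, v23,
                            v24, v25, v26, v27};  // illustrative dynamic FPR set
  const int64_t FrameSize = 12 * 16 + 16;         // 12 Q registers + an aligned LR slot

  masm.sub(sp, sp, FrameSize);  // SP stays here; it is not post-indexed below
  masm.add(TmpReg, sp, 0);      // SP-capable move of the base into the scratch register
  for (size_t i = 0; i < 12; i += 4) {
    masm.st1(Fprs[i].V2D(), Fprs[i + 1].V2D(), Fprs[i + 2].V2D(), Fprs[i + 3].V2D(),
             MemOperand(TmpReg, 64, PostIndex));  // writeback advances TmpReg only
  }
  masm.str(lr, MemOperand(TmpReg, 0));            // LR lands just above the FPR block
}
```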

void Arm64Emitter::PopDynamicRegsAndLR() {
const auto CanUseSVE = EmitterCTX->HostFeatures.SupportsAVX;
const auto GPRSize = (RA64.size() + 1) * Core::CPUState::GPR_REG_SIZE;
const auto GPRSize = 1 * Core::CPUState::GPR_REG_SIZE;
const auto FPRRegSize = CanUseSVE ? Core::CPUState::XMM_AVX_REG_SIZE
: Core::CPUState::XMM_SSE_REG_SIZE;
const auto FPRSize = RAFPR.size() * FPRRegSize;
@@ -355,23 +383,20 @@ void Arm64Emitter::PopDynamicRegsAndLR() {
ld1b(RA.Z().VnB(), PRED_TMP_32B.Zeroing(), SVEMemOperand(sp, TMP4));
i += 4;
}
ldr(lr, MemOperand(sp, i * 8));
add(sp, sp, SPOffset);
} else {
for (const auto& RA : RAFPR) {
ldr(RA.Q(), MemOperand(sp, i * 8));
i += 2;

for (size_t i = 0; i < RAFPR.size(); i += 4) {
const auto Reg1 = RAFPR[i];
const auto Reg2 = RAFPR[i + 1];
const auto Reg3 = RAFPR[i + 2];
const auto Reg4 = RAFPR[i + 3];
ld1(Reg1.V2D(), Reg2.V2D(), Reg3.V2D(), Reg4.V2D(), MemOperand(aarch64::sp, 64, PostIndex));
}
}

#if 0 // All GPRs should be caller saved
for (const auto& RA : RA64) {
ldr(RA, MemOperand(sp, i * 8));
i++;
ldr(aarch64::lr, MemOperand(aarch64::sp, 16, PostIndex));
}
#endif

ldr(lr, MemOperand(sp, i * 8));

add(sp, sp, SPOffset);
}
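The pop path is the mirror image, but there SP itself can be post-indexed: each four-register `ld1` bumps SP by 64 bytes and the final post-indexed LR load consumes the last slot, so the frame is unwound by the loads themselves in the non-SVE path. A matching sketch under the same assumptions as the push sketch above:

```cpp
// Hedged sketch: restore the FPRs and LR while letting the post-index
// writeback unwind the stack frame in place.
#include <cstddef>
#include "aarch64/assembler-aarch64.h"  // vixl include path assumed

using namespace vixl::aarch64;

void PopDynamicRegsSketch(Assembler& masm) {
  const VRegister Fprs[] = {v16, v17, v18, v19, v20, v21, v22, v23,
                            v24, v25, v26, v27};  // must match the push order
  for (size_t i = 0; i < 12; i += 4) {
    masm.ld1(Fprs[i].V2D(), Fprs[i + 1].V2D(), Fprs[i + 2].V2D(), Fprs[i + 3].V2D(),
             MemOperand(sp, 64, PostIndex));      // SP advances as registers reload
  }
  masm.ldr(lr, MemOperand(sp, 16, PostIndex));    // reload LR and release its slot
}
```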

void Arm64Emitter::Align16B() {
External/FEXCore/Source/Interface/Core/ArchHelpers/Arm64Emitter.h
@@ -66,9 +66,10 @@ const std::array<aarch64::VRegister, 12> RAFPR = {
#define TMP4 x3

// Vector temporaries
#define VTMP1 v1
#define VTMP2 v2
#define VTMP3 v3
#define VTMP1 v0
#define VTMP2 v1
#define VTMP3 v2
#define VTMP4 v3

// Predicate register temporaries (used when AVX support is enabled)
// PRED_TMP_16B indicates a predicate register that indicates the first 16 bytes set to 1.
@@ -99,7 +100,7 @@ class Arm64Emitter : public vixl::aarch64::Assembler {
// We can't guarantee only the lower 64bits are used so flush everything
static constexpr uint32_t CALLER_FPR_MASK = ~0U;

void PushDynamicRegsAndLR();
void PushDynamicRegsAndLR(aarch64::Register TmpReg);
void PopDynamicRegsAndLR();

void PushCalleeSavedRegisters();
External/FEXCore/Source/Interface/Core/Dispatcher/Arm64Dispatcher.cpp
@@ -441,7 +441,7 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
{
LUDIVHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(x3);
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUDIV));
@@ -463,7 +463,7 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
{
LDIVHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(x3);
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LDIV));
@@ -485,7 +485,7 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
{
LUREMHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(x3);
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LUREM));
@@ -507,7 +507,7 @@ Arm64Dispatcher::Arm64Dispatcher(FEXCore::Context::Context *ctx, const Dispatche
{
LREMHandlerAddress = GetCursorAddress<uint64_t>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(x3);
SpillStaticRegs();

ldr(x3, STATE_PTR(CpuStateFrame, Pointers.AArch64.LREM));
@@ -176,7 +176,7 @@ DEF_OP(Syscall) {
// X2: Pointer to SyscallArguments

FEXCore::IR::SyscallFlags Flags = Op->Flags;
PushDynamicRegsAndLR();
PushDynamicRegsAndLR(TMP1);

if ((Flags & FEXCore::IR::SyscallFlags::NOSYNCSTATEONENTRY) != FEXCore::IR::SyscallFlags::NOSYNCSTATEONENTRY) {
SpillStaticRegs();
@@ -378,7 +378,7 @@ DEF_OP(Thunk) {

SpillStaticRegs(); // spill to ctx before ra64 spill

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(TMP1);

mov(x0, GetReg<RA_64>(Op->ArgPtr.ID()));

@@ -448,7 +448,7 @@ DEF_OP(ThreadRemoveCodeEntry) {
// X0: Thread
// X1: RIP

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(TMP1);

mov(x0, STATE);
LoadConstant(x1, Entry);
@@ -469,7 +469,7 @@ DEF_OP(ThreadRemoveCodeEntry) {
DEF_OP(CPUID) {
auto Op = IROp->C<IR::IROp_CPUID>();

PushDynamicRegsAndLR();
PushDynamicRegsAndLR(TMP1);
SpillStaticRegs();

// x0 = CPUID Handler