Skip to content

Commit

Permalink
Merge pull request #3356 from Sonicadvance1/modify_code_lock
Browse files Browse the repository at this point in the history
Jitarm64: Implements spin-loop futex for JIT blocks
  • Loading branch information
lioncash authored Jan 23, 2024
2 parents 56d8080 + a6c57f7 commit 750b0b7
Show file tree
Hide file tree
Showing 7 changed files with 494 additions and 62 deletions.
1 change: 1 addition & 0 deletions FEXCore/Source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ set (FEXCORE_BASE_SRCS
Utils/FileLoading.cpp
Utils/ForcedAssert.cpp
Utils/LogManager.cpp
Utils/SpinWaitLock.cpp
)

if (NOT MINGW_BUILD)
Expand Down
1 change: 1 addition & 0 deletions FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,7 @@ CPUBackend::CompiledCode Arm64JITCore::CompileCode(uint64_t Entry,
// TODO: This needs to be a data RIP relocation once code caching works.
// Current relocation code doesn't support this feature yet.
JITBlockTail->RIP = Entry;
JITBlockTail->SpinLockFutex = 0;

{
// Store the RIP entries.
Expand Down
136 changes: 74 additions & 62 deletions FEXCore/Source/Utils/ArchHelpers/Arm64.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
// SPDX-License-Identifier: MIT

#include "Utils/SpinWaitLock.h"

#include <FEXCore/Debug/InternalThreadState.h>
#include <FEXCore/Utils/EnumUtils.h>
#include <FEXCore/Utils/LogManager.h>
#include <FEXCore/Utils/Telemetry.h>
Expand Down Expand Up @@ -2044,9 +2048,11 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
uint32_t Size = (Instr & 0xC000'0000) >> 30;
uint32_t AddrReg = (Instr >> 5) & 0x1F;
uint32_t DataReg = Instr & 0x1F;
if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR*
(Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR*
if (ParanoidTSO) {

// ParanoidTSO path doesn't modify any code.
if (ParanoidTSO) [[unlikely]] {
if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR*
(Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR*
if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, 0)) {
// Skip this instruction now
return std::make_pair(true, 4);
Expand All @@ -2056,20 +2062,7 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
return NotHandled;
}
}
else {
uint32_t LDR = 0b0011'1000'0111'1111'0110'1000'0000'0000;
LDR |= Size << 30;
LDR |= AddrReg << 5;
LDR |= DataReg;
PC[0] = LDR;
PC[1] = DMB_LD; // Back-patch the half-barrier.
ClearICache(&PC[-1], 16);
// With the instruction modified, now execute again.
return std::make_pair(true, 0);
}
}
else if ( (Instr & LDAXR_MASK) == STLR_INST) { // STLR*
if (ParanoidTSO) {
else if ( (Instr & LDAXR_MASK) == STLR_INST) { // STLR*
if (ArchHelpers::Arm64::HandleAtomicStore(Instr, GPRs, 0)) {
// Skip this instruction now
return std::make_pair(true, 4);
Expand All @@ -2079,22 +2072,9 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
return NotHandled;
}
}
else {
uint32_t STR = 0b0011'1000'0011'1111'0110'1000'0000'0000;
STR |= Size << 30;
STR |= AddrReg << 5;
STR |= DataReg;
PC[-1] = DMB; // Back-patch the half-barrier.
PC[0] = STR;
ClearICache(&PC[-1], 16);
// Back up one instruction and have another go
return std::make_pair(true, -4);
}
}
else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR*
// Extract the 9-bit offset from the instruction
int32_t Offset = static_cast<int32_t>(Instr) << 11 >> 23;
if (ParanoidTSO) {
else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR*
// Extract the 9-bit offset from the instruction
int32_t Offset = static_cast<int32_t>(Instr) << 11 >> 23;
if (ArchHelpers::Arm64::HandleAtomicLoad(Instr, GPRs, Offset)) {
// Skip this instruction now
return std::make_pair(true, 4);
Expand All @@ -2104,23 +2084,9 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
return NotHandled;
}
}
else {
uint32_t LDUR = 0b0011'1000'0100'0000'0000'0000'0000'0000;
LDUR |= Size << 30;
LDUR |= AddrReg << 5;
LDUR |= DataReg;
LDUR |= Instr & (0b1'1111'1111 << 9);
PC[0] = LDUR;
PC[1] = DMB_LD; // Back-patch the half-barrier.
ClearICache(&PC[-1], 16);
// With the instruction modified, now execute again.
return std::make_pair(true, 0);
}
}
else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR*
// Extract the 9-bit offset from the instruction
int32_t Offset = static_cast<int32_t>(Instr) << 11 >> 23;
if (ParanoidTSO) {
else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR*
// Extract the 9-bit offset from the instruction
int32_t Offset = static_cast<int32_t>(Instr) << 11 >> 23;
if (ArchHelpers::Arm64::HandleAtomicStore(Instr, GPRs, Offset)) {
// Skip this instruction now
return std::make_pair(true, 4);
Expand All @@ -2130,18 +2096,64 @@ static uint64_t HandleAtomicLoadstoreExclusive(uintptr_t ProgramCounter, uint64_
return NotHandled;
}
}
else {
uint32_t STUR = 0b0011'1000'0000'0000'0000'0000'0000'0000;
STUR |= Size << 30;
STUR |= AddrReg << 5;
STUR |= DataReg;
STUR |= Instr & (0b1'1111'1111 << 9);
PC[-1] = DMB; // Back-patch the half-barrier.
PC[0] = STUR;
ClearICache(&PC[-1], 16);
// Back up one instruction and have another go
return std::make_pair(true, -4);
}
}

const auto Frame = Thread->CurrentFrame;
const uint64_t BlockBegin = Frame->State.InlineJITBlockHeader;
auto InlineHeader = reinterpret_cast<const CPU::CPUBackend::JITCodeHeader *>(BlockBegin);
auto InlineTail = reinterpret_cast<CPU::CPUBackend::JITCodeTail *>(Frame->State.InlineJITBlockHeader + InlineHeader->OffsetToBlockTail);

// Lock code mutex during any SIGBUS handling that potentially changes code.
// Need to be careful to not read any code part-way through modification.
FEXCore::Utils::SpinWaitLock::UniqueSpinMutex lk(&InlineTail->SpinLockFutex);

if ((Instr & LDAXR_MASK) == LDAR_INST || // LDAR*
(Instr & LDAXR_MASK) == LDAPR_INST) { // LDAPR*
uint32_t LDR = 0b0011'1000'0111'1111'0110'1000'0000'0000;
LDR |= Size << 30;
LDR |= AddrReg << 5;
LDR |= DataReg;
PC[0] = LDR;
PC[1] = DMB_LD; // Back-patch the half-barrier.
ClearICache(&PC[-1], 16);
// With the instruction modified, now execute again.
return std::make_pair(true, 0);
}
else if ( (Instr & LDAXR_MASK) == STLR_INST) { // STLR*
uint32_t STR = 0b0011'1000'0011'1111'0110'1000'0000'0000;
STR |= Size << 30;
STR |= AddrReg << 5;
STR |= DataReg;
PC[-1] = DMB; // Back-patch the half-barrier.
PC[0] = STR;
ClearICache(&PC[-1], 16);
// Back up one instruction and have another go
return std::make_pair(true, -4);
}
else if ((Instr & RCPC2_MASK) == LDAPUR_INST) { // LDAPUR*
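// Re-encode as a plain LDUR, carrying the immediate field over from the original instruction.
// NOTE(review): the mask used below, `0b1'1111'1111 << 9`, selects bits [17:9], while the
// ParanoidTSO path decodes the same offset with `<< 11 >> 23`, i.e. bits [20:12] — confirm the mask.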
uint32_t LDUR = 0b0011'1000'0100'0000'0000'0000'0000'0000;
LDUR |= Size << 30;
LDUR |= AddrReg << 5;
LDUR |= DataReg;
LDUR |= Instr & (0b1'1111'1111 << 9);
PC[0] = LDUR;
PC[1] = DMB_LD; // Back-patch the half-barrier.
ClearICache(&PC[-1], 16);
// With the instruction modified, now execute again.
return std::make_pair(true, 0);
}
else if ((Instr & RCPC2_MASK) == STLUR_INST) { // STLUR*
uint32_t STUR = 0b0011'1000'0000'0000'0000'0000'0000'0000;
STUR |= Size << 30;
STUR |= AddrReg << 5;
STUR |= DataReg;
STUR |= Instr & (0b1'1111'1111 << 9);
PC[-1] = DMB; // Back-patch the half-barrier.
PC[0] = STUR;
ClearICache(&PC[-1], 16);
// Back up one instruction and have another go
return std::make_pair(true, -4);
}
else if ((Instr & ArchHelpers::Arm64::LDAXP_MASK) == ArchHelpers::Arm64::LDAXP_INST) { // LDAXP
//Should be compare and swap pair only. LDAXP not used elsewhere
Expand Down
27 changes: 27 additions & 0 deletions FEXCore/Source/Utils/SpinWaitLock.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#include "Utils/SpinWaitLock.h"

namespace FEXCore::Utils::SpinWaitLock {
#ifdef _M_ARM_64
constexpr uint64_t NanosecondsInSecond = 1'000'000'000ULL;

// Reads the CNTFRQ_EL0 system register and returns its value.
//
// MRS always writes a full 64-bit general-purpose register, so the value is
// read into a 64-bit temporary and then narrowed to the 32-bit return type.
static uint32_t GetCycleCounterFrequency() {
  uint64_t Result{};
  __asm("mrs %[Res], CNTFRQ_EL0"
    : [Res] "=r" (Result));
  // Narrowing is intentional; make it explicit rather than implicit.
  return static_cast<uint32_t>(Result);
}

// Computes the conversion factor between counter ticks and nanoseconds.
//
// NOTE: despite the name, the quotient computed here is
// NanosecondsInSecond / frequency, i.e. the number of *nanoseconds per
// counter cycle*, truncated towards zero by integer division. The name is
// kept so the externally visible `CyclesPerNanosecond` symbol stays unchanged.
//
// Returns 0 if the frequency register reads as zero.
static uint64_t CalculateCyclesPerNanosecond() {
  // Snapdragon devices historically use a 19.2MHz cycle counter frequency.
  // That gives 1'000'000'000 / 19'200'000 = 52.0833... nanoseconds per cycle,
  // which integer division truncates to 52.
  //
  // ARMv8.6 and ARMv9.1 require the cycle counter frequency to be 1GHz.
  // That gives exactly 1 nanosecond per cycle.
  const uint64_t CounterFrequency = GetCycleCounterFrequency();
  if (CounterFrequency == 0) {
    // Integer division by zero is undefined behaviour in C++; report 0
    // instead of dividing when the register has not been programmed.
    return 0;
  }
  return NanosecondsInSecond / CounterFrequency;
}

// Both values are computed once, during dynamic initialization of this
// translation unit, in declaration order.
// Presumably declared `extern` in Utils/SpinWaitLock.h for the spin-wait
// helpers — verify against the header.
uint32_t CycleCounterFrequency = GetCycleCounterFrequency();
uint64_t CyclesPerNanosecond = CalculateCyclesPerNanosecond();
#endif
}
Loading

0 comments on commit 750b0b7

Please sign in to comment.