Skip to content

Commit

Permalink
Implement Ryzen speedups
Browse files Browse the repository at this point in the history
  • Loading branch information
fireice-uk committed Dec 29, 2019
1 parent e5fb0e0 commit ea47136
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 28 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
;# Dataset read step (presumably asm/program_read_dataset_ryzen.inc, the body of
;# randomx_program_read_dataset_ryzen -- confirm against the including file).
;#
;# Register use visible in this fragment:
;#   rbp      - packed pair: "ma" in the upper 32 bits, "mx" in the lower 32 bits
;#   rdi      - base address for every load/prefetch below
;#   rax      - on entry: value XORed into rbp; afterwards: first loaded qword
;#   rcx, rdx - scratch (masked "ma" / masked "mx" offsets)
;#   r8-r15   - each XORed with one qword of the 64 bytes at [rdi+rcx]

;# rcx = ma & RANDOMX_DATASET_BASE_MASK (offset of the line to read now)
mov rcx, rbp ;# ecx = ma
shr rcx, 32
and ecx, RANDOMX_DATASET_BASE_MASK
;# rax must be consumed here, before the load below overwrites it
xor rbp, rax ;# modify "mx"
;# start the first qword load early; it is folded into r8 further down
mov rax, qword ptr [rdi+rcx]
;# rdx = updated mx & RANDOMX_DATASET_BASE_MASK, then prefetch that location
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
;# XOR the eight qwords at [rdi+rcx .. rdi+rcx+56] into r8..r15
xor r8, rax
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]
76 changes: 52 additions & 24 deletions xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,6 @@ namespace randomx {
{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
};

bool JitCompilerX86::BranchesWithin32B = false;

// Returns codePos - prologueSize, or 0 while codePos is still below prologueSize.
size_t JitCompilerX86::getCodeSize() {
	if (codePos < prologueSize) {
		return 0;
	}
	return codePos - prologueSize;
}
Expand All @@ -241,8 +239,14 @@ namespace randomx {
# endif
}

std::atomic<uint64_t> JitCompilerX86::flags_set(0);
uint64_t JitCompilerX86::flags = 0;
// CPU-specific tweaks
void JitCompilerX86::applyTweaks() {

if(flags_set.fetch_add(1) != 0)
return;

int32_t info[4];
cpuid(0, info);

Expand All @@ -252,38 +256,42 @@ namespace randomx {
manufacturer[2] = info[2];
manufacturer[3] = 0;

if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
struct
{
unsigned int stepping : 4;
unsigned int model : 4;
unsigned int family : 4;
unsigned int processor_type : 2;
unsigned int reserved1 : 2;
unsigned int ext_model : 4;
unsigned int ext_family : 8;
unsigned int reserved2 : 4;
} processor_info;

cpuid(1, info);
memcpy(&processor_info, info, sizeof(processor_info));
struct
{
unsigned int stepping : 4;
unsigned int model : 4;
unsigned int family : 4;
unsigned int processor_type : 2;
unsigned int reserved1 : 2;
unsigned int ext_model : 4;
unsigned int ext_family : 8;
unsigned int reserved2 : 4;
} processor_info;

cpuid(1, info);
memcpy(&processor_info, info, sizeof(processor_info));

if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
// Intel JCC erratum mitigation
if (processor_info.family == 6) {
const uint32_t model = processor_info.model | (processor_info.ext_model << 4);
const uint32_t stepping = processor_info.stepping;

// Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
BranchesWithin32B =
set_flag(BRANCHES_WITHIN_32B,
((model == 0x4E) && (stepping == 0x3)) ||
((model == 0x55) && (stepping == 0x4)) ||
((model == 0x5E) && (stepping == 0x3)) ||
((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) ||
((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) ||
((model == 0xA6) && (stepping == 0x0)) ||
((model == 0xAE) && (stepping == 0xA));
((model == 0xAE) && (stepping == 0xA)));
}
}

if (strcmp((const char*)manufacturer, "AuthenticAMD") == 0) {
set_flag(AMD_RYZEN_FAMILY, processor_info.family == 0x17);
}
}

static std::atomic<size_t> codeOffset;
Expand All @@ -303,8 +311,20 @@ namespace randomx {

// Emits a full program into `code`: the prologue, one dataset-read stub, then the epilogue.
//
// The dataset-read stub is selected by the AMD_RYZEN_FAMILY flag. Each variant in
// RandomX_CurrentConfig carries its own byte count, so exactly one stub is copied
// and codePos advances by that variant's size. (An additional unconditional copy of
// codeReadDatasetTweaked sized by readDatasetSize would emit the stub twice and use
// a length that does not belong to either buffer, so it must not be present.)
void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
	generateProgramPrologue(prog, pcfg);

	const bool useRyzenStub = check_flag(AMD_RYZEN_FAMILY);

	// Source bytes and length of the selected stub; the buffers are only read here.
	const uint8_t* const stub = useRyzenStub
		? RandomX_CurrentConfig.codeReadDatasetRyzenTweaked
		: RandomX_CurrentConfig.codeReadDatasetTweaked;
	const uint32_t stubSize = useRyzenStub
		? RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize
		: RandomX_CurrentConfig.codeReadDatasetTweakedSize;

	memcpy(code + codePos, stub, stubSize);
	codePos += stubSize;

	generateProgramEpilogue(prog, pcfg);
}

Expand Down Expand Up @@ -396,7 +416,7 @@ namespace randomx {
memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize;

if (BranchesWithin32B) {
if (check_flag(BRANCHES_WITHIN_32B)) {
const uint32_t branch_begin = static_cast<uint32_t>(codePos);
const uint32_t branch_end = static_cast<uint32_t>(branch_begin + 9);

Expand Down Expand Up @@ -989,6 +1009,8 @@ namespace randomx {
codePos = pos;
}

// x86-64 machine code emitted at the end of h_CFROUND when AMD_RYZEN_FAMILY is set (25 bytes):
//   25 00 60 00 00    and     eax, 0x6000
//   0D C0 9F 00 00    or      eax, 0x9FC0
//   3B 44 24 FC       cmp     eax, dword ptr [rsp-4]
//   74 09             je      +9                      ; skip the next 9 bytes (mov + ldmxcsr)
//   89 44 24 FC       mov     dword ptr [rsp-4], eax
//   0F AE 54 24 FC    ldmxcsr dword ptr [rsp-4]
// i.e. MXCSR is reloaded only when the newly computed value differs from the dword
// currently stored at [rsp-4].
static const uint8_t AND_OR_MOV_LDMXCSR_RYZEN[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x3B, 0x44, 0x24, 0xFC, 0x74, 0x09, 0x89, 0x44, 0x24, 0xFC, 0x0F, 0xAE, 0x54, 0x24, 0xFC };

void JitCompilerX86::h_CFROUND(const Instruction& instr) {
uint8_t* const p = code;
int pos = codePos;
Expand All @@ -1000,7 +1022,13 @@ namespace randomx {
emit(ROL_RAX, p, pos);
emitByte(rotate, p, pos);
}
emit(AND_OR_MOV_LDMXCSR, p, pos);

if (check_flag(AMD_RYZEN_FAMILY)) {
emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos);
}
else {
emit(AND_OR_MOV_LDMXCSR, p, pos);
}

codePos = pos;
}
Expand All @@ -1012,7 +1040,7 @@ namespace randomx {
const int reg = instr.dst;
int32_t jmp_offset = registerUsage[reg] - (pos + 16);

if (BranchesWithin32B) {
if (check_flag(BRANCHES_WITHIN_32B)) {
const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
const uint32_t branch_end = static_cast<uint32_t>(branch_begin + ((jmp_offset >= -128) ? 9 : 13));

Expand Down
19 changes: 18 additions & 1 deletion xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cstdint>
#include <cstring>
#include <vector>
#include <atomic>
#include "crypto/randomx/common.hpp"

namespace randomx {
Expand Down Expand Up @@ -71,7 +72,23 @@ namespace randomx {
uint8_t* code;
int32_t codePos;

static bool BranchesWithin32B;
static std::atomic<uint64_t> flags_set;
static constexpr uint64_t BRANCHES_WITHIN_32B = 1;
static constexpr uint64_t AMD_RYZEN_FAMILY = 2;
static uint64_t flags;

// Returns true when at least one bit of `f` is set in the static `flags` word.
static inline bool check_flag(uint64_t f)
{
	const uint64_t selected = flags & f;
	return selected != 0;
}

// Sets the bits of `f` in the static `flags` word when `v` is true, clears them otherwise.
static inline void set_flag(uint64_t f, bool v)
{
	flags = v ? (flags | f) : (flags & ~f);
}

static void applyTweaks();
void generateProgramPrologue(Program&, ProgramConfiguration&);
Expand Down
5 changes: 5 additions & 0 deletions xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
.global DECL(randomx_program_loop_load)
.global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_read_dataset_ryzen)
.global DECL(randomx_program_read_dataset_sshash_init)
.global DECL(randomx_program_read_dataset_sshash_fin)
.global DECL(randomx_program_loop_store)
Expand Down Expand Up @@ -92,6 +93,7 @@ DECL(randomx_program_prologue_first_load):
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
stmxcsr dword ptr [rsp-20]
jmp DECL(randomx_program_loop_begin)

.balign 64
Expand All @@ -110,6 +112,9 @@ DECL(randomx_program_start):
DECL(randomx_program_read_dataset):
#include "asm/program_read_dataset.inc"

DECL(randomx_program_read_dataset_ryzen):
#include "asm/program_read_dataset_ryzen.inc"

DECL(randomx_program_read_dataset_sshash_init):
#include "asm/program_read_dataset_sshash_init.inc"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load
PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_ryzen
PUBLIC randomx_program_read_dataset_sshash_init
PUBLIC randomx_program_read_dataset_sshash_fin
PUBLIC randomx_dataset_init
Expand Down Expand Up @@ -80,6 +81,7 @@ randomx_program_prologue_first_load PROC
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
stmxcsr dword ptr [rsp-20]
jmp randomx_program_loop_begin
randomx_program_prologue_first_load ENDP

Expand All @@ -103,6 +105,10 @@ randomx_program_read_dataset PROC
include asm/program_read_dataset.inc
randomx_program_read_dataset ENDP

randomx_program_read_dataset_ryzen PROC
include asm/program_read_dataset_ryzen.inc
randomx_program_read_dataset_ryzen ENDP

randomx_program_read_dataset_sshash_init PROC
include asm/program_read_dataset_sshash_init.inc
randomx_program_read_dataset_sshash_init ENDP
Expand Down Expand Up @@ -220,4 +226,4 @@ _RANDOMX_JITX86_STATIC ENDS

ENDIF

END
END
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ extern "C" {
void randomx_program_loop_load();
void randomx_program_start();
void randomx_program_read_dataset();
void randomx_program_read_dataset_ryzen();
void randomx_program_read_dataset_sshash_init();
void randomx_program_read_dataset_sshash_fin();
void randomx_program_loop_store();
Expand Down
9 changes: 8 additions & 1 deletion xmrstak/backend/cpu/crypto/randomx/randomx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,15 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
}
{
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset;
const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_ryzen;
memcpy(codeReadDatasetTweaked, a, b - a);
codeReadDatasetTweakedSize = b - a;
}
{
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_ryzen;
const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
memcpy(codeReadDatasetRyzenTweaked, a, b - a);
codeReadDatasetRyzenTweakedSize = b - a;
}
{
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
Expand Down
5 changes: 4 additions & 1 deletion xmrstak/backend/cpu/crypto/randomx/randomx.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,10 @@ struct RandomX_ConfigurationBase
rx_vec_i128 fillAes4Rx4_Key[8];

uint8_t codeShhPrefetchTweaked[20];
uint8_t codeReadDatasetTweaked[64];
uint8_t codeReadDatasetTweaked[72];
uint32_t codeReadDatasetTweakedSize;
uint8_t codeReadDatasetRyzenTweaked[72];
uint32_t codeReadDatasetRyzenTweakedSize;
uint8_t codeReadDatasetLightSshInitTweaked[68];
uint8_t codePrefetchScratchpadTweaked[32];

Expand Down

0 comments on commit ea47136

Please sign in to comment.